diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 17f907eb07e8..388663eb1db7 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3987,6 +3987,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; + + // Try to convert a constant mask AND into a shuffle clear mask. + if (VT.isVector()) + if (SDValue Shuffle = XformToShuffleWithZero(N)) + return Shuffle; + // fold (and (or x, C), D) -> D if (C & D) == D auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); @@ -16480,6 +16486,8 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> /// vector_shuffle V, Zero, <0, 4, 2, 4> SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); + EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = peekThroughBitcast(N->getOperand(1)); @@ -16490,9 +16498,6 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { if (LegalOperations) return SDValue(); - if (N->getOpcode() != ISD::AND) - return SDValue(); - if (RHS.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); @@ -16581,10 +16586,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) return Fold; - // Try to convert a constant mask AND into a shuffle clear mask. - if (SDValue Shuffle = XformToShuffleWithZero(N)) - return Shuffle; - // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 195ddc78d454..5bbf49290f17 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -1086,7 +1086,7 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc, return false; } -Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { +static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) { switch (Type) { case MachO::PLATFORM_MACOS: return Triple::MacOSX; case MachO::PLATFORM_IOS: return Triple::IOS; diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index 38720c23ff26..f94e9d3c4785 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -423,8 +423,12 @@ bool ELFAsmParser::parseGroup(StringRef &GroupName) { if (L.isNot(AsmToken::Comma)) return TokError("expected group name"); Lex(); - if (getParser().parseIdentifier(GroupName)) + if (L.is(AsmToken::Integer)) { + GroupName = getTok().getString(); + Lex(); + } else if (getParser().parseIdentifier(GroupName)) { return true; + } if (L.is(AsmToken::Comma)) { Lex(); StringRef Linkage; diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index 8b6c571dee02..740861851185 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -27,6 +27,8 @@ using namespace llvm; +namespace { + // -------------------------------------------------------------------- // Implementation of permutation networks. 
@@ -147,6 +149,7 @@ private: void build(); bool color(); }; +} // namespace std::pair Coloring::getUniqueColor(const NodeSet &Nodes) { uint8_t Color = None; @@ -300,6 +303,7 @@ void Coloring::dump() const { dbgs() << " }\n}\n"; } +namespace { // Base class of for reordering networks. They don't strictly need to be // permutations, as outputs with repeated occurrences of an input element // are allowed. @@ -408,7 +412,7 @@ struct BenesNetwork : public PermNetwork { private: bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step); }; - +} // namespace bool ForwardDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size, unsigned Step) { @@ -602,6 +606,7 @@ bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size, // Support for building selection results (output instructions that are // parts of the final selection). +namespace { struct OpRef { OpRef(SDValue V) : OpV(V) {} bool isValue() const { return OpV.getNode() != nullptr; } @@ -689,6 +694,7 @@ struct ResultStack { void print(raw_ostream &OS, const SelectionDAG &G) const; }; +} // namespace void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const { if (isValue()) { @@ -740,6 +746,7 @@ void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const { } } +namespace { struct ShuffleMask { ShuffleMask(ArrayRef M) : Mask(M) { for (unsigned I = 0, E = Mask.size(); I != E; ++I) { @@ -763,6 +770,7 @@ struct ShuffleMask { return ShuffleMask(Mask.take_back(H)); } }; +} // namespace // -------------------------------------------------------------------- // The HvxSelector class. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5ac5d0348f8a..2c1faa157ddb 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15543,7 +15543,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, @@ -15551,9 +15550,15 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, DAG.getUNDEF(SrcVT))); } if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT)) - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. + assert(VT == MVT::v2f64 && "Unexpected type"); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src)); + } + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); @@ -15903,9 +15908,15 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SDLoc dl(Op); if (SrcVT.getVectorElementType() == MVT::i1) { - if (SrcVT == MVT::v2i1) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0)); + if (SrcVT == MVT::v2i1) { + // For v2i1, we need to widen to v4i1 first. 
+ assert(Op.getValueType() == MVT::v2f64 && "Unexpected type"); + N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0, + DAG.getUNDEF(MVT::v2i1)); + return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0)); + } + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0)); @@ -33047,10 +33058,8 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; - ConstantSDNode *RHSConstSplat = nullptr; - if (auto *RHSBV = dyn_cast(N1)) - RHSConstSplat = RHSBV->getConstantSplatNode(); - if (!RHSTrunc && !RHSConstSplat) + if (!RHSTrunc && + !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -33060,13 +33069,10 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); - if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(), - SDValue(RHSConstSplat, 0)); - N1 = DAG.getSplatBuildVector(VT, DL, N1); - } else if (RHSTrunc) { + if (RHSTrunc) N1 = N1->getOperand(0); - } + else + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 7e89a4111d86..619b399ef8d8 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -141,6 +141,7 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // These instructions cannot address 80-bit memory. 
multiclass FPBinary { +let mayLoad = 1, hasSideEffects = 1 in { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, @@ -177,10 +178,8 @@ def _Fp80m64: FpI_<(outs RFP80:$dst), (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), (set RFP80:$dst, (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; -let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), !strconcat("f", asmstring, "{s}\t$src")>; -let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] @@ -226,12 +225,11 @@ def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), (set RFP80:$dst, (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; -let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), !strconcat("fi", asmstring, "{s}\t$src")>; -let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), !strconcat("fi", asmstring, "{l}\t$src")>; +} // mayLoad = 1, hasSideEffects = 1 } let Defs = [FPSW] in { diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index fdf3e73e4fcd..27c67500b26f 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -832,9 +832,11 @@ def NoVLX : Predicate<"!Subtarget->hasVLX()">; def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def PKU : Predicate<"Subtarget->hasPKU()">; -def HasVNNI : Predicate<"Subtarget->hasVNNI()">; +def HasVNNI : Predicate<"Subtarget->hasVNNI()">, + AssemblerPredicate<"FeatureVNNI", "AVX-512 VNNI ISA">; -def HasBITALG : Predicate<"Subtarget->hasBITALG()">; +def HasBITALG : Predicate<"Subtarget->hasBITALG()">, + AssemblerPredicate<"FeatureBITALG", "AVX-512 BITALG ISA">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; def HasVAES : Predicate<"Subtarget->hasVAES()">; @@ -866,7 +868,8 @@ def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">; def HasVBMI : Predicate<"Subtarget->hasVBMI()">, AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">; -def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; +def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">, + AssemblerPredicate<"FeatureVBMI2", "AVX-512 VBMI2 ISA">; def HasIFMA : Predicate<"Subtarget->hasIFMA()">, AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">; def HasRTM : Predicate<"Subtarget->hasRTM()">; diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index c3fa05a11a24..fe106e33bca1 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -880,9 +880,10 @@ bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop, /// If we are able to find such sequence, we return the instructions /// we found, namely %casted_phi and the instructions on its use-def chain up /// to the phi (not including the phi). 
-bool getCastsForInductionPHI( - PredicatedScalarEvolution &PSE, const SCEVUnknown *PhiScev, - const SCEVAddRecExpr *AR, SmallVectorImpl &CastInsts) { +static bool getCastsForInductionPHI(PredicatedScalarEvolution &PSE, + const SCEVUnknown *PhiScev, + const SCEVAddRecExpr *AR, + SmallVectorImpl &CastInsts) { assert(CastInsts.empty() && "CastInsts is expected to be empty."); auto *PN = cast(PhiScev->getValue()); diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 248462d0de51..821c65bef06a 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -228,14 +228,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL-NEXT: callq _func8xi1 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT: movb $85, %al -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; @@ -247,12 +242,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: movb $85, %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kandb %k1, %k0, %k0 -; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vpsraw $15, %xmm0, %xmm0 ; SKX-NEXT: popq %rax ; SKX-NEXT: retq ; @@ -264,14 +256,9 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 ; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 ; KNL_X32-NEXT: calll _func8xi1 -; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL_X32-NEXT: movb $85, %al -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL_X32-NEXT: vpand LCPI7_0, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl %cmpRes = icmp sgt <8 x i32>%a, %b diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index e88ec9d7b159..18e9f306bc1b 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1691,8 +1691,8 @@ define <2 x double> @sbto2f64(<2 x double> %a) { ; VLDQ: # %bb.0: ; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2q %k0, %xmm0 -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 +; VLDQ-NEXT: vpmovm2d %k0, %xmm0 +; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLDQ-NEXT: retq ; ; VLNODQ-LABEL: sbto2f64: @@ -1700,12 +1700,8 @@ define <2 x double> @sbto2f64(<2 x double> %a) { ; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0 -; VLNODQ-NEXT: 
vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; VLNODQ-NEXT: retq %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> @@ -2002,30 +1998,22 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; NOVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 +; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 +; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; -; VLDQ-LABEL: ubto2f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: ubto2f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VLNODQ-NEXT: retq +; VL-LABEL: ubto2f64: +; VL: # %bb.0: +; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; VL-NEXT: vcvtudq2pd %xmm0, %xmm0 +; VL-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 78111874b58a..306b95f0f3ae 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -2602,16 +2602,16 @@ define <2 x double> @sbto2f64(<2 x double> %a) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33] -; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: sbto2f64: ; SKX: # %bb.0: ; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25] -; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> @@ -2989,8 +2989,8 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] ; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: 
[4:0.50] -; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00] +; GENERIC-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [4:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: ubto2f64: @@ -2998,8 +2998,8 @@ define <2 x double> @ubto2f64(<2 x i32> %a) { ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] ; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtudq2pd %xmm0, %xmm0 # sched: [5:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> diff --git a/test/CodeGen/X86/pr34080-2.ll b/test/CodeGen/X86/pr34080-2.ll new file mode 100644 index 000000000000..5c00f0e3706b --- /dev/null +++ b/test/CodeGen/X86/pr34080-2.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-openbsd6.2 | FileCheck %s + +%struct.DateTime = type { i64, i32, i32, i32, i32, i32, double, i8 } + +define void @computeJD(%struct.DateTime*) nounwind { +; CHECK-LABEL: computeJD: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $32, %esp +; CHECK-NEXT: movl 8(%ebp), %ebx +; CHECK-NEXT: movl 8(%ebx), %esi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $3, 12(%ebx) +; CHECK-NEXT: setl %al +; CHECK-NEXT: subl %eax, %esi +; CHECK-NEXT: movl $-1374389535, %ecx # imm = 0xAE147AE1 +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imull %ecx +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $5, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $7, %edi +; CHECK-NEXT: addl %eax, %edi +; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD +; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC +; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; CHECK-NEXT: imull %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $5, %edx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: addl 16(%ebx), %ecx +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: leal 257(%ecx,%edx), %eax +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fildl {{[0-9]+}}(%esp) +; CHECK-NEXT: fadds {{\.LCPI.*}} +; CHECK-NEXT: fmuls {{\.LCPI.*}} +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movw $3199, {{[0-9]+}}(%esp) # imm = 0xC7F +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb $1, 36(%ebx) +; CHECK-NEXT: imull $3600000, 20(%ebx), %eax # imm = 0x36EE80 +; CHECK-NEXT: imull $60000, 24(%ebx), %ecx # imm = 0xEA60 +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: fldl 28(%ebx) +; CHECK-NEXT: fmuls {{\.LCPI.*}} +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: movw $3199, 
(%esp) # imm = 0xC7F +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movw %ax, (%esp) +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: sarl $31, %eax +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: adcl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ecx, (%ebx) +; CHECK-NEXT: movl %eax, 4(%ebx) +; CHECK-NEXT: leal -12(%ebp), %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl + %2 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 7 + %3 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 1 + %4 = load i32, i32* %3, align 4 + %5 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 2 + %6 = load i32, i32* %5, align 4 + %7 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 3 + %8 = load i32, i32* %7, align 4 + %9 = icmp slt i32 %6, 3 + %10 = add i32 %6, 12 + %11 = select i1 %9, i32 %10, i32 %6 + %12 = sext i1 %9 to i32 + %13 = add i32 %4, %12 + %14 = sdiv i32 %13, -100 + %15 = sdiv i32 %13, 400 + %16 = mul i32 %13, 36525 + %17 = add i32 %16, 172251900 + %18 = sdiv i32 %17, 100 + %19 = mul i32 %11, 306001 + %20 = add i32 %19, 306001 + %21 = sdiv i32 %20, 10000 + %22 = add i32 %8, 2 + %23 = add i32 %22, %14 + %24 = add i32 %23, %15 + %25 = add i32 %24, 255 + %26 = add i32 %25, %18 + %27 = sitofp i32 %26 to double + %28 = fadd double %27, -1.524500e+03 + %29 = fmul double %28, 8.640000e+07 + %30 = fptosi double %29 to i64 + %31 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 0 + store i8 1, i8* %2, align 4 + %32 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 4 + %33 = load i32, i32* %32, align 4 + %34 = mul i32 %33, 3600000 + %35 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 5 + %36 = load i32, i32* %35, align 4 + %37 = mul i32 %36, 60000 + %38 = add i32 %37, %34 + %39 = sext i32 %38 to i64 + %40 = getelementptr inbounds %struct.DateTime, %struct.DateTime* %0, i32 0, i32 6 + %41 = load double, double* %40, align 4 + %42 = fmul double %41, 1.000000e+03 + %43 = fptosi double %42 to i64 + %44 = add i64 %39, %43 + %45 = add i64 %44, %30 + store i64 %45, i64* %31, align 4 + ret void +} + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="i486" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/X86/pr34080.ll b/test/CodeGen/X86/pr34080.ll index 72dbf3c48516..e0b09745ad9e 100644 --- a/test/CodeGen/X86/pr34080.ll +++ b/test/CodeGen/X86/pr34080.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-BROKEN +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-SCHEDULE ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 | FileCheck %s --check-prefix=SSE3 ; RUN: 
llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 -mcpu=prescott | FileCheck %s --check-prefix=SSE3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX @@ -46,44 +46,44 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; SSE2-BROKEN-LABEL: _Z1fe: -; SSE2-BROKEN: ## %bb.0: ## %entry -; SSE2-BROKEN-NEXT: pushq %rbp -; SSE2-BROKEN-NEXT: .cfi_def_cfa_offset 16 -; SSE2-BROKEN-NEXT: .cfi_offset %rbp, -16 -; SSE2-BROKEN-NEXT: movq %rsp, %rbp -; SSE2-BROKEN-NEXT: .cfi_def_cfa_register %rbp -; SSE2-BROKEN-NEXT: fnstcw -4(%rbp) -; SSE2-BROKEN-NEXT: fldt 16(%rbp) -; SSE2-BROKEN-NEXT: movzwl -4(%rbp), %eax -; SSE2-BROKEN-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F -; SSE2-BROKEN-NEXT: fldcw -4(%rbp) -; SSE2-BROKEN-NEXT: movw %ax, -4(%rbp) -; SSE2-BROKEN-NEXT: fistl -8(%rbp) -; SSE2-BROKEN-NEXT: fldcw -4(%rbp) -; SSE2-BROKEN-NEXT: cvtsi2sdl -8(%rbp), %xmm0 -; SSE2-BROKEN-NEXT: movsd %xmm0, -64(%rbp) -; SSE2-BROKEN-NEXT: movsd %xmm0, -32(%rbp) -; SSE2-BROKEN-NEXT: fsubl -32(%rbp) -; SSE2-BROKEN-NEXT: flds {{.*}}(%rip) -; SSE2-BROKEN-NEXT: fnstcw -2(%rbp) -; SSE2-BROKEN-NEXT: fmul %st(0), %st(1) -; SSE2-BROKEN-NEXT: movzwl -2(%rbp), %eax -; SSE2-BROKEN-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F -; SSE2-BROKEN-NEXT: fldcw -2(%rbp) -; SSE2-BROKEN-NEXT: movw %ax, -2(%rbp) -; SSE2-BROKEN-NEXT: fxch %st(1) -; SSE2-BROKEN-NEXT: fistl -12(%rbp) -; SSE2-BROKEN-NEXT: fldcw -2(%rbp) -; SSE2-BROKEN-NEXT: xorps %xmm0, %xmm0 -; SSE2-BROKEN-NEXT: cvtsi2sdl -12(%rbp), %xmm0 -; SSE2-BROKEN-NEXT: movsd %xmm0, -56(%rbp) -; SSE2-BROKEN-NEXT: movsd %xmm0, -24(%rbp) -; SSE2-BROKEN-NEXT: fsubl -24(%rbp) -; SSE2-BROKEN-NEXT: fmulp %st(1) -; SSE2-BROKEN-NEXT: fstpl -48(%rbp) -; SSE2-BROKEN-NEXT: popq %rbp -; SSE2-BROKEN-NEXT: retq +; SSE2-SCHEDULE-LABEL: _Z1fe: +; SSE2-SCHEDULE: ## %bb.0: ## %entry +; SSE2-SCHEDULE-NEXT: pushq %rbp +; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_offset 16 +; SSE2-SCHEDULE-NEXT: .cfi_offset %rbp, -16 +; SSE2-SCHEDULE-NEXT: movq %rsp, %rbp +; SSE2-SCHEDULE-NEXT: .cfi_def_cfa_register %rbp +; SSE2-SCHEDULE-NEXT: fnstcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: fldt 16(%rbp) +; SSE2-SCHEDULE-NEXT: movzwl -4(%rbp), %eax +; SSE2-SCHEDULE-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F +; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: movw %ax, -4(%rbp) +; SSE2-SCHEDULE-NEXT: fistl -8(%rbp) +; SSE2-SCHEDULE-NEXT: fldcw -4(%rbp) +; SSE2-SCHEDULE-NEXT: cvtsi2sdl -8(%rbp), %xmm0 +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -64(%rbp) +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -32(%rbp) +; SSE2-SCHEDULE-NEXT: fsubl -32(%rbp) +; SSE2-SCHEDULE-NEXT: flds {{.*}}(%rip) +; SSE2-SCHEDULE-NEXT: fnstcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: fmul %st(0), %st(1) +; SSE2-SCHEDULE-NEXT: movzwl -2(%rbp), %eax +; SSE2-SCHEDULE-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F +; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: movw %ax, -2(%rbp) +; SSE2-SCHEDULE-NEXT: fxch %st(1) +; SSE2-SCHEDULE-NEXT: fistl -12(%rbp) +; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: xorps %xmm0, %xmm0 +; SSE2-SCHEDULE-NEXT: cvtsi2sdl -12(%rbp), %xmm0 +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -56(%rbp) +; SSE2-SCHEDULE-NEXT: movsd %xmm0, -24(%rbp) +; SSE2-SCHEDULE-NEXT: fsubl -24(%rbp) +; SSE2-SCHEDULE-NEXT: fmulp %st(1) +; SSE2-SCHEDULE-NEXT: fstpl -48(%rbp) +; SSE2-SCHEDULE-NEXT: popq %rbp +; SSE2-SCHEDULE-NEXT: retq ; ; SSE3-LABEL: _Z1fe: ; SSE3: ## %bb.0: ## %entry diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index 5700b1df15bd..a516c709517d 
100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ b/test/CodeGen/X86/shrink_vmul.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 @c = external global i32*, align 8 @@ -11,42 +15,69 @@ ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: 
movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -71,38 +102,63 @@ entry: ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_4xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_4xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_4xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_4xi8: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi8: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -127,44 +183,106 @@ entry: ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_8xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_8xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_8xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq 
{{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_8xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, 
%xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -189,64 +307,150 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi8: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X86-NEXT: movdqa %xmm1, %xmm4 -; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-NEXT: pmullw %xmm3, %xmm4 -; X86-NEXT: movdqa %xmm4, %xmm3 -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X86-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi8: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X86-SSE-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: pmullw %xmm3, %xmm4 +; X86-SSE-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi8: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movdqa %xmm1, %xmm4 -; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: pmullw %xmm3, %xmm4 -; X64-NEXT: movdqa %xmm4, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-NEXT: 
punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi8: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi8: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi8: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X64-SSE-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: pmullw %xmm3, %xmm4 +; X64-SSE-NEXT: movdqa %xmm4, %xmm3 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi8: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi8: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -271,36 +475,65 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: 
movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -325,36 +558,61 @@ entry: ; %rst = mul <4 x i32> %op1, %op2 ; define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_4xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_4xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; 
X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_4xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_4xi16: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_4xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_4xi16: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -379,42 +637,104 @@ entry: ; %rst = mul <8 x i32> %op1, %op2 ; define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_8xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhuw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: movdqa %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_8xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: 
movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_8xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhuw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_8xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_8xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_8xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_8xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_8xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -439,62 +759,148 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi16: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: pmulhuw %xmm0, %xmm4 -; X86-NEXT: pmullw %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: movdqa %xmm3, %xmm4 -; X86-NEXT: pmulhuw %xmm1, %xmm4 -; X86-NEXT: pmullw %xmm1, %xmm3 -; X86-NEXT: movdqa %xmm3, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi16: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; 
X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi16: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-NEXT: movdqa %xmm2, %xmm4 -; X64-NEXT: pmulhuw %xmm0, %xmm4 -; X64-NEXT: pmullw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: pmulhuw %xmm1, %xmm4 -; X64-NEXT: pmullw %xmm1, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi16: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: 
vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmulld 
%xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -519,46 +925,73 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm1 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm1 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm1 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm1 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -583,48 +1016,75 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi8_sext_zext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movzwl (%edx,%ecx), %edx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movzwl (%eax,%ecx), %eax -; X86-NEXT: movd %eax, %xmm1 -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; 
X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -649,36 +1109,63 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; 
+; X64-AVX-LABEL: mul_2xi16_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -703,62 +1190,93 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext_zext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 -; X86-NEXT: psllq $32, %xmm3 -; X86-NEXT: pmuludq %xmm0, %xmm1 -; X86-NEXT: paddq %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: paddq %xmm2, %xmm3 +; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm3, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm3 -; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: paddq %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: paddq %xmm2, %xmm3 +; X64-SSE-NEXT: psllq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -783,62 +1301,148 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; 
X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: pmulhw %xmm0, %xmm4 -; X86-NEXT: pmullw %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: movdqa %xmm3, %xmm4 -; X86-NEXT: pmulhw %xmm1, %xmm4 -; X86-NEXT: pmullw %xmm1, %xmm3 -; X86-NEXT: movdqa %xmm3, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl +; X86-SSE-LABEL: mul_16xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_16xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-NEXT: movdqa %xmm2, %xmm4 -; X64-NEXT: pmulhw %xmm0, %xmm4 -; X64-NEXT: pmullw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: pmulhw %xmm1, %xmm4 -; X64-NEXT: pmullw %xmm1, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-NEXT: 
movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-AVX1-LABEL: mul_16xi16_sext: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16_sext: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 +; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16_sext: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 
8(%rdi,%rdx), %xmm3 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16_sext: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 +; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -862,31 +1466,54 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $255, %ecx +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -906,33 +1533,53 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -952,37 +1599,60 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1002,37 +1672,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: 
movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1052,37 +1742,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst5: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst5: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 
= <65407,127,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst5: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst5: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst5: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst5: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1102,37 +1812,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst6: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi8_varconst6: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi8_varconst6: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi8_varconst6: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst6: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst6: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1152,31 +1882,58 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhuw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhuw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1196,31 +1953,51 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; 
X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1240,45 +2017,72 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; 
X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1298,45 +2102,68 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl +; X86-SSE-LABEL: mul_2xi16_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl ; -; X64-LABEL: mul_2xi16_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-AVX-LABEL: mul_2xi16_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq 
{{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1355,99 +2182,389 @@ entry: ; define void @PR34947() { -; X86-LABEL: PR34947: -; X86: # %bb.0: -; X86-NEXT: movdqa (%eax), %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X86-NEXT: movd %xmm1, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X86-NEXT: movd %xmm2, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl (%eax) -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-NEXT: pmuludq %xmm2, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-NEXT: movl $8199, %eax # imm = 0x2007 -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movd %xmm2, (%eax) -; X86-NEXT: movdqa %xmm1, (%eax) -; X86-NEXT: retl +; X86-SSE-LABEL: PR34947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa (%eax), %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X86-SSE-NEXT: movd %xmm1, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl (%eax) +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; 
X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-SSE-NEXT: movd %eax, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movd %xmm2, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: retl ; -; X64-LABEL: PR34947: -; X64: # %bb.0: -; X64-NEXT: movdqa (%rax), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl (%rax) -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm2, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: pmuludq %xmm2, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: movl $8199, %eax # imm = 0x2007 -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movd %xmm2, (%rax) -; X64-NEXT: movdqa %xmm1, (%rax) -; X64-NEXT: retq +; X86-AVX1-LABEL: PR34947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 36 +; X86-AVX1-NEXT: .cfi_offset %esi, -20 +; X86-AVX1-NEXT: .cfi_offset %edi, -16 +; X86-AVX1-NEXT: .cfi_offset %ebx, -12 +; X86-AVX1-NEXT: .cfi_offset %ebp, -8 +; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl (%eax) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl 
%edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi +; X86-AVX1-NEXT: divl %esi +; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vmovd %xmm0, %ebx +; X86-AVX1-NEXT: divl %ebx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload +; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX1-NEXT: vmovd %eax, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovd %xmm1, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR34947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm1, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm0, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 
+; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl (%eax) +; X86-AVX2-NEXT: vmovd %edx, %xmm1 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX2-NEXT: vmovd %eax, %xmm2 +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vmovd %xmm1, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: PR34947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa (%rax), %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X64-SSE-NEXT: movd %xmm1, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl (%rax) +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-SSE-NEXT: movd %eax, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movd %xmm2, (%rax) +; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: PR34947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX1-NEXT: pushq %rbx +; X64-AVX1-NEXT: .cfi_def_cfa_offset 24 +; X64-AVX1-NEXT: .cfi_offset %rbx, -24 +; X64-AVX1-NEXT: .cfi_offset %rbp, -16 +; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl (%rax) +; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx 
+; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: vmovd %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebx +; X64-AVX1-NEXT: movl %edx, %ebx +; X64-AVX1-NEXT: vmovd %xmm0, %ebp +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebp +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovd %r8d, %xmm1 +; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX1-NEXT: vmovd %eax, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vmovd %xmm1, (%rax) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: popq %rbx +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR34947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm1, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm0, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; 
X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl (%rax)
+; X64-AVX2-NEXT: vmovd %edx, %xmm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-AVX2-NEXT: vmovd %eax, %xmm2
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovd %xmm1, (%rax)
+; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 %tmp = load <9 x i32>, <9 x i32>* undef, align 64
 %rem = urem <9 x i32> zeroinitializer, %tmp
 %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index 5175850c734f..a799b0e6f12d 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64-AVX2
 
 define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
 ; X32-LABEL: and_masks:
@@ -31,6 +33,37 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X64-NEXT: vmovaps %ymm0, (%rax)
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: and_masks:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-AVX2-NEXT: vmovups (%edx), %ymm0
+; X32-AVX2-NEXT: vmovups (%ecx), %ymm1
+; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovups (%eax), %ymm2
+; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT: vzeroupper
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: and_masks:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX2-NEXT: vmovups (%rsi), %ymm1
+; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovups (%rdx), %ymm2
+; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 %v0 = load <8 x float>, <8 x float>* %a, align 16
 %v1 = load <8 x float>, <8 x float>* %b, align 16
 %m0 = fcmp olt <8 x float> %v1, %v0
@@ -62,6 +95,28 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X64-NEXT: vmovaps %ymm0, (%rax)
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: neg_masks:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX2-NEXT: vmovups (%ecx), %ymm0
+; X32-AVX2-NEXT: vcmpnltps (%eax), %ymm0, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT: vzeroupper
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: neg_masks:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vmovups (%rsi), %ymm0
+; X64-AVX2-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 %v0 = load <8 x float>, <8 x float>* %a, align 16
 %v1 = load <8 x float>, <8 x float>* %b, align 16
 %m0 = fcmp olt <8 x float> %v1, %v0
@@ -71,3 +126,50 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ret void
 }
 
+define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
+; X32-LABEL: and_mask_constant:
+; X32: ## %bb.0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X32-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X32-NEXT: vpand LCPI2_0, %xmm0, %xmm0
+; X32-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: and_mask_constant:
+; X64: ## %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; X32-AVX2-LABEL: and_mask_constant:
+; X32-AVX2: ## %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpand LCPI2_0, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: and_mask_constant:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: retq
+ %m = icmp eq <8 x i32> %v0, zeroinitializer
+ %mand = and <8 x i1> %m, 
+ %r = zext <8 x i1> %mand to <8 x i32>
+ ret <8 x i32> %r
+}
diff --git a/test/MC/ELF/comdat-name-number.s b/test/MC/ELF/comdat-name-number.s
new file mode 100644
index 000000000000..21e2ed7399f0
--- /dev/null
+++ b/test/MC/ELF/comdat-name-number.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -filetype=obj -o %t.o
+// RUN: llvm-readobj -elf-section-groups %t.o | FileCheck %s
+
+// Test that we can handle numeric COMDAT names.
+
+.section .foo,"G",@progbits,123,comdat
+.section .bar,"G",@progbits,abc,comdat
+
+// CHECK: Groups {
+// CHECK-NEXT: Group {
+// CHECK-NEXT: Name: .group
+// CHECK-NEXT: Index:
+// CHECK-NEXT: Type: COMDAT
+// CHECK-NEXT: Signature: 123
+// CHECK-NEXT: Section(s) in group [
+// CHECK-NEXT: .foo
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: Group {
+// CHECK-NEXT: Name: .group
+// CHECK-NEXT: Index:
+// CHECK-NEXT: Type: COMDAT
+// CHECK-NEXT: Signature: abc
+// CHECK-NEXT: Section(s) in group [
+// CHECK-NEXT: .bar
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: }