From 42c45b40b3649a7ddf41d56d212c84160cfb0811 Mon Sep 17 00:00:00 2001
From: dim
Date: Tue, 30 May 2017 17:37:59 +0000
Subject: [PATCH 1/3] Vendor import of libc++ trunk r304222:

https://llvm.org/svn/llvm-project/libcxx/trunk@304222
---
 include/experimental/coroutine                | 30 ++++++++++--
 include/module.modulemap                      |  1 +
 .../from_address.fail.cpp                     | 46 +++++++++++++++++++
 .../support.coroutines/lit.local.cfg          |  9 ++++
 4 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 test/std/experimental/language.support/support.coroutines/coroutine.handle/coroutine.handle.export/from_address.fail.cpp
 create mode 100644 test/std/experimental/language.support/support.coroutines/lit.local.cfg

diff --git a/include/experimental/coroutine b/include/experimental/coroutine
index d2a03ae8a8c8..21c1ea566403 100644
--- a/include/experimental/coroutine
+++ b/include/experimental/coroutine
@@ -145,6 +145,19 @@ public:
     return __tmp;
   }
 
+  // FIXME: Should from_address(nullptr) be allowed?
+  _LIBCPP_ALWAYS_INLINE
+  static coroutine_handle from_address(nullptr_t) _NOEXCEPT {
+    return coroutine_handle(nullptr);
+  }
+
+  template <class _Tp, bool _CallIsValid = false>
+  static coroutine_handle from_address(_Tp*) {
+    static_assert(_CallIsValid,
+      "coroutine_handle::from_address cannot be called with "
+      "non-void pointers");
+  }
+
 private:
   bool __is_suspended() const _NOEXCEPT {
     // FIXME actually implement a check for if the coro is suspended.
@@ -218,11 +231,22 @@ public:
   // FIXME: should from_address work with nullptr?
   _LIBCPP_ALWAYS_INLINE
   static coroutine_handle from_address(nullptr_t) _NOEXCEPT {
-    return {};
+    return coroutine_handle(nullptr);
   }
 
-  // from_address cannot be used with the coroutines promise type.
-  static coroutine_handle from_address(_Promise*) = delete;
+  template <class _Tp, bool _CallIsValid = false>
+  static coroutine_handle from_address(_Tp*) {
+    static_assert(_CallIsValid,
+      "coroutine_handle::from_address cannot be called with "
+      "non-void pointers");
+  }
+
+  template <bool _CallIsValid = false>
+  static coroutine_handle from_address(_Promise*) {
+    static_assert(_CallIsValid,
+      "coroutine_handle::from_address cannot be used with "
+      "pointers to the coroutine's promise type; use 'from_promise' instead");
+  }
 
   _LIBCPP_ALWAYS_INLINE
   static coroutine_handle from_promise(_Promise& __promise) _NOEXCEPT {

diff --git a/include/module.modulemap b/include/module.modulemap
index 462d4234ae6c..98b1870afcc1 100644
--- a/include/module.modulemap
+++ b/include/module.modulemap
@@ -502,6 +502,7 @@ module std [system] {
       export *
     }
     module coroutine {
+      requires coroutines
       header "experimental/coroutine"
       export *
     }
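For readers unfamiliar with the idiom in the hunks above: the new from_address overloads are ill-formed whenever they are instantiated, because the static_assert condition depends on the template parameter _CallIsValid, which defaults to false and is never supplied. A minimal, self-contained sketch of the same trick follows; toy_handle is a made-up stand-in, not libc++ code.

    // toy_handle.cpp - illustration only; compiles with any C++11 compiler.
    #include <cstdio>

    struct toy_handle {
      void *ptr;

      // Only void* is accepted; this mirrors the supported overload.
      static toy_handle from_address(void *p) noexcept { return {p}; }

      // Any other pointer type deduces _Tp* exactly and lands here. The
      // static_assert condition depends on _CallIsValid, so it is checked
      // only when this overload is instantiated, i.e. only at a bad call
      // site -- the same deferred-error trick as the header diff above.
      template <class _Tp, bool _CallIsValid = false>
      static toy_handle from_address(_Tp *) {
        static_assert(_CallIsValid,
                      "from_address cannot be called with non-void pointers");
        return {nullptr};
      }
    };

    int main() {
      toy_handle h = toy_handle::from_address(static_cast<void *>(nullptr)); // OK
      // toy_handle::from_address((int *)nullptr); // error: static_assert fires
      std::printf("%p\n", h.ptr);
    }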
diff --git a/test/std/experimental/language.support/support.coroutines/coroutine.handle/coroutine.handle.export/from_address.fail.cpp b/test/std/experimental/language.support/support.coroutines/coroutine.handle/coroutine.handle.export/from_address.fail.cpp
new file mode 100644
index 000000000000..1c87b9443ecf
--- /dev/null
+++ b/test/std/experimental/language.support/support.coroutines/coroutine.handle/coroutine.handle.export/from_address.fail.cpp
@@ -0,0 +1,46 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++98, c++03, c++11
+// <experimental/coroutine>
+
+// template <class Promise = void>
+// struct coroutine_handle;
+
+// static coroutine_handle from_address(void*) noexcept
+
+// Test that `from_address` is explicitly ill-formed when called with a typed
+// pointer. The user cannot possibly have a typed pointer to the coroutine.
+// FIXME: This behavior is an extension, and should be upstreamed into the TS
+// or the test removed if the TS changes are rejected.
+
+#include <experimental/coroutine>
+#include <type_traits>
+#include <cassert>
+
+namespace coro = std::experimental;
+
+int main()
+{
+  {
+    using H = coro::coroutine_handle<>;
+    // expected-error@experimental/coroutine:* 3 {{coroutine_handle::from_address cannot be called with non-void pointers}}
+    H::from_address((int*)nullptr); // expected-note {{requested here}}
+    H::from_address((const void*)nullptr); // expected-note {{requested here}}
+    H::from_address((const char*)nullptr); // expected-note {{requested here}}
+  }
+  {
+    using H = coro::coroutine_handle<const char>;
+    // expected-error@experimental/coroutine:* 1 {{static_assert failed "coroutine_handle::from_address cannot be used with pointers to the coroutine's promise type; use 'from_promise' instead"}}
+    H::from_address((const char*)nullptr); // expected-note {{requested here}}
+    // expected-error@experimental/coroutine:* 1 {{coroutine_handle::from_address cannot be called with non-void pointers}}
+    H::from_address((int*)nullptr); // expected-note {{requested here}}
+  }
+}
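The test above pins down the intended usage contract: a coroutine_handle may only be rebuilt from the exact void* that its own address() member produced. A small sketch of the supported round trip, assuming a Coroutines TS toolchain of this era (clang with -fcoroutines-ts) and this experimental header; save/restore are illustrative names, not part of the patch.

    // round_trip.cpp - illustration only.
    #include <experimental/coroutine>

    namespace coro = std::experimental;

    // Type-erase a handle; void* is the one blessed interchange format.
    void *save(coro::coroutine_handle<> h) { return h.address(); }

    // Rebuild it later. Only void* is accepted; after this patch any typed
    // pointer (int*, const char*, even the promise type) fails to compile.
    coro::coroutine_handle<> restore(void *raw) {
      return coro::coroutine_handle<>::from_address(raw);
    }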
diff --git a/test/std/experimental/language.support/support.coroutines/lit.local.cfg b/test/std/experimental/language.support/support.coroutines/lit.local.cfg
new file mode 100644
index 000000000000..a0b3de8afaa3
--- /dev/null
+++ b/test/std/experimental/language.support/support.coroutines/lit.local.cfg
@@ -0,0 +1,9 @@
+# If the compiler doesn't support coroutines, mark all of the tests under
+# this directory as unsupported. Otherwise add the required `-fcoroutines-ts`
+# flag.
+if 'fcoroutines-ts' not in config.available_features:
+  config.unsupported = True
+else:
+  import copy
+  config.test_format.cxx = copy.deepcopy(config.test_format.cxx)
+  config.test_format.cxx.compile_flags += ['-fcoroutines-ts']

From 331a7177748593fa59b59825ff1e8dc8d73e4fc2 Mon Sep 17 00:00:00 2001
From: dim
Date: Thu, 1 Jun 2017 20:58:36 +0000
Subject: [PATCH 2/3] Vendor import of llvm trunk r304460:

https://llvm.org/svn/llvm-project/llvm/trunk@304460
---
 docs/Vectorizers.rst | 6 +- include/llvm/Analysis/TargetTransformInfo.h | 7 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + include/llvm/Analysis/ValueTracking.h | 2 + .../CodeGen/GlobalISel/RegisterBankInfo.h | 2 +- include/llvm/CodeGen/ISDOpcodes.h | 12 +- include/llvm/CodeGen/MIRYamlMapping.h | 2 - include/llvm/CodeGen/MachineBasicBlock.h | 3 + include/llvm/CodeGen/MachineConstantPool.h | 17 +- include/llvm/CodeGen/MachineFunction.h | 116 +- .../llvm/CodeGen/MachineFunctionInitializer.h | 6 +- include/llvm/CodeGen/MachineInstr.h | 75 +- .../llvm/CodeGen/MachineInstrBundleIterator.h | 72 +- include/llvm/CodeGen/MachineLoopInfo.h | 24 +- include/llvm/CodeGen/MachineModuleInfo.h | 38 +- include/llvm/CodeGen/Passes.h | 4 + include/llvm/CodeGen/TargetPassConfig.h | 6 +- include/llvm/DebugInfo/CodeView/CodeView.h | 14 +- ...{CVSymbolTypes.def => CodeViewSymbols.def} | 0 .../{TypeRecords.def => CodeViewTypes.def} | 0 .../DebugInfo/CodeView/SymbolDeserializer.h | 13 +- .../llvm/DebugInfo/CodeView/SymbolRecord.h | 2 - .../DebugInfo/CodeView/SymbolRecordMapping.h | 2 +- .../DebugInfo/CodeView/SymbolSerializer.h | 13 +- .../CodeView/SymbolVisitorCallbackPipeline.h | 2 +- .../CodeView/SymbolVisitorCallbacks.h | 2 +- .../DebugInfo/CodeView/TypeDatabaseVisitor.h | 2 +- .../DebugInfo/CodeView/TypeDeserializer.h | 5 +- .../llvm/DebugInfo/CodeView/TypeDumpVisitor.h | 2 +- include/llvm/DebugInfo/CodeView/TypeRecord.h | 36 +- .../DebugInfo/CodeView/TypeRecordMapping.h | 2 +- .../llvm/DebugInfo/CodeView/TypeSerializer.h | 2 +- .../DebugInfo/CodeView/TypeTableBuilder.h | 2 +- .../CodeView/TypeVisitorCallbackPipeline.h | 2 +- .../DebugInfo/CodeView/TypeVisitorCallbacks.h | 6 +- include/llvm/DebugInfo/DIContext.h | 11 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 3 +- .../llvm/DebugInfo/PDB/Native/RawConstants.h | 1 - .../llvm/DebugInfo/PDB/Native/TpiHashing.h | 2 +- include/llvm/DebugInfo/PDB/PDBContext.h | 3 +- include/llvm/IR/Attributes.h | 197 +- include/llvm/IR/Function.h | 47 +- include/llvm/IR/Instructions.h | 18 + include/llvm/InitializePasses.h | 1 + include/llvm/Object/WindowsResource.h | 64 +- .../ObjectYAML/CodeViewYAMLDebugSections.h | 91 + include/llvm/ObjectYAML/CodeViewYAMLSymbols.h | 41 + include/llvm/ObjectYAML/CodeViewYAMLTypes.h | 48 + include/llvm/Passes/PassBuilder.h | 63 + include/llvm/Support/ARMTargetParser.def | 4 +- include/llvm/Support/BinaryStreamReader.h | 8 + include/llvm/Support/YAMLTraits.h | 38 + include/llvm/TableGen/Record.h | 12 +- include/llvm/Target/TargetLowering.h | 12 + include/llvm/Target/TargetMachine.h | 6 + .../Transforms/IPO/ThinLTOBitcodeWriter.h | 41 + include/llvm/Transforms/Scalar/GVN.h | 27 +- include/llvm/Transforms/Utils/CodeExtractor.h | 13 +- include/llvm/module.modulemap | 4 +- lib/Analysis/CFLGraph.h | 6 + lib/Analysis/ConstantFolding.cpp | 2 + lib/Analysis/EHPersonalities.cpp | 2 + lib/Analysis/InstructionSimplify.cpp | 7 +- lib/Analysis/MemoryDependenceAnalysis.cpp | 1 + lib/Analysis/ScalarEvolution.cpp | 4 + lib/Analysis/TargetTransformInfo.cpp | 4 + lib/Analysis/ValueTracking.cpp | 13 +
lib/Bitcode/Writer/BitcodeWriter.cpp | 56 +- lib/CodeGen/AggressiveAntiDepBreaker.cpp | 2 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 20 +- lib/CodeGen/AsmPrinter/EHStreamer.cpp | 2 +- lib/CodeGen/CMakeLists.txt | 1 + lib/CodeGen/CodeGen.cpp | 1 + lib/CodeGen/CodeGenPrepare.cpp | 612 ++++++- lib/CodeGen/CriticalAntiDepBreaker.cpp | 2 +- lib/CodeGen/GlobalISel/Localizer.cpp | 10 +- lib/CodeGen/ImplicitNullChecks.cpp | 16 +- lib/CodeGen/LiveRangeShrink.cpp | 231 +++ lib/CodeGen/MIRParser/MIRParser.cpp | 2 - lib/CodeGen/MIRPrinter.cpp | 2 - lib/CodeGen/MachineBasicBlock.cpp | 7 + lib/CodeGen/MachineInstr.cpp | 86 +- lib/CodeGen/MachineModuleInfo.cpp | 65 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 132 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 70 +- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/ScheduleDAGRRList.cpp | 278 +-- .../SelectionDAG/SelectionDAGDumper.cpp | 1 + lib/CodeGen/TargetLoweringBase.cpp | 8 +- lib/CodeGen/TargetPassConfig.cpp | 17 +- lib/DebugInfo/CodeView/CVSymbolVisitor.cpp | 2 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 4 +- lib/DebugInfo/CodeView/EnumTables.cpp | 4 +- lib/DebugInfo/CodeView/SymbolDumper.cpp | 2 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 4 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 8 +- lib/DebugInfo/PDB/Native/InfoStream.cpp | 1 + lib/DebugInfo/PDB/PDBContext.cpp | 3 +- lib/Fuzzer/test/dump_coverage.test | 8 +- lib/IR/Attributes.cpp | 45 +- lib/IR/Function.cpp | 64 +- lib/IR/Instructions.cpp | 32 +- lib/LTO/LTOBackend.cpp | 15 +- lib/MC/MCCodeView.cpp | 2 +- lib/Object/MachOObjectFile.cpp | 2 +- lib/Object/WindowsResource.cpp | 156 +- lib/ObjectYAML/CMakeLists.txt | 3 + lib/ObjectYAML/CodeViewYAMLDebugSections.cpp | 127 ++ lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 496 +++++ lib/ObjectYAML/CodeViewYAMLTypes.cpp | 712 ++++++++ lib/ObjectYAML/LLVMBuild.txt | 2 +- lib/Passes/PassBuilder.cpp | 141 +- lib/Support/BinaryStreamReader.cpp | 20 + lib/Support/Unix/Path.inc | 5 + lib/TableGen/Record.cpp | 19 +- lib/TableGen/TGParser.cpp | 8 +- lib/TableGen/TGParser.h | 2 +- lib/Target/AArch64/AArch64.td | 2 + lib/Target/AArch64/AArch64SchedM1.td | 340 +++- lib/Target/AArch64/AArch64TargetMachine.cpp | 6 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 55 +- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +- lib/Target/AMDGPU/AMDGPUTargetMachine.h | 5 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 324 +++- lib/Target/AMDGPU/DSInstructions.td | 6 +- .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 106 ++ .../AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 2 + lib/Target/AMDGPU/SIDefines.h | 40 + lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 26 +- lib/Target/AMDGPU/SIInstrInfo.td | 13 + lib/Target/AMDGPU/SIRegisterInfo.h | 6 + lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 13 + lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h | 6 + lib/Target/ARM/ARMExpandPseudoInsts.cpp | 132 +- lib/Target/ARM/ARMFrameLowering.cpp | 5 +- lib/Target/ARM/ARMInstrThumb.td | 3 +- lib/Target/ARM/ARMInstrThumb2.td | 3 +- lib/Target/ARM/ARMTargetMachine.cpp | 4 +- lib/Target/ARM/ARMTargetMachine.h | 4 + lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 9 + lib/Target/ARM/Thumb1FrameLowering.cpp | 9 +- lib/Target/AVR/AVRISelLowering.cpp | 4 +- lib/Target/AVR/AVRInstrInfo.td | 2 +- lib/Target/AVR/AVRTargetMachine.cpp | 4 +- lib/Target/AVR/AVRTargetMachine.h | 4 + lib/Target/BPF/BPFTargetMachine.cpp | 4 +- lib/Target/BPF/CMakeLists.txt | 2 +- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 189 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 6 +- lib/Target/Hexagon/HexagonInstrInfo.cpp | 170 +- 
lib/Target/Hexagon/HexagonInstrInfo.h | 5 - .../Hexagon/HexagonLoopIdiomRecognition.cpp | 8 +- lib/Target/Hexagon/HexagonPatterns.td | 59 +- lib/Target/Hexagon/HexagonTargetMachine.cpp | 4 +- lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 49 +- lib/Target/Lanai/LanaiTargetMachine.cpp | 4 +- lib/Target/Lanai/LanaiTargetMachine.h | 4 + lib/Target/MSP430/MSP430TargetMachine.cpp | 4 +- lib/Target/Mips/Mips16FrameLowering.cpp | 24 +- lib/Target/Mips/MipsTargetMachine.cpp | 4 +- lib/Target/Mips/MipsTargetMachine.h | 4 + lib/Target/NVPTX/NVPTXTargetMachine.cpp | 4 +- lib/Target/NVPTX/NVPTXTargetMachine.h | 3 + lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 298 ++- lib/Target/PowerPC/PPCISelLowering.cpp | 110 +- lib/Target/PowerPC/PPCISelLowering.h | 8 + lib/Target/PowerPC/PPCInstr64Bit.td | 12 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 4 + lib/Target/PowerPC/PPCInstrInfo.h | 1 + lib/Target/PowerPC/PPCInstrInfo.td | 5 + lib/Target/PowerPC/PPCInstrVSX.td | 4 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 4 +- lib/Target/PowerPC/PPCTargetMachine.h | 4 + lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 20 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 1 + lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +- lib/Target/Sparc/SparcTargetMachine.cpp | 4 +- lib/Target/Sparc/SparcTargetMachine.h | 4 + lib/Target/SystemZ/SystemZTargetMachine.cpp | 4 +- lib/Target/SystemZ/SystemZTargetMachine.h | 2 + .../WebAssembly/WebAssemblyTargetMachine.cpp | 4 +- .../X86/InstPrinter/X86InstComments.cpp | 4 + lib/Target/X86/X86FloatingPoint.cpp | 34 +- lib/Target/X86/X86FrameLowering.cpp | 9 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 2 - lib/Target/X86/X86ISelLowering.cpp | 266 +-- lib/Target/X86/X86ISelLowering.h | 2 +- lib/Target/X86/X86InstrInfo.cpp | 1 + lib/Target/X86/X86TargetMachine.cpp | 5 +- lib/Target/X86/X86TargetMachine.h | 4 + lib/Target/XCore/XCoreTargetMachine.cpp | 4 +- lib/Transforms/Coroutines/CoroCleanup.cpp | 4 +- lib/Transforms/Coroutines/CoroEarly.cpp | 4 +- lib/Transforms/Coroutines/CoroElide.cpp | 4 +- lib/Transforms/Coroutines/CoroSplit.cpp | 4 +- lib/Transforms/IPO/PartialInlining.cpp | 23 + lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 20 +- .../InstCombine/InstCombineCalls.cpp | 10 +- .../Instrumentation/PGOInstrumentation.cpp | 64 + .../Instrumentation/SanitizerCoverage.cpp | 140 +- .../Scalar/CorrelatedValuePropagation.cpp | 9 +- lib/Transforms/Scalar/GVN.cpp | 164 +- .../Scalar/LowerExpectIntrinsic.cpp | 16 +- lib/Transforms/Scalar/NewGVN.cpp | 38 +- lib/Transforms/Utils/CodeExtractor.cpp | 83 +- lib/Transforms/Utils/PredicateInfo.cpp | 35 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 14 - lib/Transforms/Vectorize/LoopVectorize.cpp | 29 +- .../CFLAliasAnalysis/Andersen/struct.ll | 18 + .../thinlto-function-summary-callgraph.ll | 13 +- test/CodeGen/AArch64/GlobalISel/localizer.mir | 49 + .../AArch64/GlobalISel/select-pr32733.mir | 1 - test/CodeGen/AArch64/addcarry-crash.ll | 23 + test/CodeGen/AArch64/misched-fusion-aes.ll | 70 +- test/CodeGen/AArch64/pr33172.ll | 32 + test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll | 2 +- test/CodeGen/AMDGPU/merge-m0.mir | 1 - test/CodeGen/AMDGPU/sdwa-scalar-ops.mir | 2 - test/CodeGen/AMDGPU/waitcnt-permute.mir | 33 + test/CodeGen/ARM/cmpxchg-O0.ll | 9 +- test/CodeGen/ARM/v6-jumptable-clobber.mir | 2 - test/CodeGen/AVR/rot.ll | 4 +- .../Hexagon/invalid-dotnew-attempt.mir | 17 + .../Hexagon/loop-idiom/pmpy-long-loop.ll | 62 + test/CodeGen/Hexagon/mul64-sext.ll | 93 + test/CodeGen/MIR/Generic/multiRunPass.mir | 4 +- .../Mips/compactbranches/empty-block.mir | 1 - 
test/CodeGen/PowerPC/expand-isel.ll | 15 +- test/CodeGen/PowerPC/logic-ops-on-compares.ll | 130 ++ .../memCmpUsedInZeroEqualityComparison.ll | 121 ++ test/CodeGen/PowerPC/memcmp.ll | 87 + test/CodeGen/PowerPC/memcmpIR.ll | 194 ++ .../PowerPC/ppc64-get-cache-line-size.ll | 49 + test/CodeGen/PowerPC/pristine-and-livein.mir | 330 ---- test/CodeGen/PowerPC/testComparesieqsll.ll | 134 ++ test/CodeGen/PowerPC/testComparesiequll.ll | 134 ++ test/CodeGen/PowerPC/testCompareslleqsll.ll | 133 ++ test/CodeGen/PowerPC/testComparesllequll.ll | 133 ++ test/CodeGen/PowerPC/vec_xxpermdi.ll | 307 ++++ test/CodeGen/Thumb2/tbb-removeadd.mir | 1 - test/CodeGen/X86/2007-01-08-InstrSched.ll | 4 +- .../X86/GlobalISel/irtranslator-call.ll | 1 - test/CodeGen/X86/add-of-carry.ll | 6 +- test/CodeGen/X86/addcarry.ll | 21 +- test/CodeGen/X86/avg.ll | 867 +++++---- test/CodeGen/X86/avx.ll | 4 +- test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 6 +- .../X86/avx512-gather-scatter-intrin.ll | 10 +- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 44 +- test/CodeGen/X86/avx512-intrinsics.ll | 83 +- test/CodeGen/X86/avx512-mask-spills.ll | 40 +- .../X86/avx512bw-intrinsics-upgrade.ll | 12 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 16 +- .../X86/avx512bwvl-intrinsics-upgrade.ll | 24 +- .../X86/avx512cdvl-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512cdvl-intrinsics.ll | 2 +- .../X86/avx512dq-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512dq-intrinsics.ll | 4 +- .../X86/avx512dqvl-intrinsics-upgrade.ll | 10 +- test/CodeGen/X86/avx512dqvl-intrinsics.ll | 4 +- test/CodeGen/X86/avx512ifma-intrinsics.ll | 8 +- test/CodeGen/X86/avx512ifmavl-intrinsics.ll | 16 +- .../X86/avx512vl-intrinsics-upgrade.ll | 64 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 28 +- test/CodeGen/X86/bitcast-and-setcc-128.ll | 631 ++----- test/CodeGen/X86/bitcast-and-setcc-256.ll | 823 ++++++--- test/CodeGen/X86/bitcast-setcc-128.ll | 637 ++----- test/CodeGen/X86/bitcast-setcc-256.ll | 254 +-- test/CodeGen/X86/bswap_tree2.ll | 35 +- test/CodeGen/X86/eh-unknown.ll | 32 + test/CodeGen/X86/fmsubadd-combine.ll | 8 +- test/CodeGen/X86/fold-tied-op.ll | 7 +- test/CodeGen/X86/fp128-i128.ll | 2 +- test/CodeGen/X86/gnu-seh-nolpads.ll | 34 + test/CodeGen/X86/implicit-null-checks.mir | 22 +- test/CodeGen/X86/lrshrink.ll | 57 + test/CodeGen/X86/madd.ll | 34 +- test/CodeGen/X86/misched-matrix.ll | 4 +- test/CodeGen/X86/mul-constant-i16.ll | 141 +- test/CodeGen/X86/mul-constant-i32.ll | 1589 ++-------------- test/CodeGen/X86/mul-constant-i64.ll | 1612 ++--------------- test/CodeGen/X86/oddshuffles.ll | 34 +- test/CodeGen/X86/pmul.ll | 55 +- test/CodeGen/X86/pr32284.ll | 206 ++- test/CodeGen/X86/pr32610.ll | 40 + test/CodeGen/X86/rotate.ll | 16 +- test/CodeGen/X86/sad.ll | 943 +++++----- test/CodeGen/X86/select.ll | 30 +- test/CodeGen/X86/setcc-lowering.ll | 61 +- test/CodeGen/X86/setcc-wide-types.ll | 56 +- test/CodeGen/X86/shrink_vmul_sse.ll | 2 +- test/CodeGen/X86/sse41.ll | 8 +- test/CodeGen/X86/vector-bitreverse.ll | 6 +- test/CodeGen/X86/vector-blend.ll | 4 +- test/CodeGen/X86/x86-interleaved-access.ll | 14 +- test/CodeGen/X86/xchg-nofold.ll | 24 +- test/DebugInfo/MIR/X86/empty-inline.mir | 1 - test/DebugInfo/omit-empty.ll | 1 + .../SanitizerCoverage/coverage-dbg.ll | 4 +- .../SanitizerCoverage/coverage.ll | 11 - test/Instrumentation/SanitizerCoverage/seh.ll | 1 - test/MC/AMDGPU/ds-err.s | 90 + test/MC/AMDGPU/ds.s | 144 +- test/MC/ARM/big-endian-thumb-fixup.s | 1 + test/MC/ARM/mixed-arm-thumb-bl-fixup.ll | 77 + test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt | 
20 +- test/Other/new-pm-defaults.ll | 8 +- test/Other/new-pm-thinlto-defaults.ll | 221 +++ .../X86/{error-newpm.ll => newpm-basic.ll} | 6 +- .../CodeExtractor/PartialInlineAlloca.ll | 68 + .../CodeExtractor/PartialInlineAlloca2.ll | 65 + .../CodeExtractor/PartialInlineAlloca4.ll | 67 + .../CodeExtractor/PartialInlineAlloca5.ll | 67 + .../CodeExtractor/PartialInlineLiveAcross.ll | 61 + .../CodeExtractor/PartialInlineNoLiveOut.ll | 62 + test/Transforms/GVN/PRE/phi-translate-2.ll | 105 -- test/Transforms/GVN/PRE/pre-gep-load.ll | 2 +- test/Transforms/GVN/PRE/pre-load.ll | 6 +- test/Transforms/Inline/AArch64/gep-cost.ll | 25 +- test/Transforms/InstCombine/ctpop.ll | 16 + test/Transforms/InstCombine/intrinsics.ll | 28 + .../AArch64/no_vector_instructions.ll | 49 + .../LowerExpectIntrinsic/expect_nonboolean.ll | 104 ++ test/Transforms/NewGVN/completeness.ll | 17 + test/Transforms/NewGVN/pr33185.ll | 59 + test/Transforms/PGOProfile/branch1.ll | 5 + .../Transforms/ThinLTOBitcodeWriter/new-pm.ll | 9 + .../Transforms/Util/PredicateInfo/condprop.ll | 7 +- .../Util/PredicateInfo/testandor.ll | 27 +- test/tools/llvm-config/cflags.test | 2 +- .../tools/llvm-cvtres/Inputs/test_resource.rc | 6 + .../llvm-cvtres/Inputs/test_resource.res | Bin 2200 -> 2332 bytes test/tools/llvm-cvtres/resource.test | 46 +- tools/bugpoint/OptimizerDriver.cpp | 11 +- tools/llvm-config/BuildVariables.inc.in | 1 + tools/llvm-config/llvm-config.cpp | 4 +- tools/llvm-cvtres/llvm-cvtres.cpp | 32 +- tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 6 +- tools/llvm-objdump/MachODump.cpp | 5 +- tools/llvm-objdump/llvm-objdump.cpp | 5 +- tools/llvm-pdbdump/Analyze.cpp | 2 +- tools/llvm-pdbdump/CMakeLists.txt | 3 +- tools/llvm-pdbdump/CompactTypeDumpVisitor.cpp | 2 +- tools/llvm-pdbdump/PdbYaml.cpp | 200 +- tools/llvm-pdbdump/PdbYaml.h | 207 +-- tools/llvm-pdbdump/YAMLOutputStyle.cpp | 43 +- tools/llvm-pdbdump/YAMLOutputStyle.h | 2 +- tools/llvm-pdbdump/YamlSerializationContext.h | 39 - tools/llvm-pdbdump/YamlSymbolDumper.cpp | 413 ----- tools/llvm-pdbdump/YamlSymbolDumper.h | 66 - tools/llvm-pdbdump/YamlTypeDumper.cpp | 589 ------ tools/llvm-pdbdump/YamlTypeDumper.h | 116 -- tools/llvm-pdbdump/llvm-pdbdump.cpp | 17 +- tools/llvm-readobj/ELFDumper.cpp | 1 + tools/obj2yaml/macho2yaml.cpp | 2 + tools/opt/NewPMDriver.cpp | 15 +- tools/opt/NewPMDriver.h | 10 +- tools/opt/opt.cpp | 6 +- unittests/ADT/ArrayRefTest.cpp | 30 +- unittests/IR/AttributesTest.cpp | 7 + unittests/Support/TargetParserTest.cpp | 4 +- utils/TableGen/AsmMatcherEmitter.cpp | 45 +- utils/TableGen/AsmWriterEmitter.cpp | 46 +- utils/TableGen/Attributes.cpp | 4 +- utils/TableGen/CodeEmitterGen.cpp | 14 +- utils/TableGen/CodeGenDAGPatterns.cpp | 6 +- utils/TableGen/CodeGenDAGPatterns.h | 8 +- utils/TableGen/CodeGenRegisters.cpp | 7 +- utils/TableGen/CodeGenRegisters.h | 4 +- utils/TableGen/CodeGenSchedule.cpp | 2 +- utils/TableGen/FixedLenDecoderEmitter.cpp | 15 +- utils/TableGen/GlobalISelEmitter.cpp | 2 +- utils/TableGen/OptParserEmitter.cpp | 32 +- utils/TableGen/RegisterBankEmitter.cpp | 2 +- utils/TableGen/RegisterInfoEmitter.cpp | 7 +- utils/TableGen/SearchableTableEmitter.cpp | 4 +- utils/TableGen/SubtargetEmitter.cpp | 16 +- utils/TableGen/X86FoldTablesEmitter.cpp | 4 +- 378 files changed, 12500 insertions(+), 9924 deletions(-) rename include/llvm/DebugInfo/CodeView/{CVSymbolTypes.def => CodeViewSymbols.def} (100%) rename include/llvm/DebugInfo/CodeView/{TypeRecords.def => CodeViewTypes.def} (100%) create mode 100644 include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h create 
mode 100644 include/llvm/ObjectYAML/CodeViewYAMLSymbols.h create mode 100644 include/llvm/ObjectYAML/CodeViewYAMLTypes.h create mode 100644 include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h create mode 100644 lib/CodeGen/LiveRangeShrink.cpp create mode 100644 lib/ObjectYAML/CodeViewYAMLDebugSections.cpp create mode 100644 lib/ObjectYAML/CodeViewYAMLSymbols.cpp create mode 100644 lib/ObjectYAML/CodeViewYAMLTypes.cpp create mode 100644 test/Analysis/CFLAliasAnalysis/Andersen/struct.ll create mode 100644 test/CodeGen/AArch64/addcarry-crash.ll create mode 100644 test/CodeGen/AArch64/pr33172.ll create mode 100644 test/CodeGen/AMDGPU/waitcnt-permute.mir create mode 100644 test/CodeGen/Hexagon/invalid-dotnew-attempt.mir create mode 100644 test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll create mode 100644 test/CodeGen/Hexagon/mul64-sext.ll create mode 100644 test/CodeGen/PowerPC/logic-ops-on-compares.ll create mode 100644 test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll create mode 100644 test/CodeGen/PowerPC/memcmp.ll create mode 100644 test/CodeGen/PowerPC/memcmpIR.ll create mode 100644 test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll delete mode 100644 test/CodeGen/PowerPC/pristine-and-livein.mir create mode 100644 test/CodeGen/PowerPC/testComparesieqsll.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequll.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqsll.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequll.ll create mode 100644 test/CodeGen/PowerPC/vec_xxpermdi.ll create mode 100644 test/CodeGen/X86/eh-unknown.ll create mode 100644 test/CodeGen/X86/gnu-seh-nolpads.ll create mode 100644 test/CodeGen/X86/lrshrink.ll create mode 100644 test/CodeGen/X86/pr32610.ll create mode 100644 test/MC/ARM/mixed-arm-thumb-bl-fixup.ll create mode 100644 test/Other/new-pm-thinlto-defaults.ll rename test/ThinLTO/X86/{error-newpm.ll => newpm-basic.ll} (61%) create mode 100644 test/Transforms/CodeExtractor/PartialInlineAlloca.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineAlloca2.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineAlloca4.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineAlloca5.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll delete mode 100644 test/Transforms/GVN/PRE/phi-translate-2.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll create mode 100644 test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll create mode 100644 test/Transforms/NewGVN/pr33185.ll create mode 100644 test/Transforms/ThinLTOBitcodeWriter/new-pm.ll delete mode 100644 tools/llvm-pdbdump/YamlSerializationContext.h delete mode 100644 tools/llvm-pdbdump/YamlSymbolDumper.cpp delete mode 100644 tools/llvm-pdbdump/YamlSymbolDumper.h delete mode 100644 tools/llvm-pdbdump/YamlTypeDumper.cpp delete mode 100644 tools/llvm-pdbdump/YamlTypeDumper.h diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst index 317271af4032..92d6200e169f 100644 --- a/docs/Vectorizers.rst +++ b/docs/Vectorizers.rst @@ -44,12 +44,12 @@ Users can control the vectorization SIMD width using the command line flag "-for $ clang -mllvm -force-vector-width=8 ... $ opt -loop-vectorize -force-vector-width=8 ... -Users can control the unroll factor using the command line flag "-force-vector-unroll" +Users can control the unroll factor using the command line flag "-force-vector-interleave" .. 
code-block:: console - $ clang -mllvm -force-vector-unroll=2 ... - $ opt -loop-vectorize -force-vector-unroll=2 ... + $ clang -mllvm -force-vector-interleave=2 ... + $ opt -loop-vectorize -force-vector-interleave=2 ... Pragma loop hint directives ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 6cbe3a1f515e..7211508e975a 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -454,6 +454,9 @@ class TargetTransformInfo { /// \brief Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; + /// \brief Enable inline expansion of memcmp + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const; + /// \brief Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; @@ -828,6 +831,7 @@ class TargetTransformInfo::Concept { unsigned VF) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; + virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -1047,6 +1051,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override { + return Impl.expandMemCmp(I, MaxLoadSize); + } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index ad1a7cb748fe..d73a60eba850 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -274,6 +274,8 @@ class TargetTransformInfoImplBase { bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; } + bool enableInterleavedAccessVectorization() { return false; } bool isFPVectorizationPotentiallyUnsafe() { return false; } diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index b1ee76159c4b..612779b1ce86 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -85,6 +85,8 @@ template class ArrayRef; const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); + bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); + /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. 
For pointers, if the context instruction and dominator tree are diff --git a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index f32233b3a9e4..e3549d8988cd 100644 --- a/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -396,7 +396,7 @@ class RegisterBankInfo { mutable DenseMap> MapOfInstructionMappings; - /// Create a RegisterBankInfo that can accomodate up to \p NumRegBanks + /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks /// RegisterBank instances. RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 2300a106c358..bc5d2353f63e 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -410,12 +410,22 @@ namespace ISD { /// then the result type must also be a vector type. SETCC, - /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but + /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, and /// op #2 is a *carry value*. This operator checks the result of /// "LHS - RHS - Carry", and can be used to compare two wide integers: /// (setcce lhshi rhshi (subc lhslo rhslo) cc). Only valid for integers. + /// FIXME: This node is deprecated in favor of SETCCCARRY. + /// It is kept around for now to provide a smooth transition path + /// toward the use of SETCCCARRY and will eventually be removed. SETCCE, + /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but + /// op #2 is a boolean indicating if there is an incoming carry. This + /// operator checks the result of "LHS - RHS - Carry", and can be used to + /// compare two wide integers: (setcce lhshi rhshi (subc lhslo rhslo) cc). + /// Only valid for integers. + SETCCCARRY, + /// SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded /// integer shift operations. The operation ordering is: /// [Lo,Hi] = op [LoLHS,HiLHS], Amt diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h index 47b40de6fe1f..30e88fe38ac3 100644 --- a/include/llvm/CodeGen/MIRYamlMapping.h +++ b/include/llvm/CodeGen/MIRYamlMapping.h @@ -381,7 +381,6 @@ struct MachineFunction { StringRef Name; unsigned Alignment = 0; bool ExposesReturnsTwice = false; - bool NoVRegs; // GISel MachineFunctionProperties. bool Legalized = false; bool RegBankSelected = false; @@ -406,7 +405,6 @@ template <> struct MappingTraits { YamlIO.mapRequired("name", MF.Name); YamlIO.mapOptional("alignment", MF.Alignment); YamlIO.mapOptional("exposesReturnsTwice", MF.ExposesReturnsTwice); - YamlIO.mapOptional("noVRegs", MF.NoVRegs); YamlIO.mapOptional("legalized", MF.Legalized); YamlIO.mapOptional("regBankSelected", MF.RegBankSelected); YamlIO.mapOptional("selected", MF.Selected); diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 8da48c379d00..26ed8bb487a2 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -335,6 +335,9 @@ class MachineBasicBlock return make_range(livein_begin(), livein_end()); } + /// Remove entry from the livein set and return iterator to the next. + livein_iterator removeLiveIn(livein_iterator I); + /// Get the clobber mask for the start of this basic block. Funclets use this /// to prevent register allocation across funclet transitions. 
const uint32_t *getBeginClobberMask(const TargetRegisterInfo *TRI) const; diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h index d2036c4a29a5..1705a0f7e59b 100644 --- a/include/llvm/CodeGen/MachineConstantPool.h +++ b/include/llvm/CodeGen/MachineConstantPool.h @@ -1,4 +1,4 @@ -//===-- CodeGen/MachineConstantPool.h - Abstract Constant Pool --*- C++ -*-===// +//===- CodeGen/MachineConstantPool.h - Abstract Constant Pool ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,29 +18,28 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/MC/SectionKind.h" -#include #include #include namespace llvm { class Constant; -class FoldingSetNodeID; class DataLayout; -class TargetMachine; -class Type; +class FoldingSetNodeID; class MachineConstantPool; class raw_ostream; +class Type; /// Abstract base class for all machine specific constantpool value subclasses. /// class MachineConstantPoolValue { virtual void anchor(); + Type *Ty; public: explicit MachineConstantPoolValue(Type *ty) : Ty(ty) {} - virtual ~MachineConstantPoolValue() {} + virtual ~MachineConstantPoolValue() = default; /// getType - get type of this MachineConstantPoolValue. /// @@ -81,6 +80,7 @@ class MachineConstantPoolEntry { : Alignment(A) { Val.ConstVal = V; } + MachineConstantPoolEntry(MachineConstantPoolValue *V, unsigned A) : Alignment(A) { Val.MachineCPVal = V; @@ -153,13 +153,12 @@ class MachineConstantPool { /// print - Used by the MachineFunction printer to print information about /// constant pool objects. Implemented in MachineFunction.cpp - /// void print(raw_ostream &OS) const; /// dump - Call print(cerr) to be called from the debugger. void dump() const; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINECONSTANTPOOL_H diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 5859a4e61fdd..10125864cd90 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineFunction.h --------------------------*- C++ -*-===// +//===- llvm/CodeGen/MachineFunction.h ---------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,38 +18,61 @@ #ifndef LLVM_CODEGEN_MACHINEFUNCTION_H #define LLVM_CODEGEN_MACHINEFUNCTION_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/ilist.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Recycler.h" +#include +#include +#include +#include +#include namespace llvm { -class Value; +class BasicBlock; +class BlockAddress; +class DataLayout; +class DIExpression; +class DILocalVariable; +class DILocation; class Function; -class GCModuleInfo; -class MachineRegisterInfo; -class MachineFrameInfo; +class GlobalValue; class MachineConstantPool; +class 
MachineFrameInfo; +class MachineFunction; class MachineJumpTableInfo; class MachineModuleInfo; +class MachineRegisterInfo; class MCContext; +class MCInstrDesc; class Pass; class PseudoSourceValueManager; +class raw_ostream; +class SlotIndexes; class TargetMachine; -class TargetSubtargetInfo; class TargetRegisterClass; -struct MachinePointerInfo; +class TargetSubtargetInfo; struct WinEHFuncInfo; template <> struct ilist_alloc_traits { @@ -137,27 +160,33 @@ class MachineFunctionProperties { bool hasProperty(Property P) const { return Properties[static_cast(P)]; } + MachineFunctionProperties &set(Property P) { Properties.set(static_cast(P)); return *this; } + MachineFunctionProperties &reset(Property P) { Properties.reset(static_cast(P)); return *this; } + /// Reset all the properties. MachineFunctionProperties &reset() { Properties.reset(); return *this; } + MachineFunctionProperties &set(const MachineFunctionProperties &MFP) { Properties |= MFP.Properties; return *this; } + MachineFunctionProperties &reset(const MachineFunctionProperties &MFP) { Properties.reset(MFP.Properties); return *this; } + // Returns true if all properties set in V (i.e. required by a pass) are set // in this. bool verifyRequiredProperties(const MachineFunctionProperties &V) const { @@ -180,18 +209,17 @@ struct SEHHandler { const BlockAddress *RecoverBA; }; - /// This structure is used to retain landing pad info for the current function. struct LandingPadInfo { MachineBasicBlock *LandingPadBlock; // Landing pad block. SmallVector BeginLabels; // Labels prior to invoke. SmallVector EndLabels; // Labels after invoke. SmallVector SEHHandlers; // SEH handlers active at this lpad. - MCSymbol *LandingPadLabel; // Label at beginning of landing pad. - std::vector TypeIds; // List of type ids (filters negative). + MCSymbol *LandingPadLabel = nullptr; // Label at beginning of landing pad. + std::vector TypeIds; // List of type ids (filters negative). explicit LandingPadInfo(MachineBasicBlock *MBB) - : LandingPadBlock(MBB), LandingPadLabel(nullptr) {} + : LandingPadBlock(MBB) {} }; class MachineFunction { @@ -239,7 +267,7 @@ class MachineFunction { Recycler BasicBlockRecycler; // List of machine basic blocks in function - typedef ilist BasicBlockListType; + using BasicBlockListType = ilist; BasicBlockListType BasicBlocks; /// FunctionNumber - This provides a unique ID for each function emitted in @@ -281,7 +309,7 @@ class MachineFunction { std::vector LandingPads; /// Map a landing pad's EH symbol to the call site indexes. - DenseMap > LPadToCallSiteMap; + DenseMap> LPadToCallSiteMap; /// Map of invoke call site index values to associated begin EH_LABEL. DenseMap CallSiteMap; @@ -303,9 +331,6 @@ class MachineFunction { /// \} - MachineFunction(const MachineFunction &) = delete; - void operator=(const MachineFunction&) = delete; - /// Clear all the members of this MachineFunction, but the ones used /// to initialize again the MachineFunction. /// More specifically, this deallocates all the dynamically allocated @@ -316,8 +341,8 @@ class MachineFunction { /// In particular, the XXXInfo data structure. /// \pre Fn, Target, MMI, and FunctionNumber are properly set. 
void init(); -public: +public: struct VariableDbgInfo { const DILocalVariable *Var; const DIExpression *Expr; @@ -328,11 +353,13 @@ class MachineFunction { unsigned Slot, const DILocation *Loc) : Var(Var), Expr(Expr), Slot(Slot), Loc(Loc) {} }; - typedef SmallVector VariableDbgInfoMapTy; + using VariableDbgInfoMapTy = SmallVector; VariableDbgInfoMapTy VariableDbgInfos; MachineFunction(const Function *Fn, const TargetMachine &TM, unsigned FunctionNum, MachineModuleInfo &MMI); + MachineFunction(const MachineFunction &) = delete; + MachineFunction &operator=(const MachineFunction &) = delete; ~MachineFunction(); /// Reset the instance as if it was just created. @@ -350,19 +377,15 @@ class MachineFunction { const DataLayout &getDataLayout() const; /// getFunction - Return the LLVM function that this machine code represents - /// const Function *getFunction() const { return Fn; } /// getName - Return the name of the corresponding LLVM function. - /// StringRef getName() const; /// getFunctionNumber - Return a unique ID for the current function. - /// unsigned getFunctionNumber() const { return FunctionNumber; } /// getTarget - Return the target machine this machine code is compiled with - /// const TargetMachine &getTarget() const { return Target; } /// getSubtarget - Return the subtarget for which this machine code is being @@ -378,14 +401,12 @@ class MachineFunction { } /// getRegInfo - Return information about the registers currently in use. - /// MachineRegisterInfo &getRegInfo() { return *RegInfo; } const MachineRegisterInfo &getRegInfo() const { return *RegInfo; } /// getFrameInfo - Return the frame info object for the current function. /// This object contains information about objects allocated on the stack /// frame of the current function in an abstract way. - /// MachineFrameInfo &getFrameInfo() { return *FrameInfo; } const MachineFrameInfo &getFrameInfo() const { return *FrameInfo; } @@ -402,7 +423,6 @@ class MachineFunction { /// getConstantPool - Return the constant pool object for the current /// function. - /// MachineConstantPool *getConstantPool() { return ConstantPool; } const MachineConstantPool *getConstantPool() const { return ConstantPool; } @@ -413,11 +433,9 @@ class MachineFunction { WinEHFuncInfo *getWinEHFuncInfo() { return WinEHInfo; } /// getAlignment - Return the alignment (log2, not bytes) of the function. - /// unsigned getAlignment() const { return Alignment; } /// setAlignment - Set the alignment (log2, not bytes) of the function. - /// void setAlignment(unsigned A) { Alignment = A; } /// ensureAlignment - Make sure the function is at least 1 << A bytes aligned. @@ -487,7 +505,6 @@ class MachineFunction { bool shouldSplitStack() const; /// getNumBlockIDs - Return the number of MBB ID's allocated. - /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } /// RenumberBlocks - This discards all of the MachineBasicBlock numbers and @@ -499,7 +516,6 @@ class MachineFunction { /// print - Print out the MachineFunction in a format suitable for debugging /// to the specified stream. - /// void print(raw_ostream &OS, const SlotIndexes* = nullptr) const; /// viewCFG - This function is meant for use from the debugger. You can just @@ -507,7 +523,6 @@ class MachineFunction { /// program, displaying the CFG of the current function with the code for each /// basic block inside. This depends on there being a 'dot' and 'gv' program /// in your path. - /// void viewCFG() const; /// viewCFGOnly - This function is meant for use from the debugger. 
It works @@ -518,7 +533,6 @@ class MachineFunction { void viewCFGOnly() const; /// dump - Print the current MachineFunction to cerr, useful for debugger use. - /// void dump() const; /// Run the current MachineFunction through the machine code verifier, useful @@ -528,10 +542,10 @@ class MachineFunction { bool AbortOnError = true) const; // Provide accessors for the MachineBasicBlock list... - typedef BasicBlockListType::iterator iterator; - typedef BasicBlockListType::const_iterator const_iterator; - typedef BasicBlockListType::const_reverse_iterator const_reverse_iterator; - typedef BasicBlockListType::reverse_iterator reverse_iterator; + using iterator = BasicBlockListType::iterator; + using const_iterator = BasicBlockListType::const_iterator; + using const_reverse_iterator = BasicBlockListType::const_reverse_iterator; + using reverse_iterator = BasicBlockListType::reverse_iterator; /// Support for MachineBasicBlock::getNextNode(). static BasicBlockListType MachineFunction::* @@ -590,11 +604,9 @@ class MachineFunction { //===--------------------------------------------------------------------===// // Internal functions used to automatically number MachineBasicBlocks - // /// \brief Adds the MBB to the internal numbering. Returns the unique number /// assigned to the MBB. - /// unsigned addToMBBNumbering(MachineBasicBlock *MBB) { MBBNumbering.push_back(MBB); return (unsigned)MBBNumbering.size()-1; @@ -610,7 +622,6 @@ class MachineFunction { /// CreateMachineInstr - Allocate a new MachineInstr. Use this instead /// of `new MachineInstr'. - /// MachineInstr *CreateMachineInstr(const MCInstrDesc &MCID, const DebugLoc &DL, bool NoImp = false); @@ -623,16 +634,13 @@ class MachineFunction { MachineInstr *CloneMachineInstr(const MachineInstr *Orig); /// DeleteMachineInstr - Delete the given MachineInstr. - /// void DeleteMachineInstr(MachineInstr *MI); /// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this /// instead of `new MachineBasicBlock'. - /// MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = nullptr); /// DeleteMachineBasicBlock - Delete the given MachineBasicBlock. - /// void DeleteMachineBasicBlock(MachineBasicBlock *MBB); /// getMachineMemOperand - Allocate a new MachineMemOperand. @@ -653,7 +661,7 @@ class MachineFunction { MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size); - typedef ArrayRecycler::Capacity OperandCapacity; + using OperandCapacity = ArrayRecycler::Capacity; /// Allocate an array of MachineOperands. This is only intended for use by /// internal MachineInstr functions. @@ -700,7 +708,6 @@ class MachineFunction { //===--------------------------------------------------------------------===// // Label Manipulation. - // /// getJTISymbol - Return the MCSymbol for the specified non-empty jump table. 
/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a @@ -858,13 +865,16 @@ template <> struct GraphTraits : static NodeRef getEntryNode(MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph - typedef pointer_iterator nodes_iterator; + using nodes_iterator = pointer_iterator; + static nodes_iterator nodes_begin(MachineFunction *F) { return nodes_iterator(F->begin()); } + static nodes_iterator nodes_end(MachineFunction *F) { return nodes_iterator(F->end()); } + static unsigned size (MachineFunction *F) { return F->size(); } }; template <> struct GraphTraits : @@ -872,37 +882,39 @@ template <> struct GraphTraits : static NodeRef getEntryNode(const MachineFunction *F) { return &F->front(); } // nodes_iterator/begin/end - Allow iteration over all nodes in the graph - typedef pointer_iterator nodes_iterator; + using nodes_iterator = pointer_iterator; + static nodes_iterator nodes_begin(const MachineFunction *F) { return nodes_iterator(F->begin()); } + static nodes_iterator nodes_end (const MachineFunction *F) { return nodes_iterator(F->end()); } + static unsigned size (const MachineFunction *F) { return F->size(); } }; - // Provide specializations of GraphTraits to be able to treat a function as a // graph of basic blocks... and to walk it in inverse order. Inverse order for // a function is considered to be when traversing the predecessor edges of a BB // instead of the successor edges. // -template <> struct GraphTraits > : - public GraphTraits > { +template <> struct GraphTraits> : + public GraphTraits> { static NodeRef getEntryNode(Inverse G) { return &G.Graph->front(); } }; -template <> struct GraphTraits > : - public GraphTraits > { +template <> struct GraphTraits> : + public GraphTraits> { static NodeRef getEntryNode(Inverse G) { return &G.Graph->front(); } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEFUNCTION_H diff --git a/include/llvm/CodeGen/MachineFunctionInitializer.h b/include/llvm/CodeGen/MachineFunctionInitializer.h index c644c9783e2f..0fbcb480b1ab 100644 --- a/include/llvm/CodeGen/MachineFunctionInitializer.h +++ b/include/llvm/CodeGen/MachineFunctionInitializer.h @@ -1,4 +1,4 @@ -//===- MachineFunctionInitializer.h - machine function initializer ---------===// +//=- MachineFunctionInitializer.h - machine function initializer --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -25,7 +25,7 @@ class MachineFunctionInitializer { virtual void anchor(); public: - virtual ~MachineFunctionInitializer() {} + virtual ~MachineFunctionInitializer() = default; /// Initialize the machine function. 
/// @@ -35,4 +35,4 @@ class MachineFunctionInitializer { } // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEFUNCTIONINITIALIZER_H diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 8d040beff7a6..95401e98b297 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineInstr.h - MachineInstr class --------*- C++ -*-===// +//===- llvm/CodeGen/MachineInstr.h - MachineInstr class ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,6 @@ #define LLVM_CODEGEN_MACHINEINSTR_H #include "llvm/ADT/DenseMapInfo.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" @@ -28,19 +27,27 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/ArrayRecycler.h" #include "llvm/Target/TargetOpcodes.h" +#include +#include +#include +#include namespace llvm { -class StringRef; template class ArrayRef; -template class SmallVectorImpl; -class DILocalVariable; class DIExpression; +class DILocalVariable; +class MachineBasicBlock; +class MachineFunction; +class MachineMemOperand; +class MachineRegisterInfo; +class ModuleSlotTracker; +class raw_ostream; +template class SmallVectorImpl; +class StringRef; class TargetInstrInfo; class TargetRegisterClass; class TargetRegisterInfo; -class MachineFunction; -class MachineMemOperand; //===----------------------------------------------------------------------===// /// Representation of each machine instruction. @@ -53,7 +60,7 @@ class MachineInstr : public ilist_node_with_parent> { public: - typedef MachineMemOperand **mmo_iterator; + using mmo_iterator = MachineMemOperand **; /// Flags to specify different kinds of comments to output in /// assembly code. These flags carry semantic information not @@ -72,43 +79,39 @@ class MachineInstr BundledPred = 1 << 2, // Instruction has bundled predecessors. BundledSucc = 1 << 3 // Instruction has bundled successors. }; + private: const MCInstrDesc *MCID; // Instruction descriptor. - MachineBasicBlock *Parent; // Pointer to the owning basic block. + MachineBasicBlock *Parent = nullptr; // Pointer to the owning basic block. // Operands are allocated by an ArrayRecycler. - MachineOperand *Operands; // Pointer to the first operand. - unsigned NumOperands; // Number of operands on instruction. - typedef ArrayRecycler::Capacity OperandCapacity; + MachineOperand *Operands = nullptr; // Pointer to the first operand. + unsigned NumOperands = 0; // Number of operands on instruction. + using OperandCapacity = ArrayRecycler::Capacity; OperandCapacity CapOperands; // Capacity of the Operands array. - uint8_t Flags; // Various bits of additional + uint8_t Flags = 0; // Various bits of additional // information about machine // instruction. - uint8_t AsmPrinterFlags; // Various bits of information used by + uint8_t AsmPrinterFlags = 0; // Various bits of information used by // the AsmPrinter to emit helpful // comments. This is *not* semantic // information. Do not use this for // anything other than to convey comment // information to AsmPrinter. - uint8_t NumMemRefs; // Information on memory references. + uint8_t NumMemRefs = 0; // Information on memory references. // Note that MemRefs == nullptr, means 'don't know', not 'no memory access'. // Calling code must treat missing information conservatively. 
If the number // of memory operands required to be precise exceeds the maximum value of // NumMemRefs - currently 256 - we remove the operands entirely. Note also // that this is a non-owning reference to a shared copy on write buffer owned // by the MachineFunction and created via MF.allocateMemRefsArray. - mmo_iterator MemRefs; + mmo_iterator MemRefs = nullptr; DebugLoc debugLoc; // Source line information. - MachineInstr(const MachineInstr&) = delete; - void operator=(const MachineInstr&) = delete; - // Use MachineFunction::DeleteMachineInstr() instead. - ~MachineInstr() = delete; - // Intrusive list support friend struct ilist_traits; friend struct ilist_callback_traits; @@ -128,6 +131,11 @@ class MachineInstr friend class MachineFunction; public: + MachineInstr(const MachineInstr &) = delete; + MachineInstr &operator=(const MachineInstr &) = delete; + // Use MachineFunction::DeleteMachineInstr() instead. + ~MachineInstr() = delete; + const MachineBasicBlock* getParent() const { return Parent; } MachineBasicBlock* getParent() { return Parent; } @@ -178,7 +186,6 @@ class MachineInstr Flags &= ~((uint8_t)Flag); } - /// Return true if MI is in a bundle (but not the first MI in a bundle). /// /// A bundle looks like this before it's finalized: @@ -263,7 +270,6 @@ class MachineInstr /// earlier. /// /// If this method returns, the caller should try to recover from the error. - /// void emitError(StringRef Msg) const; /// Returns the target instruction descriptor of this MachineInstr. @@ -273,7 +279,6 @@ class MachineInstr unsigned getOpcode() const { return MCID->Opcode; } /// Access to explicit operands of the instruction. - /// unsigned getNumOperands() const { return NumOperands; } const MachineOperand& getOperand(unsigned i) const { @@ -289,8 +294,8 @@ class MachineInstr unsigned getNumExplicitOperands() const; /// iterator/begin/end - Iterate over all operands of a machine instruction. - typedef MachineOperand *mop_iterator; - typedef const MachineOperand *const_mop_iterator; + using mop_iterator = MachineOperand *; + using const_mop_iterator = const MachineOperand *; mop_iterator operands_begin() { return Operands; } mop_iterator operands_end() { return Operands + NumOperands; } @@ -713,7 +718,6 @@ class MachineInstr return hasProperty(MCID::ExtraDefRegAllocReq, Type); } - enum MICheckType { CheckDefs, // Check all operands for equality CheckKillDead, // Check all operands including kill / dead markers @@ -767,6 +771,7 @@ class MachineInstr /// Returns true if the MachineInstr represents a label. bool isLabel() const { return isEHLabel() || isGCLabel(); } + bool isCFIInstruction() const { return getOpcode() == TargetOpcode::CFI_INSTRUCTION; } @@ -775,6 +780,7 @@ class MachineInstr bool isPosition() const { return isLabel() || isCFIInstruction(); } bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; } + /// A DBG_VALUE is indirect iff the first operand is a register and /// the second operand is an immediate. 
bool isIndirectDebugValue() const { @@ -787,29 +793,38 @@ class MachineInstr bool isKill() const { return getOpcode() == TargetOpcode::KILL; } bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; } bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; } + bool isMSInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM && getInlineAsmDialect(); } + bool isStackAligningInlineAsm() const; InlineAsm::AsmDialect getInlineAsmDialect() const; + bool isInsertSubreg() const { return getOpcode() == TargetOpcode::INSERT_SUBREG; } + bool isSubregToReg() const { return getOpcode() == TargetOpcode::SUBREG_TO_REG; } + bool isRegSequence() const { return getOpcode() == TargetOpcode::REG_SEQUENCE; } + bool isBundle() const { return getOpcode() == TargetOpcode::BUNDLE; } + bool isCopy() const { return getOpcode() == TargetOpcode::COPY; } + bool isFullCopy() const { return isCopy() && !getOperand(0).getSubReg() && !getOperand(1).getSubReg(); } + bool isExtractSubreg() const { return getOpcode() == TargetOpcode::EXTRACT_SUBREG; } @@ -978,7 +993,6 @@ class MachineInstr /// /// The flag operand is an immediate that can be decoded with methods like /// InlineAsm::hasRegClassConstraint(). - /// int findInlineAsmFlagIdx(unsigned OpIdx, unsigned *GroupNo = nullptr) const; /// Compute the static register class constraint for operand OpIdx. @@ -987,7 +1001,6 @@ class MachineInstr /// /// Returns NULL if the static register class constraint cannot be /// determined. - /// const TargetRegisterClass* getRegClassConstraint(unsigned OpIdx, const TargetInstrInfo *TII, @@ -1328,6 +1341,6 @@ inline raw_ostream& operator<<(raw_ostream &OS, const MachineInstr &MI) { return OS; } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEINSTR_H diff --git a/include/llvm/CodeGen/MachineInstrBundleIterator.h b/include/llvm/CodeGen/MachineInstrBundleIterator.h index 3104185385ea..5fe4964ff116 100644 --- a/include/llvm/CodeGen/MachineInstrBundleIterator.h +++ b/include/llvm/CodeGen/MachineInstrBundleIterator.h @@ -15,34 +15,37 @@ #define LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H #include "llvm/ADT/ilist.h" +#include "llvm/ADT/simple_ilist.h" +#include #include +#include namespace llvm { template struct MachineInstrBundleIteratorTraits; template struct MachineInstrBundleIteratorTraits { - typedef simple_ilist> list_type; - typedef typename list_type::iterator instr_iterator; - typedef typename list_type::iterator nonconst_instr_iterator; - typedef typename list_type::const_iterator const_instr_iterator; + using list_type = simple_ilist>; + using instr_iterator = typename list_type::iterator; + using nonconst_instr_iterator = typename list_type::iterator; + using const_instr_iterator = typename list_type::const_iterator; }; template struct MachineInstrBundleIteratorTraits { - typedef simple_ilist> list_type; - typedef typename list_type::reverse_iterator instr_iterator; - typedef typename list_type::reverse_iterator nonconst_instr_iterator; - typedef typename list_type::const_reverse_iterator const_instr_iterator; + using list_type = simple_ilist>; + using instr_iterator = typename list_type::reverse_iterator; + using nonconst_instr_iterator = typename list_type::reverse_iterator; + using const_instr_iterator = typename list_type::const_reverse_iterator; }; template struct MachineInstrBundleIteratorTraits { - typedef simple_ilist> list_type; - typedef typename list_type::const_iterator instr_iterator; - typedef typename list_type::iterator 
nonconst_instr_iterator; - typedef typename list_type::const_iterator const_instr_iterator; + using list_type = simple_ilist>; + using instr_iterator = typename list_type::const_iterator; + using nonconst_instr_iterator = typename list_type::iterator; + using const_instr_iterator = typename list_type::const_iterator; }; template struct MachineInstrBundleIteratorTraits { - typedef simple_ilist> list_type; - typedef typename list_type::const_reverse_iterator instr_iterator; - typedef typename list_type::reverse_iterator nonconst_instr_iterator; - typedef typename list_type::const_reverse_iterator const_instr_iterator; + using list_type = simple_ilist>; + using instr_iterator = typename list_type::const_reverse_iterator; + using nonconst_instr_iterator = typename list_type::reverse_iterator; + using const_instr_iterator = typename list_type::const_reverse_iterator; }; template struct MachineInstrBundleIteratorHelper; @@ -104,27 +107,27 @@ template <> struct MachineInstrBundleIteratorHelper { /// inside bundles (i.e. walk top level MIs only). template class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper { - typedef MachineInstrBundleIteratorTraits Traits; - typedef typename Traits::instr_iterator instr_iterator; + using Traits = MachineInstrBundleIteratorTraits; + using instr_iterator = typename Traits::instr_iterator; + instr_iterator MII; public: - typedef typename instr_iterator::value_type value_type; - typedef typename instr_iterator::difference_type difference_type; - typedef typename instr_iterator::pointer pointer; - typedef typename instr_iterator::reference reference; - typedef std::bidirectional_iterator_tag iterator_category; - - typedef typename instr_iterator::const_pointer const_pointer; - typedef typename instr_iterator::const_reference const_reference; + using value_type = typename instr_iterator::value_type; + using difference_type = typename instr_iterator::difference_type; + using pointer = typename instr_iterator::pointer; + using reference = typename instr_iterator::reference; + using const_pointer = typename instr_iterator::const_pointer; + using const_reference = typename instr_iterator::const_reference; + using iterator_category = std::bidirectional_iterator_tag; private: - typedef typename Traits::nonconst_instr_iterator nonconst_instr_iterator; - typedef typename Traits::const_instr_iterator const_instr_iterator; - typedef MachineInstrBundleIterator< - typename nonconst_instr_iterator::value_type, IsReverse> - nonconst_iterator; - typedef MachineInstrBundleIterator reverse_iterator; + using nonconst_instr_iterator = typename Traits::nonconst_instr_iterator; + using const_instr_iterator = typename Traits::const_instr_iterator; + using nonconst_iterator = + MachineInstrBundleIterator; + using reverse_iterator = MachineInstrBundleIterator; public: MachineInstrBundleIterator(instr_iterator MI) : MII(MI) { @@ -138,12 +141,14 @@ class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper { "MachineInstrBundleIterator with a " "bundled MI"); } + MachineInstrBundleIterator(pointer MI) : MII(MI) { // FIXME: This conversion should be explicit. assert((!MI || !MI->isBundledWithPred()) && "It's not legal to initialize " "MachineInstrBundleIterator " "with a bundled MI"); } + // Template allows conversion from const to nonconst. 
template MachineInstrBundleIterator( @@ -151,6 +156,7 @@ class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper { typename std::enable_if::value, void *>::type = nullptr) : MII(I.getInstrIterator()) {} + MachineInstrBundleIterator() : MII(nullptr) {} /// Explicit conversion between forward/reverse iterators. @@ -280,4 +286,4 @@ class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper { } // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h index 5c814f22f99b..58cffaade9d2 100644 --- a/include/llvm/CodeGen/MachineLoopInfo.h +++ b/include/llvm/CodeGen/MachineLoopInfo.h @@ -33,6 +33,8 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" namespace llvm { @@ -71,6 +73,7 @@ class MachineLoop : public LoopBase { private: friend class LoopInfoBase; + explicit MachineLoop(MachineBasicBlock *MBB) : LoopBase(MBB) {} }; @@ -79,11 +82,9 @@ class MachineLoop : public LoopBase { extern template class LoopInfoBase; class MachineLoopInfo : public MachineFunctionPass { - LoopInfoBase LI; friend class LoopBase; - void operator=(const MachineLoopInfo &) = delete; - MachineLoopInfo(const MachineLoopInfo &) = delete; + LoopInfoBase LI; public: static char ID; // Pass identification, replacement for typeid @@ -91,6 +92,8 @@ class MachineLoopInfo : public MachineFunctionPass { MachineLoopInfo() : MachineFunctionPass(ID) { initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry()); } + MachineLoopInfo(const MachineLoopInfo &) = delete; + MachineLoopInfo &operator=(const MachineLoopInfo &) = delete; LoopInfoBase& getBase() { return LI; } @@ -103,7 +106,7 @@ class MachineLoopInfo : public MachineFunctionPass { bool SpeculativePreheader = false) const; /// The iterator interface to the top-level loops in the current function. - typedef LoopInfoBase::iterator iterator; + using iterator = LoopInfoBase::iterator; inline iterator begin() const { return LI.begin(); } inline iterator end() const { return LI.end(); } bool empty() const { return LI.empty(); } @@ -166,11 +169,10 @@ class MachineLoopInfo : public MachineFunctionPass { } }; - // Allow clients to walk the list of nested loops... 
template <> struct GraphTraits<const MachineLoop*> { - typedef const MachineLoop *NodeRef; - typedef MachineLoopInfo::iterator ChildIteratorType; + using NodeRef = const MachineLoop *; + using ChildIteratorType = MachineLoopInfo::iterator; static NodeRef getEntryNode(const MachineLoop *L) { return L; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } @@ -178,14 +180,14 @@ template <> struct GraphTraits<const MachineLoop*> { }; template <> struct GraphTraits<MachineLoop*> { - typedef MachineLoop *NodeRef; - typedef MachineLoopInfo::iterator ChildIteratorType; + using NodeRef = MachineLoop *; + using ChildIteratorType = MachineLoopInfo::iterator; static NodeRef getEntryNode(MachineLoop *L) { return L; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } static ChildIteratorType child_end(NodeRef N) { return N->end(); } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINELOOPINFO_H diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index f46ef41879d1..d4ac58c3bd22 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -31,35 +31,26 @@ #ifndef LLVM_CODEGEN_MACHINEMODULEINFO_H #define LLVM_CODEGEN_MACHINEMODULEINFO_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Pass.h" -#include "llvm/Support/DataTypes.h" +#include <memory> +#include <utility> +#include <vector> namespace llvm { -//===----------------------------------------------------------------------===// -// Forward declarations. -class BlockAddress; +class BasicBlock; class CallInst; -class Constant; -class GlobalVariable; -class LandingPadInst; -class MDNode; -class MMIAddrLabelMap; -class MachineBasicBlock; +class Function; class MachineFunction; class MachineFunctionInitializer; +class MMIAddrLabelMap; class Module; -class PointerType; -class StructType; +class TargetMachine; //===----------------------------------------------------------------------===// /// This class can be derived from and used by targets to hold private /// target-specific information for each Module. /// class MachineModuleInfoImpl { public: - typedef PointerIntPair<MCSymbol *, 1, bool> StubValueTy; - virtual ~MachineModuleInfoImpl(); - typedef std::vector<std::pair<MCSymbol *, StubValueTy>> SymbolListTy; -protected: + using StubValueTy = PointerIntPair<MCSymbol *, 1, bool>; + using SymbolListTy = std::vector<std::pair<MCSymbol *, StubValueTy>>; + virtual ~MachineModuleInfoImpl(); + +protected: /// Return the entries from a DenseMap in a deterministic sorted order. /// Clears the map. static SymbolListTy getSortedStubs(DenseMap<MCSymbol *, StubValueTy> &); @@ -252,6 +244,6 @@ class MachineModuleInfo : public ImmutablePass { /// which will link in MSVCRT's floating-point support. void computeUsesVAFloatArgument(const CallInst &I, MachineModuleInfo &MMI); -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEMODULEINFO_H diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index f3e04cffcda6..3bcfc1c4254b 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -133,6 +133,10 @@ namespace llvm { // instruction and update the MachineFunctionInfo with that information. extern char &ShrinkWrapID; + /// LiveRangeShrink pass. Moves instructions closer to their definitions to + /// shrink the definition's live range.
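For context, a hedged sketch of how a pass published through an ID like the one declared next is typically made available: the initializer declared in InitializePasses.h (later in this patch) registers it with the global PassRegistry before any pass manager resolves it by ID. Minimal, assuming an LLVM build that contains this patch:

    // Register LiveRangeShrink with the global pass registry. The
    // initializeLiveRangeShrinkPass declaration is added to
    // llvm/InitializePasses.h by this same patch.
    #include "llvm/InitializePasses.h"
    #include "llvm/PassRegistry.h"

    int main() {
      llvm::initializeLiveRangeShrinkPass(*llvm::PassRegistry::getPassRegistry());
      return 0;
    }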
+ extern char &LiveRangeShrinkID; + /// Greedy register allocator. extern char &RAGreedyID; diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h index f0c826dc1d45..fcf1937c186e 100644 --- a/include/llvm/CodeGen/TargetPassConfig.h +++ b/include/llvm/CodeGen/TargetPassConfig.h @@ -22,7 +22,7 @@ namespace llvm { class PassConfigImpl; class ScheduleDAGInstrs; -class TargetMachine; +class LLVMTargetMachine; struct MachineSchedContext; // The old pass manager infrastructure is hidden in a legacy namespace now. @@ -103,7 +103,7 @@ class TargetPassConfig : public ImmutablePass { bool AddingMachinePasses; protected: - TargetMachine *TM; + LLVMTargetMachine *TM; PassConfigImpl *Impl; // Internal data structures bool Initialized; // Flagged after all passes are configured. @@ -120,7 +120,7 @@ class TargetPassConfig : public ImmutablePass { bool RequireCodeGenSCCOrder; public: - TargetPassConfig(TargetMachine *tm, PassManagerBase &pm); + TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm); // Dummy constructor. TargetPassConfig(); diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h index 3316e71916ed..4e8c8feb7a12 100644 --- a/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/include/llvm/DebugInfo/CodeView/CodeView.h @@ -6,6 +6,10 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// Defines constants and basic types describing CodeView debug information. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_DEBUGINFO_CODEVIEW_CODEVIEW_H #define LLVM_DEBUGINFO_CODEVIEW_CODEVIEW_H @@ -22,28 +26,28 @@ namespace codeview { /// documentation and headers talk about this as the "leaf" type. enum class TypeRecordKind : uint16_t { #define TYPE_RECORD(lf_ename, value, name) name = value, -#include "TypeRecords.def" +#include "CodeViewTypes.def" }; /// Duplicate copy of the above enum, but using the official CV names. Useful /// for reference purposes and when dealing with unknown record types. enum TypeLeafKind : uint16_t { #define CV_TYPE(name, val) name = val, -#include "TypeRecords.def" +#include "CodeViewTypes.def" }; /// Distinguishes individual records in the Symbols subsection of a .debug$S /// section. Equivalent to SYM_ENUM_e in cvinfo.h. enum class SymbolRecordKind : uint16_t { #define SYMBOL_RECORD(lf_ename, value, name) name = value, -#include "CVSymbolTypes.def" +#include "CodeViewSymbols.def" }; /// Duplicate copy of the above enum, but using the official CV names. Useful /// for reference purposes and when dealing with unknown record types. enum SymbolKind : uint16_t { #define CV_SYMBOL(name, val) name = val, -#include "CVSymbolTypes.def" +#include "CodeViewSymbols.def" }; #define CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(Class) \ @@ -280,7 +284,7 @@ CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(MethodOptions) /// Equivalent to CV_LABEL_TYPE_e. enum class LabelType : uint16_t { Near = 0x0, - Far = 0x4, + Far = 0x4, }; /// Equivalent to CV_modifier_t. 
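The .def renames above feed the X-macro idiom these enums rely on: each include site defines TYPE_RECORD/CV_TYPE (or SYMBOL_RECORD/CV_SYMBOL) to project the shared record list into a different declaration. A minimal self-contained sketch of the idiom, with made-up shape records standing in for the real lists in CodeViewTypes.def and CodeViewSymbols.def:

    // X-macro sketch: one record list, two projections.
    #include <cstdint>
    #include <cstdio>

    // Stand-in for the contents of a .def file.
    #define SHAPE_RECORDS(X) \
      X(Circle, 0x10)        \
      X(Square, 0x11)

    // First expansion: an enum, mirroring TypeRecordKind/TypeLeafKind.
    enum class ShapeRecordKind : uint16_t {
    #define SHAPE(name, value) name = value,
      SHAPE_RECORDS(SHAPE)
    #undef SHAPE
    };

    // Second expansion of the same list, e.g. for printable names.
    const char *shapeName(ShapeRecordKind K) {
      switch (K) {
    #define SHAPE(name, value) case ShapeRecordKind::name: return #name;
      SHAPE_RECORDS(SHAPE)
    #undef SHAPE
      }
      return "unknown";
    }

    int main() { std::printf("%s\n", shapeName(ShapeRecordKind::Circle)); }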
diff --git a/include/llvm/DebugInfo/CodeView/CVSymbolTypes.def b/include/llvm/DebugInfo/CodeView/CodeViewSymbols.def similarity index 100% rename from include/llvm/DebugInfo/CodeView/CVSymbolTypes.def rename to include/llvm/DebugInfo/CodeView/CodeViewSymbols.def diff --git a/include/llvm/DebugInfo/CodeView/TypeRecords.def b/include/llvm/DebugInfo/CodeView/CodeViewTypes.def similarity index 100% rename from include/llvm/DebugInfo/CodeView/TypeRecords.def rename to include/llvm/DebugInfo/CodeView/CodeViewTypes.def diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h index c1a5152930ff..428ff153d5d1 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h @@ -34,6 +34,17 @@ class SymbolDeserializer : public SymbolVisitorCallbacks { }; public: + template static Error deserializeAs(CVSymbol Symbol, T &Record) { + SymbolDeserializer S(nullptr); + if (auto EC = S.visitSymbolBegin(Symbol)) + return EC; + if (auto EC = S.visitKnownRecord(Symbol, Record)) + return EC; + if (auto EC = S.visitSymbolEnd(Symbol)) + return EC; + return Error::success(); + } + explicit SymbolDeserializer(SymbolVisitorDelegate *Delegate) : Delegate(Delegate) {} @@ -54,7 +65,7 @@ class SymbolDeserializer : public SymbolVisitorCallbacks { return visitKnownRecordImpl(CVR, Record); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: template Error visitKnownRecordImpl(CVSymbol &CVR, T &Record) { diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index c5a5549bf818..a3e4dff647bd 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -35,8 +35,6 @@ class SymbolRecord { public: SymbolRecordKind getKind() const { return Kind; } - -private: SymbolRecordKind Kind; }; diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h index 0a1837a0d935..5d072a3b2723 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h @@ -29,7 +29,7 @@ class SymbolRecordMapping : public SymbolVisitorCallbacks { #define SYMBOL_RECORD(EnumName, EnumVal, Name) \ Error visitKnownRecord(CVSymbol &CVR, Name &Record) override; #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: Optional Kind; diff --git a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h index f2e99bd83326..a8fe1a3ae1d0 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h +++ b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h @@ -45,6 +45,17 @@ class SymbolSerializer : public SymbolVisitorCallbacks { } public: + template + static CVSymbol writeOneSymbol(SymType &Sym, BumpPtrAllocator &Storage) { + CVSymbol Result; + Result.Type = static_cast(Sym.Kind); + SymbolSerializer Serializer(Storage); + consumeError(Serializer.visitSymbolBegin(Result)); + consumeError(Serializer.visitKnownRecord(Result, Sym)); + consumeError(Serializer.visitSymbolEnd(Result)); + return Result; + } + explicit SymbolSerializer(BumpPtrAllocator &Storage); virtual Error visitSymbolBegin(CVSymbol &Record) override; @@ -55,7 +66,7 @@ class SymbolSerializer : public 
SymbolVisitorCallbacks { return visitKnownRecordImpl(CVR, Record); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: template diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h index 96a93bf7e576..5f4205bd6e08 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h @@ -59,7 +59,7 @@ class SymbolVisitorCallbackPipeline : public SymbolVisitorCallbacks { return Error::success(); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" private: std::vector Pipeline; diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h index aaa9d2e85e13..2ef7eabdaa9d 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h @@ -39,7 +39,7 @@ class SymbolVisitorCallbacks { return Error::success(); \ } #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "CVSymbolTypes.def" +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" }; } // end namespace codeview diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h index c064e19a7e90..77dbc91a7d38 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h @@ -39,7 +39,7 @@ class TypeDatabaseVisitor : public TypeVisitorCallbacks { Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: StringRef getTypeName(TypeIndex Index) const; diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index a9c5cf42fc5b..965cdfd85f48 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -41,6 +41,7 @@ class TypeDeserializer : public TypeVisitorCallbacks { TypeDeserializer() = default; template static Error deserializeAs(CVType &CVT, T &Record) { + Record.Kind = static_cast(CVT.kind()); MappingInfo I(CVT.content()); if (auto EC = I.Mapping.visitTypeBegin(CVT)) return EC; @@ -75,7 +76,7 @@ class TypeDeserializer : public TypeVisitorCallbacks { #define MEMBER_RECORD(EnumName, EnumVal, Name) #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template @@ -127,7 +128,7 @@ class FieldListDeserializer : public TypeVisitorCallbacks { } #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h index 65b3a33e6548..afb8b3636361 100644 --- 
a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -58,7 +58,7 @@ class TypeDumpVisitor : public TypeVisitorCallbacks { Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: void printMemberAttributes(MemberAttributes Attrs); diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 92745ebfcded..3a64a437aa4d 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -123,13 +123,13 @@ class TypeRecord { public: TypeRecordKind getKind() const { return Kind; } -private: TypeRecordKind Kind; }; // LF_MODIFIER class ModifierRecord : public TypeRecord { public: + ModifierRecord() = default; explicit ModifierRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ModifierRecord(TypeIndex ModifiedType, ModifierOptions Modifiers) : TypeRecord(TypeRecordKind::Modifier), ModifiedType(ModifiedType), @@ -145,6 +145,7 @@ class ModifierRecord : public TypeRecord { // LF_PROCEDURE class ProcedureRecord : public TypeRecord { public: + ProcedureRecord() = default; explicit ProcedureRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ProcedureRecord(TypeIndex ReturnType, CallingConvention CallConv, FunctionOptions Options, uint16_t ParameterCount, @@ -169,6 +170,7 @@ class ProcedureRecord : public TypeRecord { // LF_MFUNCTION class MemberFunctionRecord : public TypeRecord { public: + MemberFunctionRecord() = default; explicit MemberFunctionRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MemberFunctionRecord(TypeIndex ReturnType, TypeIndex ClassType, @@ -203,6 +205,7 @@ class MemberFunctionRecord : public TypeRecord { // LF_LABEL class LabelRecord : public TypeRecord { public: + LabelRecord() = default; explicit LabelRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} LabelRecord(LabelType Mode) : TypeRecord(TypeRecordKind::Label), Mode(Mode) {} @@ -213,6 +216,7 @@ class LabelRecord : public TypeRecord { // LF_MFUNC_ID class MemberFuncIdRecord : public TypeRecord { public: + MemberFuncIdRecord() = default; explicit MemberFuncIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MemberFuncIdRecord(TypeIndex ClassType, TypeIndex FunctionType, StringRef Name) @@ -230,6 +234,7 @@ class MemberFuncIdRecord : public TypeRecord { // LF_ARGLIST class ArgListRecord : public TypeRecord { public: + ArgListRecord() = default; explicit ArgListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ArgListRecord(TypeRecordKind Kind, ArrayRef Indices) @@ -243,6 +248,7 @@ class ArgListRecord : public TypeRecord { // LF_SUBSTR_LIST class StringListRecord : public TypeRecord { public: + StringListRecord() = default; explicit StringListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StringListRecord(TypeRecordKind Kind, ArrayRef Indices) @@ -267,6 +273,7 @@ class PointerRecord : public TypeRecord { static const uint32_t PointerSizeShift = 13; static const uint32_t PointerSizeMask = 0xFF; + PointerRecord() = default; explicit PointerRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} PointerRecord(TypeIndex ReferentType, uint32_t Attrs) @@ -341,6 +348,7 @@ class PointerRecord : public TypeRecord { // LF_NESTTYPE class NestedTypeRecord : public TypeRecord { public: + NestedTypeRecord() = default; explicit NestedTypeRecord(TypeRecordKind Kind) : 
TypeRecord(Kind) {} NestedTypeRecord(TypeIndex Type, StringRef Name) : TypeRecord(TypeRecordKind::NestedType), Type(Type), Name(Name) {} @@ -355,6 +363,7 @@ class NestedTypeRecord : public TypeRecord { // LF_FIELDLIST class FieldListRecord : public TypeRecord { public: + FieldListRecord() = default; explicit FieldListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} explicit FieldListRecord(ArrayRef Data) : TypeRecord(TypeRecordKind::FieldList), Data(Data) {} @@ -365,6 +374,7 @@ class FieldListRecord : public TypeRecord { // LF_ARRAY class ArrayRecord : public TypeRecord { public: + ArrayRecord() = default; explicit ArrayRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ArrayRecord(TypeIndex ElementType, TypeIndex IndexType, uint64_t Size, StringRef Name) @@ -384,6 +394,7 @@ class ArrayRecord : public TypeRecord { class TagRecord : public TypeRecord { protected: + TagRecord() = default; explicit TagRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} TagRecord(TypeRecordKind Kind, uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, StringRef Name, StringRef UniqueName) @@ -416,6 +427,7 @@ class TagRecord : public TypeRecord { // LF_CLASS, LF_STRUCTURE, LF_INTERFACE class ClassRecord : public TagRecord { public: + ClassRecord() = default; explicit ClassRecord(TypeRecordKind Kind) : TagRecord(Kind) {} ClassRecord(TypeRecordKind Kind, uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, TypeIndex DerivationList, @@ -447,6 +459,7 @@ class ClassRecord : public TagRecord { // LF_UNION struct UnionRecord : public TagRecord { + UnionRecord() = default; explicit UnionRecord(TypeRecordKind Kind) : TagRecord(Kind) {} UnionRecord(uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, uint64_t Size, StringRef Name, StringRef UniqueName) @@ -468,6 +481,7 @@ struct UnionRecord : public TagRecord { // LF_ENUM class EnumRecord : public TagRecord { public: + EnumRecord() = default; explicit EnumRecord(TypeRecordKind Kind) : TagRecord(Kind) {} EnumRecord(uint16_t MemberCount, ClassOptions Options, TypeIndex FieldList, StringRef Name, StringRef UniqueName, TypeIndex UnderlyingType) @@ -482,6 +496,7 @@ class EnumRecord : public TagRecord { // LF_BITFIELD class BitFieldRecord : public TypeRecord { public: + BitFieldRecord() = default; explicit BitFieldRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BitFieldRecord(TypeIndex Type, uint8_t BitSize, uint8_t BitOffset) : TypeRecord(TypeRecordKind::BitField), Type(Type), BitSize(BitSize), @@ -498,6 +513,7 @@ class BitFieldRecord : public TypeRecord { // LF_VTSHAPE class VFTableShapeRecord : public TypeRecord { public: + VFTableShapeRecord() = default; explicit VFTableShapeRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} explicit VFTableShapeRecord(ArrayRef Slots) : TypeRecord(TypeRecordKind::VFTableShape), SlotsRef(Slots) {} @@ -518,6 +534,7 @@ class VFTableShapeRecord : public TypeRecord { // LF_TYPESERVER2 class TypeServer2Record : public TypeRecord { public: + TypeServer2Record() = default; explicit TypeServer2Record(TypeRecordKind Kind) : TypeRecord(Kind) {} TypeServer2Record(StringRef Guid, uint32_t Age, StringRef Name) : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age), @@ -537,6 +554,7 @@ class TypeServer2Record : public TypeRecord { // LF_STRING_ID class StringIdRecord : public TypeRecord { public: + StringIdRecord() = default; explicit StringIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StringIdRecord(TypeIndex Id, StringRef String) : TypeRecord(TypeRecordKind::StringId), Id(Id), String(String) {} @@ 
-551,6 +569,7 @@ class StringIdRecord : public TypeRecord { // LF_FUNC_ID class FuncIdRecord : public TypeRecord { public: + FuncIdRecord() = default; explicit FuncIdRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} FuncIdRecord(TypeIndex ParentScope, TypeIndex FunctionType, StringRef Name) : TypeRecord(TypeRecordKind::FuncId), ParentScope(ParentScope), @@ -570,6 +589,7 @@ class FuncIdRecord : public TypeRecord { // LF_UDT_SRC_LINE class UdtSourceLineRecord : public TypeRecord { public: + UdtSourceLineRecord() = default; explicit UdtSourceLineRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} UdtSourceLineRecord(TypeIndex UDT, TypeIndex SourceFile, uint32_t LineNumber) : TypeRecord(TypeRecordKind::UdtSourceLine), UDT(UDT), @@ -587,6 +607,7 @@ class UdtSourceLineRecord : public TypeRecord { // LF_UDT_MOD_SRC_LINE class UdtModSourceLineRecord : public TypeRecord { public: + UdtModSourceLineRecord() = default; explicit UdtModSourceLineRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} UdtModSourceLineRecord(TypeIndex UDT, TypeIndex SourceFile, uint32_t LineNumber, uint16_t Module) @@ -607,6 +628,7 @@ class UdtModSourceLineRecord : public TypeRecord { // LF_BUILDINFO class BuildInfoRecord : public TypeRecord { public: + BuildInfoRecord() = default; explicit BuildInfoRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BuildInfoRecord(ArrayRef ArgIndices) : TypeRecord(TypeRecordKind::BuildInfo), @@ -619,6 +641,7 @@ class BuildInfoRecord : public TypeRecord { // LF_VFTABLE class VFTableRecord : public TypeRecord { public: + VFTableRecord() = default; explicit VFTableRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} VFTableRecord(TypeIndex CompleteClass, TypeIndex OverriddenVFTable, uint32_t VFPtrOffset, StringRef Name, @@ -646,7 +669,7 @@ class VFTableRecord : public TypeRecord { // LF_ONEMETHOD class OneMethodRecord : public TypeRecord { public: - OneMethodRecord() : TypeRecord(TypeRecordKind::OneMethod) {} + OneMethodRecord() = default; explicit OneMethodRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} OneMethodRecord(TypeIndex Type, MemberAttributes Attrs, int32_t VFTableOffset, StringRef Name) @@ -678,6 +701,7 @@ class OneMethodRecord : public TypeRecord { // LF_METHODLIST class MethodOverloadListRecord : public TypeRecord { public: + MethodOverloadListRecord() = default; explicit MethodOverloadListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} MethodOverloadListRecord(ArrayRef Methods) : TypeRecord(TypeRecordKind::MethodOverloadList), Methods(Methods) {} @@ -689,6 +713,7 @@ class MethodOverloadListRecord : public TypeRecord { /// For method overload sets. 
LF_METHOD class OverloadedMethodRecord : public TypeRecord { public: + OverloadedMethodRecord() = default; explicit OverloadedMethodRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} OverloadedMethodRecord(uint16_t NumOverloads, TypeIndex MethodList, StringRef Name) @@ -706,6 +731,7 @@ class OverloadedMethodRecord : public TypeRecord { // LF_MEMBER class DataMemberRecord : public TypeRecord { public: + DataMemberRecord() = default; explicit DataMemberRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} DataMemberRecord(MemberAttributes Attrs, TypeIndex Type, uint64_t Offset, StringRef Name) @@ -730,6 +756,7 @@ class DataMemberRecord : public TypeRecord { // LF_STMEMBER class StaticDataMemberRecord : public TypeRecord { public: + StaticDataMemberRecord() = default; explicit StaticDataMemberRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} StaticDataMemberRecord(MemberAttributes Attrs, TypeIndex Type, StringRef Name) : TypeRecord(TypeRecordKind::StaticDataMember), Attrs(Attrs), Type(Type), @@ -750,6 +777,7 @@ class StaticDataMemberRecord : public TypeRecord { // LF_ENUMERATE class EnumeratorRecord : public TypeRecord { public: + EnumeratorRecord() = default; explicit EnumeratorRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} EnumeratorRecord(MemberAttributes Attrs, APSInt Value, StringRef Name) : TypeRecord(TypeRecordKind::Enumerator), Attrs(Attrs), @@ -770,6 +798,7 @@ class EnumeratorRecord : public TypeRecord { // LF_VFUNCTAB class VFPtrRecord : public TypeRecord { public: + VFPtrRecord() = default; explicit VFPtrRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} VFPtrRecord(TypeIndex Type) : TypeRecord(TypeRecordKind::VFPtr), Type(Type) {} @@ -782,6 +811,7 @@ class VFPtrRecord : public TypeRecord { // LF_BCLASS, LF_BINTERFACE class BaseClassRecord : public TypeRecord { public: + BaseClassRecord() = default; explicit BaseClassRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} BaseClassRecord(MemberAttributes Attrs, TypeIndex Type, uint64_t Offset) : TypeRecord(TypeRecordKind::BaseClass), Attrs(Attrs), Type(Type), @@ -802,6 +832,7 @@ class BaseClassRecord : public TypeRecord { // LF_VBCLASS, LF_IVBCLASS class VirtualBaseClassRecord : public TypeRecord { public: + VirtualBaseClassRecord() = default; explicit VirtualBaseClassRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} VirtualBaseClassRecord(TypeRecordKind Kind, MemberAttributes Attrs, TypeIndex BaseType, TypeIndex VBPtrType, @@ -831,6 +862,7 @@ class VirtualBaseClassRecord : public TypeRecord { /// together. The first will end in an LF_INDEX record that points to the next. 
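The long run of '+ ... = default;' constructors in these hunks exists so each record can be default-constructed and then populated by TypeDeserializer::deserializeAs, which (per the TypeDeserializer hunk earlier) now also back-fills Record.Kind from the raw record. A hedged sketch of that round trip, assuming an LLVM build containing this patch and a CVType obtained elsewhere:

    #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
    #include "llvm/DebugInfo/CodeView/TypeRecord.h"
    #include "llvm/Support/Error.h"

    using namespace llvm;
    using namespace llvm::codeview;

    // Decode a CVType that is expected to be an LF_PROCEDURE record.
    Error decodeProcedure(CVType &CVT, ProcedureRecord &Proc) {
      // ProcedureRecord() = default is what makes this pattern possible:
      // the caller owns an empty record and deserializeAs fills it in,
      // including the Kind field.
      return TypeDeserializer::deserializeAs(CVT, Proc);
    }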
class ListContinuationRecord : public TypeRecord { public: + ListContinuationRecord() = default; explicit ListContinuationRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} ListContinuationRecord(TypeIndex ContinuationIndex) : TypeRecord(TypeRecordKind::ListContinuation), diff --git a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h index 924ca0470fad..6156223b2560 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h @@ -37,7 +37,7 @@ class TypeRecordMapping : public TypeVisitorCallbacks { Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: Optional TypeKind; diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h index 435c43f7edcb..1dee86a1da79 100644 --- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h @@ -106,7 +106,7 @@ class TypeSerializer : public TypeVisitorCallbacks { return visitKnownMemberImpl(CVR, Record); \ } #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h index 7bdc9ecb20cf..907ed1010e5b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h @@ -13,8 +13,8 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeSerializer.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" #include diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h index ed48df33249f..126fb8abb0da 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h @@ -94,7 +94,7 @@ class TypeVisitorCallbackPipeline : public TypeVisitorCallbacks { } #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template Error visitKnownRecordImpl(CVType &CVR, T &Record) { diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h index 0ea754deb425..d7a473306bc2 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h @@ -58,7 +58,11 @@ class TypeVisitorCallbacks { #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" +#undef TYPE_RECORD +#undef TYPE_RECORD_ALIAS +#undef MEMBER_RECORD +#undef MEMBER_RECORD_ALIAS }; } // end namespace codeview diff --git 
a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h index d51408122fc9..2ab1c9508522 100644 --- a/include/llvm/DebugInfo/DIContext.h +++ b/include/llvm/DebugInfo/DIContext.h @@ -146,6 +146,14 @@ enum DIDumpType { DIDT_TUIndex, }; +/// Container for dump options that control which debug information will be +/// dumped. +struct DIDumpOptions { + DIDumpType DumpType = DIDT_All; + bool DumpEH = false; + bool SummarizeTypes = false; +}; + class DIContext { public: enum DIContextKind { @@ -158,8 +166,7 @@ class DIContext { DIContextKind getKind() const { return Kind; } - virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) = 0; + virtual void dump(raw_ostream &OS, DIDumpOptions DumpOpts) = 0; virtual bool verify(raw_ostream &OS, DIDumpType DumpType = DIDT_All) { // No verifier? Just say things went well. diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index 7fa68f3f2314..519ecf618558 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -105,8 +105,7 @@ class DWARFContext : public DIContext { return DICtx->getKind() == CK_DWARF; } - void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) override; + void dump(raw_ostream &OS, DIDumpOptions DumpOpts) override; bool verify(raw_ostream &OS, DIDumpType DumpType = DIDT_All) override; diff --git a/include/llvm/DebugInfo/PDB/Native/RawConstants.h b/include/llvm/DebugInfo/PDB/Native/RawConstants.h index f5d4df8feb2e..e1bd86b2870b 100644 --- a/include/llvm/DebugInfo/PDB/Native/RawConstants.h +++ b/include/llvm/DebugInfo/PDB/Native/RawConstants.h @@ -12,7 +12,6 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/DebugInfo/CodeView/CodeView.h" - #include namespace llvm { diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h index dd2698c354a2..156abb59a6be 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h @@ -38,7 +38,7 @@ class TpiHashUpdater : public codeview::TypeVisitorCallbacks { #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) #define MEMBER_RECORD(EnumName, EnumVal, Name) #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" private: template diff --git a/include/llvm/DebugInfo/PDB/PDBContext.h b/include/llvm/DebugInfo/PDB/PDBContext.h index 84ab8ed173cb..0ce49f5ef922 100644 --- a/include/llvm/DebugInfo/PDB/PDBContext.h +++ b/include/llvm/DebugInfo/PDB/PDBContext.h @@ -41,8 +41,7 @@ namespace pdb { return DICtx->getKind() == CK_PDB; } - void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All, - bool DumpEH = false, bool SummarizeTypes = false) override; + void dump(raw_ostream &OS, DIDumpOptions DIDumpOpts) override; DILineInfo getLineInfoForAddress( uint64_t Address, diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index ace309ed95a4..687863857698 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -228,34 +228,31 @@ class AttributeSet { bool operator==(const AttributeSet &O) { return SetNode == O.SetNode; } bool operator!=(const AttributeSet &O) { return !(*this == O); } - /// Add an argument attribute. Because - /// attribute sets are immutable, this returns a new set. 
- AttributeSet addAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const; + /// Add an argument attribute. Returns a new set because attribute sets are + /// immutable. + AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; - /// Add a target-dependent attribute. Because - /// attribute sets are immutable, this returns a new set. + /// Add a target-dependent attribute. Returns a new set because attribute sets + /// are immutable. AttributeSet addAttribute(LLVMContext &C, StringRef Kind, StringRef Value = StringRef()) const; - /// Add attributes to the attribute set. Because - /// attribute sets are immutable, this returns a new set. + /// Add attributes to the attribute set. Returns a new set because attribute + /// sets are immutable. AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const; - /// Remove the specified attribute from this set. Because - /// attribute sets are immutable, this returns a new set. - AttributeSet removeAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const; + /// Remove the specified attribute from this set. Returns a new set because + /// attribute sets are immutable. + AttributeSet removeAttribute(LLVMContext &C, Attribute::AttrKind Kind) const; - /// Remove the specified attribute from this set. Because - /// attribute sets are immutable, this returns a new set. - AttributeSet removeAttribute(LLVMContext &C, - StringRef Kind) const; + /// Remove the specified attribute from this set. Returns a new set because + /// attribute sets are immutable. + AttributeSet removeAttribute(LLVMContext &C, StringRef Kind) const; - /// Remove the specified attributes from this set. Because - /// attribute sets are immutable, this returns a new set. + /// Remove the specified attributes from this set. Returns a new set because + /// attribute sets are immutable. AttributeSet removeAttributes(LLVMContext &C, - const AttrBuilder &AttrsToRemove) const; + const AttrBuilder &AttrsToRemove) const; /// Return the number of attributes in this set. unsigned getNumAttributes() const; @@ -377,6 +374,25 @@ class AttributeList { static AttributeList get(LLVMContext &C, unsigned Index, const AttrBuilder &B); + /// \brief Add an attribute to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAttribute(LLVMContext &C, unsigned Index, + Attribute::AttrKind Kind) const; + + /// \brief Add an attribute to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, + StringRef Value = StringRef()) const; + + /// Add an attribute to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAttribute(LLVMContext &C, unsigned Index, Attribute A) const; + + /// \brief Add attributes to the attribute set at the given index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAttributes(LLVMContext &C, unsigned Index, + const AttrBuilder &B) const; + /// Add an argument attribute to the list. Returns a new list because /// attribute lists are immutable. AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, @@ -384,66 +400,112 @@ class AttributeList { return addAttribute(C, ArgNo + FirstArgIndex, Kind); } - /// \brief Add an attribute to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. 
- AttributeList addAttribute(LLVMContext &C, unsigned Index, - Attribute::AttrKind Kind) const; + /// Add an argument attribute to the list. Returns a new list because + /// attribute lists are immutable. + AttributeList addParamAttribute(LLVMContext &C, unsigned ArgNo, + StringRef Kind, + StringRef Value = StringRef()) const { + return addAttribute(C, ArgNo + FirstArgIndex, Kind, Value); + } - /// \brief Add an attribute to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. - AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind, - StringRef Value = StringRef()) const; + /// Add an attribute to the attribute list at the given arg indices. Returns a + /// new list because attribute lists are immutable. + AttributeList addParamAttribute(LLVMContext &C, ArrayRef ArgNos, + Attribute A) const; - /// Add an attribute to the attribute set at the given indices. Because - /// attribute sets are immutable, this returns a new set. - AttributeList addAttribute(LLVMContext &C, ArrayRef Indices, - Attribute A) const; - - /// \brief Add attributes to the attribute set at the given index. Because - /// attribute sets are immutable, this returns a new set. - AttributeList addAttributes(LLVMContext &C, unsigned Index, - const AttrBuilder &B) const; + /// Add an argument attribute to the list. Returns a new list because + /// attribute lists are immutable. + AttributeList addParamAttributes(LLVMContext &C, unsigned ArgNo, + const AttrBuilder &B) const { + return addAttributes(C, ArgNo + FirstArgIndex, B); + } /// \brief Remove the specified attribute at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const; /// \brief Remove the specified attribute at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttribute(LLVMContext &C, unsigned Index, StringRef Kind) const; /// \brief Remove the specified attributes at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttributes(LLVMContext &C, unsigned Index, const AttrBuilder &AttrsToRemove) const; /// \brief Remove all attributes at the specified index from this - /// attribute list. Because attribute lists are immutable, this returns the - /// new list. + /// attribute list. Returns a new list because attribute lists are immutable. AttributeList removeAttributes(LLVMContext &C, unsigned Index) const; - /// \brief Add the dereferenceable attribute to the attribute set at the given - /// index. Because attribute sets are immutable, this returns a new set. + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, + Attribute::AttrKind Kind) const { + return removeAttribute(C, ArgNo + FirstArgIndex, Kind); + } + + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. 
Returns a new list because attribute lists are immutable. + AttributeList removeParamAttribute(LLVMContext &C, unsigned ArgNo, + StringRef Kind) const { + return removeAttribute(C, ArgNo + FirstArgIndex, Kind); + } + + /// \brief Remove the specified attribute at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo, + const AttrBuilder &AttrsToRemove) const { + return removeAttributes(C, ArgNo + FirstArgIndex, AttrsToRemove); + } + + /// \brief Remove all attributes at the specified arg index from this + /// attribute list. Returns a new list because attribute lists are immutable. + AttributeList removeParamAttributes(LLVMContext &C, unsigned ArgNo) const { + return removeAttributes(C, ArgNo + FirstArgIndex); + } + + /// \brief Add the dereferenceable attribute to the attribute set at the given + /// index. Returns a new list because attribute lists are immutable. AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const; + /// \brief Add the dereferenceable attribute to the attribute set at the given + /// arg index. Returns a new list because attribute lists are immutable. + AttributeList addDereferenceableParamAttr(LLVMContext &C, unsigned ArgNo, + uint64_t Bytes) const { + return addDereferenceableAttr(C, ArgNo + FirstArgIndex, Bytes); + } + /// \brief Add the dereferenceable_or_null attribute to the attribute set at - /// the given index. Because attribute sets are immutable, this returns a new - /// set. + /// the given index. Returns a new list because attribute lists are immutable. AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index, uint64_t Bytes) const; + /// \brief Add the dereferenceable_or_null attribute to the attribute set at + /// the given arg index. Returns a new list because attribute lists are + /// immutable. + AttributeList addDereferenceableOrNullParamAttr(LLVMContext &C, + unsigned ArgNo, + uint64_t Bytes) const { + return addDereferenceableOrNullAttr(C, ArgNo + FirstArgIndex, Bytes); + } + /// Add the allocsize attribute to the attribute set at the given index. - /// Because attribute sets are immutable, this returns a new set. + /// Returns a new list because attribute lists are immutable. AttributeList addAllocSizeAttr(LLVMContext &C, unsigned Index, unsigned ElemSizeArg, const Optional<unsigned> &NumElemsArg); + /// Add the allocsize attribute to the attribute set at the given arg index. + /// Returns a new list because attribute lists are immutable. + AttributeList addAllocSizeParamAttr(LLVMContext &C, unsigned ArgNo, + unsigned ElemSizeArg, + const Optional<unsigned> &NumElemsArg) { + return addAllocSizeAttr(C, ArgNo + FirstArgIndex, ElemSizeArg, NumElemsArg); + } + //===--------------------------------------------------------------------===// // AttributeList Accessors //===--------------------------------------------------------------------===// @@ -473,6 +535,21 @@ class AttributeList { /// \brief Return true if attribute exists at the given index.
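Every add/remove helper in this hunk follows the persistent-value style the comments describe: nothing is mutated in place, so callers rebind the returned list. A hedged usage sketch built from APIs visible in this patch (markFirstArgNonNull is a hypothetical helper; assumes an LLVM build containing this patch):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    void markFirstArgNonNull(Function &F) {
      LLVMContext &C = F.getContext();
      AttributeList AL = F.getAttributes();
      // hasParamAttr/addParamAttribute do the ArgNo + FirstArgIndex
      // translation internally; callers use 0-based argument numbers.
      if (!AL.hasParamAttr(0, Attribute::NonNull))
        AL = AL.addParamAttribute(C, 0, Attribute::NonNull); // new list
      F.setAttributes(AL); // rebind; the original list is unchanged
    }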
bool hasAttributes(unsigned Index) const; + /// \brief Return true if the attribute exists for the given argument. + bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + return hasAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return true if the attribute exists for the given argument. + bool hasParamAttr(unsigned ArgNo, StringRef Kind) const { + return hasAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return true if attributes exist for the given argument. + bool hasParamAttrs(unsigned ArgNo) const { + return hasAttributes(ArgNo + FirstArgIndex); + } + /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but /// may be faster. bool hasFnAttribute(Attribute::AttrKind Kind) const; @@ -496,6 +573,16 @@ class AttributeList { /// \brief Return the attribute object that exists at the given index. Attribute getAttribute(unsigned Index, StringRef Kind) const; + /// \brief Return the attribute object that exists at the arg index. + Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + return getAttribute(ArgNo + FirstArgIndex, Kind); + } + + /// \brief Return the attribute object that exists at the given index. + Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { + return getAttribute(ArgNo + FirstArgIndex, Kind); + } + /// \brief Return the alignment of the return value. unsigned getRetAlignment() const; @@ -508,10 +595,22 @@ class AttributeList { /// \brief Get the number of dereferenceable bytes (or zero if unknown). uint64_t getDereferenceableBytes(unsigned Index) const; + /// \brief Get the number of dereferenceable bytes (or zero if unknown) of an + /// arg. + uint64_t getParamDereferenceableBytes(unsigned ArgNo) const { + return getDereferenceableBytes(ArgNo + FirstArgIndex); + } + /// \brief Get the number of dereferenceable_or_null bytes (or zero if /// unknown). uint64_t getDereferenceableOrNullBytes(unsigned Index) const; + /// \brief Get the number of dereferenceable_or_null bytes (or zero if + /// unknown) of an arg. + uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const { + return getDereferenceableOrNullBytes(ArgNo + FirstArgIndex); + } + /// Get the allocsize argument numbers (or pair(0, 0) if unknown). std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs(unsigned Index) const; diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index f27e5c50a47f..29f512ddd076 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -214,10 +214,6 @@ class Function : public GlobalObject, public ilist_node<Function> { addAttribute(AttributeList::FunctionIndex, Attr); } - void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); - } - /// @brief Remove function attributes from this function. void removeFnAttr(Attribute::AttrKind Kind) { removeAttribute(AttributeList::FunctionIndex, Kind); @@ -229,10 +225,6 @@ class Function : public GlobalObject, public ilist_node<Function> { getContext(), AttributeList::FunctionIndex, Kind)); } - void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); - } - /// \brief Set the entry count for this function. /// /// Entry count is the number of times this function was executed based on @@ -299,6 +291,15 @@ class Function : public GlobalObject, public ilist_node<Function> { /// @brief adds the attributes to the list of attributes.
void addAttributes(unsigned i, const AttrBuilder &Attrs); + /// @brief adds the attribute to the list of attributes for the given arg. + void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + + /// @brief adds the attribute to the list of attributes for the given arg. + void addParamAttr(unsigned ArgNo, Attribute Attr); + + /// @brief adds the attributes to the list of attributes for the given arg. + void addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs); + /// @brief removes the attribute from the list of attributes. void removeAttribute(unsigned i, Attribute::AttrKind Kind); @@ -308,6 +309,15 @@ class Function : public GlobalObject, public ilist_node { /// @brief removes the attributes from the list of attributes. void removeAttributes(unsigned i, const AttrBuilder &Attrs); + /// @brief removes the attribute from the list of attributes. + void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + + /// @brief removes the attribute from the list of attributes. + void removeParamAttr(unsigned ArgNo, StringRef Kind); + + /// @brief removes the attribute from the list of attributes. + void removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs); + /// @brief check if an attributes is in the list of attributes. bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const { return getAttributes().hasAttribute(i, Kind); @@ -329,10 +339,18 @@ class Function : public GlobalObject, public ilist_node { /// @brief adds the dereferenceable attribute to the list of attributes. void addDereferenceableAttr(unsigned i, uint64_t Bytes); + /// @brief adds the dereferenceable attribute to the list of attributes for + /// the given arg. + void addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes); + /// @brief adds the dereferenceable_or_null attribute to the list of /// attributes. void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes); + /// @brief adds the dereferenceable_or_null attribute to the list of + /// attributes for the given arg. + void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes); + /// @brief Extract the alignment for a call or parameter (0=unknown). unsigned getParamAlignment(unsigned ArgNo) const { return AttributeSets.getParamAlignment(ArgNo); @@ -345,6 +363,12 @@ class Function : public GlobalObject, public ilist_node { return AttributeSets.getDereferenceableBytes(i); } + /// @brief Extract the number of dereferenceable bytes for a parameter. + /// @param ArgNo Index of an argument, with 0 being the first function arg. + uint64_t getParamDereferenceableBytes(unsigned ArgNo) const { + return AttributeSets.getParamDereferenceableBytes(ArgNo); + } + /// @brief Extract the number of dereferenceable_or_null bytes for a call or /// parameter (0=unknown). /// @param i AttributeList index, referring to a return value or argument. @@ -352,6 +376,13 @@ class Function : public GlobalObject, public ilist_node { return AttributeSets.getDereferenceableOrNullBytes(i); } + /// @brief Extract the number of dereferenceable_or_null bytes for a + /// parameter. + /// @param ArgNo AttributeList ArgNo, referring to an argument. + uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const { + return AttributeSets.getParamDereferenceableOrNullBytes(ArgNo); + } + /// @brief Determine if the function does not access memory. 
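The Function-level wrappers above let passes speak in plain 0-based argument numbers rather than raw AttributeList indices. A short hedged sketch using only helpers declared in this hunk (annotateArgs is a hypothetical function; assumes an LLVM build containing this patch):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    #include <cstdint>

    using namespace llvm;

    void annotateArgs(Function &F) {
      F.addParamAttr(0, Attribute::NoAlias);        // argument 0, no index math
      F.addDereferenceableParamAttr(1, 8);          // argument 1: >= 8 bytes
      uint64_t Bytes = F.getParamDereferenceableBytes(1); // reads back 8
      (void)Bytes;
    }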
bool doesNotAccessMemory() const { return hasFnAttribute(Attribute::ReadNone); diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 6fab59613dd6..1f7990b99ebe 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1660,6 +1660,9 @@ class CallInst : public Instruction, /// Adds the attribute to the indicated argument void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + /// Adds the attribute to the indicated argument + void addParamAttr(unsigned ArgNo, Attribute Attr); + /// removes the attribute from the list of attributes. void removeAttribute(unsigned i, Attribute::AttrKind Kind); @@ -1669,6 +1672,9 @@ class CallInst : public Instruction, /// Removes the attribute from the given argument void removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind); + /// Removes the attribute from the given argument + void removeParamAttr(unsigned ArgNo, StringRef Kind); + /// adds the dereferenceable attribute to the list of attributes. void addDereferenceableAttr(unsigned i, uint64_t Bytes); @@ -1704,6 +1710,18 @@ class CallInst : public Instruction, return getAttributes().getAttribute(i, Kind); } + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + + /// Get the attribute of a given kind from a given arg + Attribute getParamAttr(unsigned ArgNo, StringRef Kind) const { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + return getAttributes().getParamAttr(ArgNo, Kind); + } + /// Return true if the data operand at index \p i has the attribute \p /// A. /// diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 3ca21c15577b..5b9796d4fba6 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -188,6 +188,7 @@ void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); +void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); diff --git a/include/llvm/Object/WindowsResource.h b/include/llvm/Object/WindowsResource.h index f94ad09ce0c6..2484f551aee0 100644 --- a/include/llvm/Object/WindowsResource.h +++ b/include/llvm/Object/WindowsResource.h @@ -30,11 +30,18 @@ #define LLVM_INCLUDE_LLVM_OBJECT_RESFILE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/Error.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ScopedPrinter.h" + +#include namespace llvm { namespace object { @@ -44,23 +51,44 @@ class WindowsResource; class ResourceEntryRef { public: Error moveNext(bool &End); + bool checkTypeString() const { return IsStringType; } + ArrayRef getTypeString() const { return Type; } + uint16_t getTypeID() const { return TypeID; } + bool checkNameString() const { return IsStringName; } + ArrayRef getNameString() const { return Name; } + uint16_t getNameID() const { return NameID; } + uint16_t getLanguage() const { return Suffix->Language; } private: 
friend class WindowsResource; ResourceEntryRef(BinaryStreamRef Ref, const WindowsResource *Owner, Error &Err); + Error loadNext(); + struct HeaderSuffix { + support::ulittle32_t DataVersion; + support::ulittle16_t MemoryFlags; + support::ulittle16_t Language; + support::ulittle32_t Version; + support::ulittle32_t Characteristics; + }; + BinaryStreamReader Reader; - BinaryStreamRef HeaderBytes; - BinaryStreamRef DataBytes; + bool IsStringType; + ArrayRef Type; + uint16_t TypeID; + bool IsStringName; + ArrayRef Name; + uint16_t NameID; + const HeaderSuffix *Suffix = nullptr; + ArrayRef Data; const WindowsResource *OwningRes = nullptr; }; class WindowsResource : public Binary { public: - ~WindowsResource() override; Expected getHeadEntry(); static bool classof(const Binary *V) { return V->isWinRes(); } @@ -76,6 +104,36 @@ class WindowsResource : public Binary { BinaryByteStream BBS; }; +class WindowsResourceParser { +public: + WindowsResourceParser(); + + Error parse(WindowsResource *WR); + + void printTree() const; + +private: + class TreeNode { + public: + TreeNode() = default; + explicit TreeNode(ArrayRef Ref); + void addEntry(const ResourceEntryRef &Entry); + void print(ScopedPrinter &Writer, StringRef Name) const; + + private: + TreeNode &addTypeNode(const ResourceEntryRef &Entry); + TreeNode &addNameNode(const ResourceEntryRef &Entry); + TreeNode &addLanguageNode(const ResourceEntryRef &Entry); + TreeNode &addChild(uint32_t ID); + TreeNode &addChild(ArrayRef NameRef); + std::vector Name; + std::map> IDChildren; + std::map> StringChildren; + }; + + TreeNode Root; +}; + } // namespace object } // namespace llvm diff --git a/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h new file mode 100644 index 000000000000..6ddae2e2b41c --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h @@ -0,0 +1,91 @@ +//===- CodeViewYAMLDebugSections.h - CodeView YAMLIO debug sections -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLDEBUGSECTIONS_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLDEBUGSECTIONS_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/DebugSubsection.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct C13FragmentBase; +} + +struct SourceLineEntry { + uint32_t Offset; + uint32_t LineStart; + uint32_t EndDelta; + bool IsStatement; +}; + +struct SourceColumnEntry { + uint16_t StartColumn; + uint16_t EndColumn; +}; + +struct SourceLineBlock { + StringRef FileName; + std::vector Lines; + std::vector Columns; +}; + +struct HexFormattedString { + std::vector Bytes; +}; + +struct SourceFileChecksumEntry { + StringRef FileName; + codeview::FileChecksumKind Kind; + HexFormattedString ChecksumBytes; +}; + +struct SourceLineInfo { + uint32_t RelocOffset; + uint32_t RelocSegment; + codeview::LineFlags Flags; + uint32_t CodeSize; + + std::vector Blocks; +}; + +struct InlineeSite { + uint32_t Inlinee; + StringRef FileName; + uint32_t SourceLineNum; + std::vector ExtraFiles; +}; + +struct InlineeInfo { + bool HasExtraFiles; + std::vector Sites; +}; + +struct SourceFileInfo { + std::vector FileChecksums; + std::vector LineFragments; + std::vector Inlinees; +}; + +struct C13DebugSection { + std::vector Fragments; +}; +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileInfo) + +#endif diff --git a/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h new file mode 100644 index 000000000000..ee4e2ac9d404 --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h @@ -0,0 +1,41 @@ +//===- CodeViewYAMLSymbols.h - CodeView YAMLIO Symbol implementation ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLSYMBOLS_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLSYMBOLS_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct SymbolRecordBase; +} + +struct SymbolRecord { + std::shared_ptr Symbol; + + codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator) const; + static Expected fromCodeViewSymbol(codeview::CVSymbol Symbol); +}; + +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SymbolRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::SymbolRecord) + +#endif diff --git a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h new file mode 100644 index 000000000000..a57ada34a4fa --- /dev/null +++ b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -0,0 +1,48 @@ +//===- CodeViewYAMLTypes.h - CodeView YAMLIO Type Record implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECTYAML_CODEVIEWYAMLTYPES_H +#define LLVM_OBJECTYAML_CODEVIEWYAMLTYPES_H + +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/ObjectYAML/YAML.h" + +namespace llvm { +namespace CodeViewYAML { +namespace detail { +struct LeafRecordBase; +struct MemberRecordBase; +} + +struct MemberRecord { + std::shared_ptr Member; +}; + +struct LeafRecord { + std::shared_ptr Leaf; + + codeview::CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const; + static Expected fromCodeViewRecord(codeview::CVType Type); +}; +} // namespace CodeViewYAML +} // namespace llvm + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::LeafRecord) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::MemberRecord) + +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::LeafRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(CodeViewYAML::MemberRecord) + +#endif diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index efa36d957fbd..12b05e4ff0c5 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -192,6 +192,39 @@ class PassBuilder { buildFunctionSimplificationPipeline(OptimizationLevel Level, bool DebugLogging = false); + /// Construct the core LLVM module canonicalization and simplification + /// pipeline. + /// + /// This pipeline focuses on canonicalizing and simplifying the entire module + /// of IR. Much like the function simplification pipeline above, it is + /// suitable to run repeatedly over the IR and is not expected to destroy + /// important information. It does, however, perform inlining and other + /// heuristic based simplifications that are not strictly reversible. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager + buildModuleSimplificationPipeline(OptimizationLevel Level, + bool DebugLogging = false); + + /// Construct the core LLVM module optimization pipeline. + /// + /// This pipeline focuses on optimizing the execution speed of the IR. It + /// uses cost modeling and thresholds to balance code growth against runtime + /// improvements. It includes vectorization and other information destroying + /// transformations. It also cannot generally be run repeatedly on a module + /// without potentially seriously regressing either runtime performance of + /// the code or serious code size growth. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level, + bool DebugLogging = false); + /// Build a per-module default optimization pipeline. /// /// This provides a good default optimization pipeline for per-module @@ -206,6 +239,36 @@ class PassBuilder { ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, bool DebugLogging = false); + /// Build a pre-link, ThinLTO-targeting default optimization pipeline to + /// a pass manager. 
+ /// + /// This adds the pre-link optimizations tuned to prepare a module for + /// a ThinLTO run. It works to minimize the IR which needs to be analyzed + /// without making irreversible decisions which could be made better during + /// the LTO run. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager + buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level, + bool DebugLogging = false); + + /// Build an ThinLTO default optimization pipeline to a pass manager. + /// + /// This provides a good default optimization pipeline for link-time + /// optimization and code generation. It is particularly tuned to fit well + /// when IR coming into the LTO phase was first run through \c + /// addPreLinkLTODefaultPipeline, and the two coordinate closely. + /// + /// Note that \p Level cannot be `O0` here. The pipelines produced are + /// only intended for use when attempting to optimize code. If frontends + /// require some transformations for semantic reasons, they should explicitly + /// build them. + ModulePassManager buildThinLTODefaultPipeline(OptimizationLevel Level, + bool DebugLogging = false); + /// Build a pre-link, LTO-targeting default optimization pipeline to a pass /// manager. /// diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def index 32dc57a0fedf..65cb2715a6a5 100644 --- a/include/llvm/Support/ARMTargetParser.def +++ b/include/llvm/Support/ARMTargetParser.def @@ -206,7 +206,7 @@ ARM_CPU_NAME("cortex-a5", AK_ARMV7A, FK_NEON_VFPV4, false, ARM_CPU_NAME("cortex-a7", AK_ARMV7A, FK_NEON_VFPV4, false, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB)) -ARM_CPU_NAME("cortex-a8", AK_ARMV7A, FK_NEON, true, ARM::AEK_SEC) +ARM_CPU_NAME("cortex-a8", AK_ARMV7A, FK_NEON, false, ARM::AEK_SEC) ARM_CPU_NAME("cortex-a9", AK_ARMV7A, FK_NEON_FP16, false, (ARM::AEK_SEC | ARM::AEK_MP)) ARM_CPU_NAME("cortex-a12", AK_ARMV7A, FK_NEON_VFPV4, false, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | @@ -236,7 +236,7 @@ ARM_CPU_NAME("cortex-m23", AK_ARMV8MBaseline, FK_NONE, false, ARM::AEK_NONE) ARM_CPU_NAME("cortex-m33", AK_ARMV8MMainline, FK_FPV5_SP_D16, false, ARM::AEK_DSP) ARM_CPU_NAME("cortex-a32", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) -ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, ARM::AEK_CRC) +ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a73", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h index 56375f41d2c0..29e8a2ab08aa 100644 --- a/include/llvm/Support/BinaryStreamReader.h +++ b/include/llvm/Support/BinaryStreamReader.h @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamRef.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" #include "llvm/Support/type_traits.h" @@ -104,6 +105,13 @@ class 
BinaryStreamReader { /// returns an appropriate error code. Error readCString(StringRef &Dest); + /// Similar to readCString, however read a null-terminated UTF16 string + /// instead. + /// + /// \returns a success error code if the data was successfully read, otherwise + /// returns an appropriate error code. + Error readWideString(ArrayRef &Dest); + /// Read a \p Length byte string into \p Dest. Whether a copy occurs depends /// on the implementation of the underlying stream. Updates the stream's /// offset to point after the newly read data. diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index ffea679fab82..8949d69ce724 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -1606,6 +1606,44 @@ template struct StdMapStringCustomMappingTraitsImpl { } \ } +#define LLVM_YAML_DECLARE_MAPPING_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct MappingTraits { \ + static void mapping(IO &IO, Type &Obj); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_ENUM_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarEnumerationTraits { \ + static void enumeration(IO &io, Type &Value); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_BITSET_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarBitSetTraits { \ + static void bitset(IO &IO, Type &Options); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_SCALAR_TRAITS(Type, MustQuote) \ + namespace llvm { \ + namespace yaml { \ + template <> struct ScalarTraits { \ + static void output(const Type &Value, void *ctx, llvm::raw_ostream &Out); \ + static StringRef input(StringRef Scalar, void *ctxt, Type &Value); \ + static bool mustQuote(StringRef) { return MustQuote; } \ + }; \ + } \ + } + /// Utility for declaring that a std::vector of a particular type /// should be considered a YAML document list. #define LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(_type) \ diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index 437f68b24e57..3c181f0e511b 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -1237,7 +1237,6 @@ class RecordVal { public: RecordVal(Init *N, RecTy *T, bool P); - RecordVal(StringRef N, RecTy *T, bool P); StringRef getName() const; Init *getNameInit() const { return Name; } @@ -1340,7 +1339,6 @@ class Record { } void setName(Init *Name); // Also updates RecordKeeper. - void setName(StringRef Name); // Also updates RecordKeeper. ArrayRef getLoc() const { return Locs; } @@ -1378,13 +1376,11 @@ class Record { } RecordVal *getValue(const Init *Name) { - for (RecordVal &Val : Values) - if (Val.Name == Name) return &Val; - return nullptr; + return const_cast(static_cast(this)->getValue(Name)); } RecordVal *getValue(StringRef Name) { - return getValue(StringInit::get(Name)); + return const_cast(static_cast(this)->getValue(Name)); } void addTemplateArg(Init *Name) { @@ -1492,7 +1488,7 @@ class Record { /// its value as a string, throwing an exception if the field does not exist /// or if the value is not a string. /// - std::string getValueAsString(StringRef FieldName) const; + StringRef getValueAsString(StringRef FieldName) const; /// This method looks up the specified field and returns /// its value as a BitsInit, throwing an exception if the field does not exist @@ -1522,7 +1518,7 @@ class Record { /// returns its value as a vector of strings, throwing an exception if the /// field does not exist or if the value is not the right type. 
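  /// For illustration (editorial note, not part of this patch), a caller can
  /// iterate the result directly once it holds StringRefs, e.g.:
  ///   for (StringRef S : Rec->getValueAsListOfStrings("Predicates"))
  ///     OS << S << "\n";
  /// where "Predicates" is a hypothetical list<string> field and OS is a
  /// raw_ostream.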
  ///
-  std::vector<std::string> getValueAsListOfStrings(StringRef FieldName) const;
+  std::vector<StringRef> getValueAsListOfStrings(StringRef FieldName) const;

   /// This method looks up the specified field and returns its
   /// value as a Record, throwing an exception if the field does not exist or if
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 17182b958ecb..7258a5cc2d89 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1143,6 +1143,16 @@ class TargetLoweringBase {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }

+  /// Get maximum # of load operations permitted for memcmp
+  ///
+  /// This function returns the maximum number of load operations permitted
+  /// to replace a call to memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have OptSize attribute.
+  unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+  }
+
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
   /// This function returns the maximum number of store operations permitted
@@ -2330,6 +2340,8 @@ class TargetLoweringBase {
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadsPerMemcmp;
+  unsigned MaxLoadsPerMemcmpOptSize;

   /// \brief Specify maximum bytes of store instructions per memmove call.
   ///
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index 73ae2ad12988..ed390799cfc3 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -299,6 +299,12 @@ class LLVMTargetMachine : public TargetMachine {
   bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
                          raw_pwrite_stream &OS,
                          bool DisableVerify = true) override;
+
+  /// Returns true if the target is expected to pass all machine verifier
+  /// checks. This is a stopgap measure to fix targets one by one. We will
+  /// remove this at some point and always enable the verifier when
+  /// EXPENSIVE_CHECKS is enabled.
+  virtual bool isMachineVerifierClean() const { return true; }
 };

 } // end namespace llvm
diff --git a/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h b/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h
new file mode 100644
index 000000000000..bf04bbfe92d8
--- /dev/null
+++ b/include/llvm/Transforms/IPO/ThinLTOBitcodeWriter.h
@@ -0,0 +1,41 @@
+//===- ThinLTOBitcodeWriter.h - Bitcode writing pass for ThinLTO ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass prepares a module containing type metadata for ThinLTO by splitting
+// it into regular and thin LTO parts if possible, and writing both parts to
+// a multi-module bitcode file. Modules that do not contain type metadata are
+// written unmodified as a single module.
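+//
+// Editorial sketch (an assumption for illustration, not part of this patch):
+// with the new pass manager, a client holding a raw_ostream `OS` might
+// schedule this pass as follows, writing no separate thin link file:
+//
+//   ModulePassManager MPM;
+//   MPM.addPass(ThinLTOBitcodeWriterPass(OS, /*ThinLinkOS=*/nullptr));
+//   MPM.run(M, MAM);  // M: Module, MAM: ModuleAnalysisManager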
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_THINLTOBITCODEWRITER_H +#define LLVM_TRANSFORMS_IPO_THINLTOBITCODEWRITER_H + +#include +#include + +namespace llvm { + +class ThinLTOBitcodeWriterPass + : public PassInfoMixin { + raw_ostream &OS; + raw_ostream *ThinLinkOS; + +public: + // Writes bitcode to OS. Also write thin link file to ThinLinkOS, if + // it's not nullptr. + ThinLTOBitcodeWriterPass(raw_ostream &OS, raw_ostream *ThinLinkOS) + : OS(OS), ThinLinkOS(ThinLinkOS) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +} // namespace llvm + +#endif diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index 3f97789cabbc..589aaaca02fe 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -68,24 +68,6 @@ class GVN : public PassInfoMixin { class ValueTable { DenseMap valueNumbering; DenseMap expressionNumbering; - - // Expressions is the vector of Expression. ExprIdx is the mapping from - // value number to the index of Expression in Expressions. We use it - // instead of a DenseMap because filling such mapping is faster than - // filling a DenseMap and the compile time is a little better. - uint32_t nextExprNumber; - std::vector Expressions; - std::vector ExprIdx; - // Value number to PHINode mapping. Used for phi-translate in scalarpre. - DenseMap NumberingPhi; - // Cache for phi-translate in scalarpre. - typedef DenseMap, uint32_t> - PhiTranslateMap; - PhiTranslateMap PhiTranslateTable; - // Map the block to reversed postorder traversal number. It is used to - // find back edge easily. - DenseMap BlockRPONumber; - AliasAnalysis *AA; MemoryDependenceResults *MD; DominatorTree *DT; @@ -97,10 +79,6 @@ class GVN : public PassInfoMixin { Value *LHS, Value *RHS); Expression createExtractvalueExpr(ExtractValueInst *EI); uint32_t lookupOrAddCall(CallInst *C); - uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn); - std::pair assignExpNewValueNum(Expression &exp); - bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn); public: ValueTable(); @@ -109,12 +87,9 @@ class GVN : public PassInfoMixin { ~ValueTable(); uint32_t lookupOrAdd(Value *V); - uint32_t lookup(Value *V, bool Verify = true) const; + uint32_t lookup(Value *V) const; uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); - uint32_t phiTranslate(const BasicBlock *BB, const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn); - void assignBlockRPONumber(Function &F); bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h index a602498e5f22..7e23544af1ab 100644 --- a/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/include/llvm/Transforms/Utils/CodeExtractor.h @@ -25,6 +25,7 @@ template class ArrayRef; class BranchProbabilityInfo; class DominatorTree; class Function; + class Instruction; class Loop; class Module; class RegionNode; @@ -103,7 +104,17 @@ template class ArrayRef; /// a code sequence, that sequence is modified, including changing these /// sets, before extraction occurs. These modifications won't have any /// significant impact on the cost however. 
-    void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs) const;
+    void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
+                           const ValueSet &Allocas) const;
+
+    /// Find the set of allocas whose life ranges are contained within the
+    /// outlined region.
+    ///
+    /// Allocas which have lifetime markers contained in the outlined region
+    /// should be pushed to the outlined function. The address bitcasts that
+    /// are used by the lifetime markers are also candidates for shrink-
+    /// wrapping. The instructions that need to be sunk are collected in
+    /// 'Allocas'.
+    void findAllocas(ValueSet &Allocas) const;

  private:
    void severSplitPHINodes(BasicBlock *&Header);
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 5e15e8d49802..e0780885d159 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -95,8 +95,8 @@ module LLVM_DebugInfo_CodeView {
   module * { export * }

   // These are intended for (repeated) textual inclusion.
-  textual header "DebugInfo/CodeView/TypeRecords.def"
-  textual header "DebugInfo/CodeView/CVSymbolTypes.def"
+  textual header "DebugInfo/CodeView/CodeViewTypes.def"
+  textual header "DebugInfo/CodeView/CodeViewSymbols.def"
 }

 module LLVM_ExecutionEngine {
diff --git a/lib/Analysis/CFLGraph.h b/lib/Analysis/CFLGraph.h
index a8fb12b72568..54782b6bd4ad 100644
--- a/lib/Analysis/CFLGraph.h
+++ b/lib/Analysis/CFLGraph.h
@@ -210,6 +210,11 @@ template <typename CFLAA> class CFLGraphBuilder {
     void addDerefEdge(Value *From, Value *To, bool IsRead) {
       assert(From != nullptr && To != nullptr);
+      // FIXME: This is subtly broken, due to how we model some instructions
+      // (e.g. extractvalue, extractelement) as loads. Since those take
+      // non-pointer operands, we'll entirely skip adding edges for those.
+      //
+      // addAssignEdge seems to have a similar issue with insertvalue, etc.
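+      //
+      // Editorial illustration (not from this patch): given IR such as
+      //   %agg = call { i8*, i64 } @f()
+      //   %p = extractvalue { i8*, i64 } %agg, 0
+      // the operand %agg is not pointer-typed, so the guard below skips the
+      // dereference edge even though %p refers to memory-derived data.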
if (!From->getType()->isPointerTy() || !To->getType()->isPointerTy()) return; addNode(From); @@ -540,6 +545,7 @@ template class CFLGraphBuilder { case Instruction::ExtractValue: { auto *Ptr = CE->getOperand(0); addLoadEdge(Ptr, CE); + break; } case Instruction::ShuffleVector: { auto *From1 = CE->getOperand(0); diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 79517ec6a3a8..6a1af87450c9 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1739,6 +1739,7 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if ((Name == "round" && TLI->has(LibFunc_round)) || (Name == "roundf" && TLI->has(LibFunc_roundf))) return ConstantFoldFP(round, V, Ty); + break; case 's': if ((Name == "sin" && TLI->has(LibFunc_sin)) || (Name == "sinf" && TLI->has(LibFunc_sinf))) @@ -1807,6 +1808,7 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, dyn_cast_or_null(Op->getAggregateElement(0U))) return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(), /*roundTowardZero=*/false, Ty); + LLVM_FALLTHROUGH; case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp index ebf0a370b0b0..b12ae9884e3d 100644 --- a/lib/Analysis/EHPersonalities.cpp +++ b/lib/Analysis/EHPersonalities.cpp @@ -27,8 +27,10 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) { return StringSwitch(F->getName()) .Case("__gnat_eh_personality", EHPersonality::GNU_Ada) .Case("__gxx_personality_v0", EHPersonality::GNU_CXX) + .Case("__gxx_personality_seh0",EHPersonality::GNU_CXX) .Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj) .Case("__gcc_personality_v0", EHPersonality::GNU_C) + .Case("__gcc_personality_seh0",EHPersonality::GNU_C) .Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj) .Case("__objc_personality_v0", EHPersonality::GNU_ObjC) .Case("_except_handler3", EHPersonality::MSVC_X86SEH) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 122442bafb11..66ac847455cd 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -103,13 +103,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { return false; // If we have a DominatorTree then do a precise test. - if (DT) { - if (!DT->isReachableFromEntry(P->getParent())) - return true; - if (!DT->isReachableFromEntry(I->getParent())) - return false; + if (DT) return DT->dominates(I, P); - } // Otherwise, if the instruction is in the entry block and is not an invoke, // then it obviously dominates all phi nodes. diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 66a0d145dcd8..188885063b39 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -691,6 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // load query, we can safely ignore it (scan past it). if (isLoad) continue; + LLVM_FALLTHROUGH; default: // Otherwise, there is a potential dependence. Return a clobber. 
return MemDepResult::getClobber(Inst); diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index f55ce202bcbb..d96697cafbe9 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -8182,6 +8182,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, case ICmpInst::ICMP_SGE: std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLE: // X s<= (X + C) if C >= 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative()) @@ -8195,6 +8196,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred, case ICmpInst::ICMP_SGT: std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLT: // X s< (X + C) if C > 0 if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && @@ -8552,6 +8554,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin))) return true; + LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_UGT: @@ -8566,6 +8569,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min))) return true; + LLVM_FALLTHROUGH; default: // No change diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 7a8d4f3be24f..ac646716476b 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -215,6 +215,10 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } +bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const { + return TTIImpl->expandMemCmp(I, MaxLoadSize); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index bd79cd56a18b..a5dceb6c2271 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -172,6 +172,18 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, } +bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) { + for (const User *U : CxtI->users()) { + if (const ICmpInst *IC = dyn_cast(U)) + if (IC->isEquality()) + if (Constant *C = dyn_cast(IC->getOperand(1))) + if (C->isNullValue()) + continue; + return false; + } + return true; +} + static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); @@ -2327,6 +2339,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, case Instruction::SExt: if (!LookThroughSExt) return false; // otherwise fall through to ZExt + LLVM_FALLTHROUGH; case Instruction::ZExt: return ComputeMultiple(I->getOperand(0), Base, Multiple, LookThroughSExt, Depth+1); diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index c1d81ac203a1..a402b4ddd462 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -378,18 +378,10 @@ class IndexBitcodeWriter : public BitcodeWriterBase { ModuleToSummariesForIndex->count(ModulePath); } - bool hasValueId(GlobalValue::GUID ValGUID) { - const auto &VMI = GUIDToValueIdMap.find(ValGUID); - return VMI != GUIDToValueIdMap.end(); - } - void assignValueId(GlobalValue::GUID ValGUID) { - unsigned &ValueId = GUIDToValueIdMap[ValGUID]; - if (ValueId == 0) - ValueId = 
++GlobalValueId;
-  }
-  unsigned getValueId(GlobalValue::GUID ValGUID) {
+  Optional<unsigned> getValueId(GlobalValue::GUID ValGUID) {
     auto VMI = GUIDToValueIdMap.find(ValGUID);
-    assert(VMI != GUIDToValueIdMap.end());
+    if (VMI == GUIDToValueIdMap.end())
+      return None;
     return VMI->second;
   }
   std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
@@ -3413,12 +3405,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
   Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});

-  // Create value IDs for undefined references.
-  forEachSummary([&](GVInfo I) {
-    for (auto &RI : I.second->refs())
-      assignValueId(RI.getGUID());
-  });
-
   for (const auto &GVI : valueIds()) {
     Stream.EmitRecord(bitc::FS_VALUE_GUID,
                       ArrayRef<uint64_t>{GVI.second, GVI.first});
@@ -3492,9 +3478,9 @@
     GlobalValueSummary *S = I.second;
     assert(S);

-    assert(hasValueId(I.first));
-    unsigned ValueId = getValueId(I.first);
-    SummaryToValueIdMap[S] = ValueId;
+    auto ValueId = getValueId(I.first);
+    assert(ValueId);
+    SummaryToValueIdMap[S] = *ValueId;

     if (auto *AS = dyn_cast<AliasSummary>(S)) {
       // Will process aliases as a post-pass because the reader wants all
@@ -3504,11 +3490,14 @@
     }

     if (auto *VS = dyn_cast<GlobalVarSummary>(S)) {
-      NameVals.push_back(ValueId);
+      NameVals.push_back(*ValueId);
       NameVals.push_back(Index.getModuleId(VS->modulePath()));
       NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
       for (auto &RI : VS->refs()) {
-        NameVals.push_back(getValueId(RI.getGUID()));
+        auto RefValueId = getValueId(RI.getGUID());
+        if (!RefValueId)
+          continue;
+        NameVals.push_back(*RefValueId);
       }

       // Emit the finished record.
@@ -3522,15 +3511,22 @@
     auto *FS = cast<FunctionSummary>(S);
     writeFunctionTypeMetadataRecords(Stream, FS);

-    NameVals.push_back(ValueId);
+    NameVals.push_back(*ValueId);
     NameVals.push_back(Index.getModuleId(FS->modulePath()));
     NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
     NameVals.push_back(FS->instCount());
-    NameVals.push_back(FS->refs().size());
+    // Fill in below
+    NameVals.push_back(0);
+    unsigned Count = 0;

     for (auto &RI : FS->refs()) {
-      NameVals.push_back(getValueId(RI.getGUID()));
+      auto RefValueId = getValueId(RI.getGUID());
+      if (!RefValueId)
+        continue;
+      NameVals.push_back(*RefValueId);
+      Count++;
     }
+    NameVals[4] = Count;

     bool HasProfileData = false;
     for (auto &EI : FS->calls()) {
@@ -3543,15 +3539,19 @@
       // If this GUID doesn't have a value id, it doesn't have a function
       // summary and we don't need to record any calls to it.
       GlobalValue::GUID GUID = EI.first.getGUID();
-      if (!hasValueId(GUID)) {
+      auto CallValueId = getValueId(GUID);
+      if (!CallValueId) {
        // For SamplePGO, the indirect call targets for local functions will
        // have its original name annotated in profile. We try to find the
        // corresponding PGOFuncName as the GUID.
GUID = Index.getGUIDFromOriginalID(GUID); - if (GUID == 0 || !hasValueId(GUID)) + if (GUID == 0) + continue; + CallValueId = getValueId(GUID); + if (!CallValueId) continue; } - NameVals.push_back(getValueId(GUID)); + NameVals.push_back(*CallValueId); if (HasProfileData) NameVals.push_back(static_cast(EI.second.Hotness)); } diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 43b245c66400..5abf50e5bd10 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -165,7 +165,7 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I; ++I) { unsigned Reg = *I; - if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg))) + if (!IsReturnBlock && !Pristine.test(Reg)) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { unsigned AliasReg = *AI; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index d72cf5922987..e61e22abe82a 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -949,6 +949,19 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) { MCConstantExpr::create(FrameOffset, OutContext)); } +static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF, + MachineModuleInfo *MMI) { + if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI->hasDebugInfo()) + return true; + + // We might emit an EH table that uses function begin and end labels even if + // we don't have any landingpads. + if (!MF.getFunction()->hasPersonalityFn()) + return false; + return !isNoOpWithoutInvoke( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())); +} + /// EmitFunctionBody - This method emits the body and trailer for a /// function. void AsmPrinter::EmitFunctionBody() { @@ -1076,8 +1089,8 @@ void AsmPrinter::EmitFunctionBody() { // Emit target-specific gunk after the function body. EmitFunctionBodyEnd(); - if (!MF->getLandingPads().empty() || MMI->hasDebugInfo() || - MF->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) { + if (needFuncLabelsForEHOrDebugInfo(*MF, MMI) || + MAI->hasDotTypeDotSizeDirective()) { // Create a symbol for the end of function. CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->EmitLabel(CurrentFnEnd); @@ -1402,8 +1415,7 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) { CurrentFnBegin = nullptr; CurExceptionSym = nullptr; bool NeedsLocalForSize = MAI->needsLocalForSize(); - if (!MF.getLandingPads().empty() || MMI->hasDebugInfo() || - MF.hasEHFunclets() || NeedsLocalForSize) { + if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize) { CurrentFnBegin = createTempSymbol("func_begin"); if (NeedsLocalForSize) CurrentFnSymForSize = CurrentFnBegin; diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 0a4a7a06cb2e..e14d5be1177a 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -309,7 +309,7 @@ computeCallSiteTable(SmallVectorImpl &CallSites, // If some instruction between the previous try-range and the end of the // function may throw, create a call-site entry with no landing pad for the // region following the try-range. 
- if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) { + if (SawPotentiallyThrowing && !IsSJLJ) { CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 }; CallSites.push_back(Site); } diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 2b5863aa5800..55a27e2fb79e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -49,6 +49,7 @@ add_llvm_library(LLVMCodeGen LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp + LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 2a2715beaadc..4d30c6574b12 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 4e85708efafc..568b278dd47c 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -24,12 +24,13 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -60,6 +61,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include "llvm/Transforms/Utils/ValueMapper.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -84,6 +86,12 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, + "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -144,6 +152,11 @@ EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden, cl::desc("Enable merging of redundant sexts when one is dominating" " the other."), cl::init(true)); +static cl::opt MemCmpNumLoadsPerBlock( + "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), + cl::desc("The number of loads per basic block for inline expansion of " + "memcmp that is only being compared against zero.")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef PointerIntPair TypeIsSExt; @@ -1629,6 +1642,593 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, return true; } +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. 
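+//
+// Editorial sketch (an illustration added in review, not part of this patch):
+// for memcmp(a, b, 15) on a little-endian target with MaxLoadSize = 8, the
+// expansion behaves like the following C++. The bswap calls make an unsigned
+// integer compare agree with memcmp's byte-wise, lexicographic ordering:
+//
+//   #include <cstdint>
+//   #include <cstring>
+//   int memcmp15(const unsigned char *a, const unsigned char *b) {
+//     uint64_t x8, y8;                        // loadbb: 8-byte loads
+//     std::memcpy(&x8, a, 8); std::memcpy(&y8, b, 8);
+//     x8 = __builtin_bswap64(x8); y8 = __builtin_bswap64(y8);
+//     if (x8 != y8) return x8 < y8 ? -1 : 1;  // early exit via res_block
+//     uint32_t x4, y4;                        // loadbb1: 4 bytes at offset 8
+//     std::memcpy(&x4, a + 8, 4); std::memcpy(&y4, b + 8, 4);
+//     x4 = __builtin_bswap32(x4); y4 = __builtin_bswap32(y4);
+//     if (x4 != y4) return x4 < y4 ? -1 : 1;
+//     uint16_t x2, y2;                        // loadbb2: 2 bytes at offset 12
+//     std::memcpy(&x2, a + 12, 2); std::memcpy(&y2, b + 12, 2);
+//     x2 = __builtin_bswap16(x2); y2 = __builtin_bswap16(y2);
+//     if (x2 != y2) return x2 < y2 ? -1 : 1;
+//     return int(a[14]) - int(b[14]);         // loadbb3: final byte
+//   }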
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB;
+    PHINode *PhiSrc1;
+    PHINode *PhiSrc2;
+    ResultBlock();
+  };
+
+  CallInst *CI;
+  ResultBlock ResBlock;
+  unsigned MaxLoadSize;
+  unsigned NumBlocks;
+  unsigned NumBlocksNonOneByte;
+  unsigned NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  bool IsUsedForZeroCmp;
+  int calculateNumBlocks(unsigned Size);
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex,
+                            bool IsLittleEndian);
+  void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
+                                         unsigned &NumBytesProcessed);
+  void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
+  void emitMemCmpResultBlock(bool IsLittleEndian);
+  Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  unsigned getLoadSize(unsigned Size);
+  unsigned getNumLoads(unsigned Size);
+
+public:
+  MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                  unsigned NumLoadsPerBlock);
+  Value *getMemCmpExpansion(bool IsLittleEndian);
+};
+
+MemCmpExpansion::ResultBlock::ResultBlock()
+    : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                                 unsigned NumLoadsPerBlock)
+    : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) {
+
+  IRBuilder<> Builder(CI->getContext());
+
+  BasicBlock *StartBlock = CI->getParent();
+  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+  setupEndBlockPHINodes();
+  IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  // Calculate how many load compare blocks are required for an expansion of
+  // given Size.
+  NumBlocks = calculateNumBlocks(Size);
+  createResultBlock();
+
+  // If return value of memcmp is not used in a zero equality, we need to
+  // calculate which source was larger. The calculation requires the
+  // two loaded source values of each load compare block.
+  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+  if (!IsUsedForZeroCmp)
+    setupResultBlockPHINodes();
+
+  // Create the number of required load compare basic blocks.
+  createLoadCmpBlocks();
+
+  // Update the terminator added by splitBasicBlock to branch to the first
+  // LoadCmpBlock.
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < NumBlocks; i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex.
It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result. +void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { + IRBuilder<> Builder(CI->getContext()); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(CI->getContext()); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext())); + LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext())); + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + + if (Index < (LoadCmpBlocks.size() - 1)) { + // Early exit branch if difference found to EndBlock, otherwise continue to + // next LoadCmpBlock + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BranchInst *CmpBr = + BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp); + Builder.Insert(CmpBr); + } else { + // The last block has an unconditional branch to EndBlock + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); + } +} + +unsigned MemCmpExpansion::getNumLoads(unsigned Size) { + return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize); +} + +unsigned MemCmpExpansion::getLoadSize(unsigned Size) { + return MinAlign(PowerOf2Floor(Size), MaxLoadSize); +} + +void MemCmpExpansion::emitLoadCompareBlockMultipleLoads( + unsigned Index, unsigned Size, unsigned &NumBytesProcessed) { + + IRBuilder<> Builder(CI->getContext()); + + std::vector XorList, OrList; + Value *Diff; + + unsigned RemainingBytes = Size - NumBytesProcessed; + unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); + unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + + for (unsigned i = 0; i < NumLoads; ++i) { + unsigned LoadSize = getLoadSize(RemainingBytes); + unsigned GEPIndex = NumBytesProcessed / LoadSize; + NumBytesProcessed += LoadSize; + RemainingBytes -= LoadSize; + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + 
}
+
+    // Load LoadSizeType from the base address
+    Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+    Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+    if (LoadSizeType != MaxLoadType) {
+      LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+      LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+    }
+    Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType);
+    XorList.push_back(Diff);
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  // Pair wise OR the XOR results
+  OrList = pairWiseOr(XorList);
+
+  // Pair wise OR the OR results until one result left
+  while (OrList.size() != 1) {
+    OrList = pairWiseOr(OrList);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0],
+                                  ConstantInt::get(Diff->getType(), 0));
+  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[Index + 1];
+  // Early exit branch if difference found to ResultBlock, otherwise continue to
+  // next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes)
+  if (Index == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+  }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then does a subtract to see if
+// there was a difference in the loaded values. If a difference is found, it
+// branches with an early exit to the ResultBlock for calculating which source
+// was larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled
+// with a special case through emitLoadCompareByteBlock. The special handling
+// can simply subtract the loaded values and add it to the result phi node.
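+//
+// Editorial note (added for clarity, not part of this patch): the byte case
+// is safe to subtract directly because both operands are first zero-extended
+// to i32, so the difference cannot wrap; e.g. bytes 0x01 and 0xFF give
+// 1 - 255 = -254, which has the sign memcmp requires. Wider loads instead
+// compare byte-swapped values with an unsigned less-than in the ResultBlock.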
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, + int GEPIndex, bool IsLittleEndian) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + IRBuilder<> Builder(CI->getContext()); + + Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (IsLittleEndian) { + Function *F = LoadCmpBlocks[Index]->getParent(); + + Function *Bswap = Intrinsic::getDeclaration(F->getParent(), + Intrinsic::bswap, LoadSizeType); + LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1); + LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2); + } + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality. + if (!IsUsedForZeroCmp) { + ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock, otherwise continue to + // next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes) + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); + PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It compares the two loaded source values and returns -1 if +// src1 < src2 and 1 if src1 > src2. +void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) { + IRBuilder<> Builder(CI->getContext()); + + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. 
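+  // Editorial example (not from this patch): this covers callers that only
+  // test for (in)equality, e.g.
+  //   if (memcmp(p, q, 16) == 0) { ... }
+  // where any nonzero value distinguishes "different", so the constant 1
+  // suffices and the PhiSrc1/PhiSrc2 ordering logic below is skipped.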
+ if (IsUsedForZeroCmp) { + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); + PhiRes->addIncoming(Res, ResBlock.BB); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.BB, InsertPt); + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1, + ResBlock.PhiSrc2); + + Value *Res = + Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), + ConstantInt::get(Builder.getInt32Ty(), 1)); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + PhiRes->addIncoming(Res, ResBlock.BB); +} + +int MemCmpExpansion::calculateNumBlocks(unsigned Size) { + int NumBlocks = 0; + bool haveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + haveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + + if (IsUsedForZeroCmp) + NumBlocks = NumBlocks / NumLoadsPerBlock + + (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0); + + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.BB); + ResBlock.PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + ResBlock.PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + IRBuilder<> Builder(CI->getContext()); + + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); +} + +Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size, + bool IsLittleEndian) { + unsigned NumBytesProcessed = 0; + // This loop populates each of the LoadCmpBlocks with IR sequence to handle + // multiple loads per block + for (unsigned i = 0; i < NumBlocks; ++i) { + emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed); + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) { + + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + uint64_t Size = SizeCast->getZExtValue(); + + int LoadSize = MaxLoadSize; + int NumBytesToBeProcessed = Size; + + if (IsUsedForZeroCmp) { + return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + } + + unsigned Index = 0; + // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two + // memcmp source. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2. 
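+  // Editorial trace (added for clarity, not part of this patch): with
+  // Size = 15 and MaxLoadSize = 8 the loop emits one 8-byte block at
+  // GEPIndex 0, one 4-byte block at GEPIndex 2 (byte offset 8), one 2-byte
+  // block at GEPIndex 6 (byte offset 12), and one 1-byte block at
+  // GEPIndex 14, matching the loadbb/loadbb1/loadbb2/loadbb3 IR listed in
+  // the comment on expandMemCmp below.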
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) {
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  int LoadSize = MaxLoadSize;
+  int NumBytesToBeProcessed = Size;
+
+  if (IsUsedForZeroCmp) {
+    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+  }
+
+  unsigned Index = 0;
+  // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
+  // memcmp sources. It starts with loading using the maximum load size set by
+  // the target. It processes any remaining bytes using a load size which is
+  // the next smallest power of 2.
+  while (NumBytesToBeProcessed) {
+    // Calculate how many blocks we can create with the current load size.
+    int NumBlocks = NumBytesToBeProcessed / LoadSize;
+    int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
+    NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
+
+    // For each NumBlocks, populate the instruction sequence for loading and
+    // comparing LoadSize bytes.
+    while (NumBlocks--) {
+      emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian);
+      Index++;
+      GEPIndex++;
+    }
+    // Get the next LoadSize to use.
+    LoadSize = LoadSize / 2;
+  }
+
+  emitMemCmpResultBlock(IsLittleEndian);
+  return PhiRes;
+}
+
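+// The schedule of load sizes produced above is the greedy power-of-two
+// decomposition of the compare size. A plain C++ model (illustrative only;
+// the real MaxLoadSize comes from the target via TTI):
+//
+//   #include <vector>
+//
+//   // loadSchedule(15, 8) yields {8, 4, 2, 1}, matching the i64/i32/i16/i8
+//   // loads in the example below.
+//   static std::vector<unsigned> loadSchedule(unsigned Size, unsigned MaxLoad) {
+//     std::vector<unsigned> Loads;
+//     for (unsigned LS = MaxLoad; Size != 0; LS /= 2) {
+//       for (unsigned I = 0, E = Size / LS; I != E; ++I)
+//         Loads.push_back(LS);
+//       Size %= LS;
+//     }
+//     return Loads;
+//   }
+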
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for a constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave the call as a library
+// call. Otherwise, the library call is replaced with a new IR instruction
+// sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+///  %0 = bitcast i32* %buffer2 to i8*
+///  %1 = bitcast i32* %buffer1 to i8*
+///  %2 = bitcast i8* %1 to i64*
+///  %3 = bitcast i8* %0 to i64*
+///  %4 = load i64, i64* %2
+///  %5 = load i64, i64* %3
+///  %6 = call i64 @llvm.bswap.i64(i64 %4)
+///  %7 = call i64 @llvm.bswap.i64(i64 %5)
+///  %8 = sub i64 %6, %7
+///  %9 = icmp ne i64 %8, 0
+///  br i1 %9, label %res_block, label %loadbb1
+/// res_block:                                       ; preds = %loadbb2,
+///                                                    %loadbb1, %loadbb
+///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+///  %10 = icmp ult i64 %phi.src1, %phi.src2
+///  %11 = select i1 %10, i32 -1, i32 1
+///  br label %endblock
+/// loadbb1:                                         ; preds = %loadbb
+///  %12 = bitcast i32* %buffer2 to i8*
+///  %13 = bitcast i32* %buffer1 to i8*
+///  %14 = bitcast i8* %13 to i32*
+///  %15 = bitcast i8* %12 to i32*
+///  %16 = getelementptr i32, i32* %14, i32 2
+///  %17 = getelementptr i32, i32* %15, i32 2
+///  %18 = load i32, i32* %16
+///  %19 = load i32, i32* %17
+///  %20 = call i32 @llvm.bswap.i32(i32 %18)
+///  %21 = call i32 @llvm.bswap.i32(i32 %19)
+///  %22 = zext i32 %20 to i64
+///  %23 = zext i32 %21 to i64
+///  %24 = sub i64 %22, %23
+///  %25 = icmp ne i64 %24, 0
+///  br i1 %25, label %res_block, label %loadbb2
+/// loadbb2:                                         ; preds = %loadbb1
+///  %26 = bitcast i32* %buffer2 to i8*
+///  %27 = bitcast i32* %buffer1 to i8*
+///  %28 = bitcast i8* %27 to i16*
+///  %29 = bitcast i8* %26 to i16*
+///  %30 = getelementptr i16, i16* %28, i16 6
+///  %31 = getelementptr i16, i16* %29, i16 6
+///  %32 = load i16, i16* %30
+///  %33 = load i16, i16* %31
+///  %34 = call i16 @llvm.bswap.i16(i16 %32)
+///  %35 = call i16 @llvm.bswap.i16(i16 %33)
+///  %36 = zext i16 %34 to i64
+///  %37 = zext i16 %35 to i64
+///  %38 = sub i64 %36, %37
+///  %39 = icmp ne i64 %38, 0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                         ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                        ; preds = %res_block,
+///                                                    %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+  IRBuilder<> Builder(CI->getContext());
+
+  // TTI call to check if the target would like to expand memcmp and get the
+  // MaxLoadSize.
+  unsigned MaxLoadSize;
+  if (!TTI->expandMemCmp(CI, MaxLoadSize))
+    return false;
+
+  // Early exit from expansion if -Oz.
+  if (CI->getParent()->getParent()->optForMinSize()) {
+    return false;
+  }
+
+  // Early exit from expansion if size is not a constant.
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  // Early exit from expansion if size is greater than the max bytes to load.
+  uint64_t SizeVal = SizeCast->getZExtValue();
+
+  unsigned NumLoads = 0;
+  unsigned RemainingSize = SizeVal;
+  unsigned LoadSize = MaxLoadSize;
+  while (RemainingSize) {
+    NumLoads += RemainingSize / LoadSize;
+    RemainingSize = RemainingSize % LoadSize;
+    LoadSize = LoadSize / 2;
+  }
+
+  if (NumLoads >
+      TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  // The MemCmpHelper object creates and sets up the basic blocks required for
+  // expanding memcmp with size SizeVal.
+  unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
+  MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock);
+
+  Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian());
+
+  // Replace the call with the result of the expansion and erase the call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -1780,6 +2380,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     CI->eraseFromParent();
     return true;
   }
+
+  LibFunc Func;
+  if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) &&
+      Func == LibFunc_memcmp) {
+    if (expandMemCmp(CI, TTI, TLI, DL)) {
+      ModifiedDT = true;
+      return true;
+    }
+  }
 
   return false;
 }
@@ -4927,6 +5536,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
   return true;
 }
 
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
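The @llvm.bswap calls in the memcmp expansion above exist because memcmp order
is lexicographic over unsigned bytes, which is exactly unsigned comparison of
big-endian words; little-endian targets therefore byte-swap each loaded word
first. A standalone illustration in plain C++ (names are illustrative only):

  #include <cstdint>
  #include <cstring>

  static std::uint32_t bswap32(std::uint32_t X) {
    return (X >> 24) | ((X >> 8) & 0xFF00u) | ((X << 8) & 0xFF0000u) |
           (X << 24);
  }

  // On a little-endian host this matches the sign of memcmp(P, Q, 4).
  static int cmp4(const void *P, const void *Q) {
    std::uint32_t A, B;
    std::memcpy(&A, P, 4);
    std::memcpy(&B, Q, 4);
    A = bswap32(A);
    B = bswap32(B);
    return A < B ? -1 : (A > B ? 1 : 0);
  }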
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index b2d6652b075e..a3cf2846d2f5 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -74,7 +74,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
        ++I) {
     unsigned Reg = *I;
-    if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg)))
+    if (!IsReturnBlock && !Pristine.test(Reg))
       continue;
     for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
       unsigned Reg = *AI;
diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp
index c2a568e4b452..c5d0999fe438 100644
--- a/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -98,12 +98,10 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
       // Create the localized instruction.
       MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
       LocalizedInstrs.insert(LocalizedMI);
-      // Move it at the right place.
-      MachineInstr &MIUse = *MOUse.getParent();
-      if (MIUse.getParent() == InsertMBB)
-        InsertMBB->insert(MIUse, LocalizedMI);
-      else
-        InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI);
+      // Don't try to be smart for the insertion point.
+      // There is no guarantee that the first seen use is the first
+      // use in the block.
+      InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI);
 
       // Set a new register for the definition.
       unsigned NewReg =
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 24e289dd4f1b..444416a77008 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -607,8 +607,20 @@ MachineInstr *ImplicitNullChecks::insertFaultingInstr(
       .addMBB(HandlerMBB)
       .addImm(MI->getOpcode());
 
-  for (auto &MO : MI->uses())
-    MIB.add(MO);
+  for (auto &MO : MI->uses()) {
+    if (MO.isReg()) {
+      MachineOperand NewMO = MO;
+      if (MO.isUse()) {
+        NewMO.setIsKill(false);
+      } else {
+        assert(MO.isDef() && "Expected def or use");
+        NewMO.setIsDead(false);
+      }
+      MIB.add(NewMO);
+    } else {
+      MIB.add(MO);
+    }
+  }
 
   MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..552f4b5393fe
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,231 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves instructions closer to the definitions of their operands to
+/// shrink the live ranges of those defs. The code motion is limited to within
+/// the basic block. A moved instruction must have exactly one def and more
+/// than one use operand, each of which is the only use of its def.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+          "Number of instructions hoisted to shrink live range.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+  static char ID;
+
+  LiveRangeShrink() : MachineFunctionPass(ID) {
+    initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Live Range Shrink"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+                false)
+
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it's dominated by \p Old, otherwise return \p Old.
+/// \p M maintains a map from instruction to its dominating order, such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
+/// \p New.
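+/// For example, two instructions can share one order number after a hoist
+/// updates the map; the tie is broken by walking forward from \p Old through
+/// the instructions still carrying the same order and checking whether \p New
+/// is reached.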
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+                                       const InstOrderMap &M) {
+  auto NewIter = M.find(&New);
+  if (NewIter == M.end())
+    return Old;
+  if (Old == nullptr)
+    return &New;
+  unsigned OrderOld = M.find(Old)->second;
+  unsigned OrderNew = NewIter->second;
+  if (OrderOld != OrderNew)
+    return OrderOld < OrderNew ? &New : Old;
+  // OrderOld == OrderNew, so we need to iterate down from Old to see if it
+  // can reach New; if yes, New is dominated by Old.
+  for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+       I = I->getNextNode())
+    if (I == &New)
+      return &New;
+  return Old;
+}
+
+/// Builds the instruction-to-dominating-order-number map \p M by traversing
+/// from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+  M.clear();
+  unsigned i = 0;
+  for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+    M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+  InstOrderMap IOM;
+  // Map from register to the instruction order (value of IOM) at which the
+  // register is last used. When moving an instruction up, we need to make
+  // sure that none of its defs (including dead defs) cross their last uses.
+  DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap;
+
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.empty())
+      continue;
+    bool SawStore = false;
+    BuildInstOrderMap(MBB.begin(), IOM);
+    UseMap.clear();
+
+    for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+      MachineInstr &MI = *Next;
+      ++Next;
+      if (MI.isPHI() || MI.isDebugValue())
+        continue;
+      if (MI.mayStore())
+        SawStore = true;
+
+      unsigned CurrentOrder = IOM[&MI];
+      unsigned Barrier = 0;
+      MachineInstr *BarrierMI = nullptr;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDebug())
+          continue;
+        if (MO.isUse())
+          UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI);
+        else if (MO.isDead() && UseMap.count(MO.getReg()))
+          // Barrier is the last instruction where MO gets used. MI should not
+          // be moved above Barrier.
+          if (Barrier < UseMap[MO.getReg()].first) {
+            Barrier = UseMap[MO.getReg()].first;
+            BarrierMI = UseMap[MO.getReg()].second;
+          }
+      }
+
+      if (!MI.isSafeToMove(nullptr, SawStore)) {
+        // If MI has side effects, it should become a barrier for code motion.
+        // IOM is rebuilt from the next instruction to prevent later
+        // instructions from being moved before this MI.
+        if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+          BuildInstOrderMap(Next, IOM);
+          SawStore = false;
+        }
+        continue;
+      }
+
+      const MachineOperand *DefMO = nullptr;
+      MachineInstr *Insert = nullptr;
+
+      // Number of live-ranges that will be shortened. We do not count
+      // live-ranges that are defined by a COPY as those could be coalesced
+      // later.
+      unsigned NumEligibleUse = 0;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDead() || MO.isDebug())
+          continue;
+        unsigned Reg = MO.getReg();
+        // Do not move the instruction if it def/uses a physical register,
+        // unless it is a constant physical register or a noreg.
+        if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+          if (!Reg || MRI.isConstantPhysReg(Reg))
+            continue;
+          Insert = nullptr;
+          break;
+        }
+        if (MO.isDef()) {
+          // Do not move if there is more than one def.
+ if (DefMO) { + Insert = nullptr; + break; + } + DefMO = &MO; + } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO && + MRI.getRegClass(DefMO->getReg()) == + MRI.getRegClass(MO.getReg())) { + // The heuristic does not handle different register classes yet + // (registers of different sizes, looser/tighter constraints). This + // is because it needs more accurate model to handle register + // pressure correctly. + MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); + if (!DefInstr.isCopy()) + NumEligibleUse++; + Insert = FindDominatedInstruction(DefInstr, Insert, IOM); + } else { + Insert = nullptr; + break; + } + } + + // If Barrier equals IOM[I], traverse forward to find if BarrierMI is + // after Insert, if yes, then we should not hoist. + for (MachineInstr *I = Insert; I && IOM[I] == Barrier; + I = I->getNextNode()) + if (I == BarrierMI) { + Insert = nullptr; + break; + } + // Move the instruction when # of shrunk live range > 1. + if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) { + MachineBasicBlock::iterator I = std::next(Insert->getIterator()); + // Skip all the PHI and debug instructions. + while (I != MBB.end() && (I->isPHI() || I->isDebugValue())) + I = std::next(I); + if (I == MI.getIterator()) + continue; + + // Update the dominator order to be the same as the insertion point. + // We do this to maintain a non-decreasing order without need to update + // all instruction orders after the insertion point. + unsigned NewOrder = IOM[&*I]; + IOM[&MI] = NewOrder; + NumInstrsHoistedToShrinkLiveRange++; + + // Find MI's debug value following MI. + MachineBasicBlock::iterator EndIter = std::next(MI.getIterator()); + if (MI.getOperand(0).isReg()) + for (; EndIter != MBB.end() && EndIter->isDebugValue() && + EndIter->getOperand(0).isReg() && + EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg(); + ++EndIter, ++Next) + IOM[&*EndIter] = NewOrder; + MBB.splice(I, &MBB, MI.getIterator(), EndIter); + } + } + } + return false; +} diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp index bd04acd049db..ff12297e3fc6 100644 --- a/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/lib/CodeGen/MIRParser/MIRParser.cpp @@ -332,8 +332,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) { MF.setAlignment(YamlMF.Alignment); MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice); - if (YamlMF.NoVRegs) - MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); if (YamlMF.Legalized) MF.getProperties().set(MachineFunctionProperties::Property::Legalized); if (YamlMF.RegBankSelected) diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 6f6a67d81b0f..293fc7358b8e 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -183,8 +183,6 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.Alignment = MF.getAlignment(); YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice(); - YamlMF.NoVRegs = MF.getProperties().hasProperty( - MachineFunctionProperties::Property::NoVRegs); YamlMF.Legalized = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Legalized); YamlMF.RegBankSelected = MF.getProperties().hasProperty( diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 06112723497b..590acc01008a 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -350,6 +350,13 @@ void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) { LiveIns.erase(I); } 
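+// LiveIns only hands out const iterators, so the overload below first rebases
+// a mutable iterator from begin() using the iterator's offset, the usual idiom
+// for containers whose erase() lacks a const_iterator overload. In plain C++
+// (illustrative sketch only):
+//
+//   #include <vector>
+//
+//   template <typename T>
+//   typename std::vector<T>::iterator
+//   asMutable(std::vector<T> &V, typename std::vector<T>::const_iterator CI) {
+//     return V.begin() + (CI - V.cbegin());
+//   }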
+MachineBasicBlock::livein_iterator +MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) { + // Get non-const version of iterator. + LiveInVector::iterator LI = LiveIns.begin() + (I - LiveIns.begin()); + return LiveIns.erase(LI); +} + bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const { livein_iterator I = find_if( LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; }); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index d665201a5d17..306b75dbbae7 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -1,4 +1,4 @@ -//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===// +//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -35,9 +48,13 @@ #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -45,6 +62,14 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include +#include +#include +#include +#include +#include +#include + using namespace llvm; static cl::opt PrintWholeRegMask( @@ -256,7 +281,7 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_GlobalAddress: return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset(); case MachineOperand::MO_ExternalSymbol: - return !strcmp(getSymbolName(), Other.getSymbolName()) && + return strcmp(getSymbolName(), Other.getSymbolName()) == 0 && getOffset() == Other.getOffset(); case MachineOperand::MO_BlockAddress: return getBlockAddress() == Other.getBlockAddress() && @@ -723,9 +748,7 @@ void 
MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// the MCInstrDesc. MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, DebugLoc dl, bool NoImp) - : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0), - AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr), - debugLoc(std::move(dl)) { + : MCID(&tid), debugLoc(std::move(dl)) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. @@ -742,9 +765,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, /// MachineInstr ctor - Copies MachineInstr arg exactly /// MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0), - Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs), - MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc()) { + : MCID(&MI.getDesc()), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs), + debugLoc(MI.getDebugLoc()) { assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); @@ -1633,8 +1655,8 @@ bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other, // memory objects. It can save compile time, and possibly catch some // corner cases not currently covered. - assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); - assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset"); + assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset"); int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset()); int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset; @@ -1667,7 +1689,7 @@ bool MachineInstr::hasOrderedMemoryRef() const { return true; // Check if any of our memory operands are ordered. - return any_of(memoperands(), [](const MachineMemOperand *MMO) { + return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) { return !MMO->isUnordered(); }); } @@ -1841,7 +1863,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, return; // Print the rest of the operands. - bool OmittedAnyCallClobbers = false; bool FirstOp = true; unsigned AsmDescOp = ~0u; unsigned AsmOpCount = 0; @@ -1878,31 +1899,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) VirtRegs.push_back(MO.getReg()); - // Omit call-clobbered registers which aren't used anywhere. This makes - // call instructions much less noisy on targets where calls clobber lots - // of registers. Don't rely on MO.isDead() because we may be called before - // LiveVariables is run, or we may be looking at a non-allocatable reg. 
- if (MRI && isCall() && - MO.isReg() && MO.isImplicit() && MO.isDef()) { - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (MRI->use_empty(Reg)) { - bool HasAliasLive = false; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - unsigned AliasReg = *AI; - if (!MRI->use_empty(AliasReg)) { - HasAliasLive = true; - break; - } - } - if (!HasAliasLive) { - OmittedAnyCallClobbers = true; - continue; - } - } - } - } - if (FirstOp) FirstOp = false; else OS << ","; OS << " "; if (i < getDesc().NumOperands) { @@ -1984,12 +1980,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, MO.print(OS, MST, TRI); } - // Briefly indicate whether any call clobbers were omitted. - if (OmittedAnyCallClobbers) { - if (!FirstOp) OS << ","; - OS << " ..."; - } - bool HaveSemi = false; const unsigned PrintableFlags = FrameSetup | FrameDestroy; if (Flags & PrintableFlags) { @@ -2255,8 +2245,8 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef UsedRegs, unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // If there are no uses, including partial uses, the def is dead. - if (none_of(UsedRegs, - [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) + if (llvm::none_of(UsedRegs, + [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); })) MO.setIsDead(); } diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index 6cf751d34e26..c1b72430e605 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -7,27 +7,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionInitializer.h" -#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constants.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Dwarf.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include +#include +#include +#include +#include + using namespace llvm; using namespace llvm::dwarf; @@ -37,14 +44,16 @@ INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo", char MachineModuleInfo::ID = 0; // Out of line virtual method. 
-MachineModuleInfoImpl::~MachineModuleInfoImpl() {} +MachineModuleInfoImpl::~MachineModuleInfoImpl() = default; namespace llvm { + class MMIAddrLabelMapCallbackPtr final : CallbackVH { - MMIAddrLabelMap *Map; + MMIAddrLabelMap *Map = nullptr; + public: - MMIAddrLabelMapCallbackPtr() : Map(nullptr) {} - MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {} + MMIAddrLabelMapCallbackPtr() = default; + MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {} void setPtr(BasicBlock *BB) { ValueHandleBase::operator=(BB); @@ -75,11 +84,12 @@ class MMIAddrLabelMap { /// This is a per-function list of symbols whose corresponding BasicBlock got /// deleted. These symbols need to be emitted at some point in the file, so /// AsmPrinter emits them after the function body. - DenseMap, std::vector > + DenseMap, std::vector> DeletedAddrLabelsNeedingEmission; -public: +public: MMIAddrLabelMap(MCContext &context) : Context(context) {} + ~MMIAddrLabelMap() { assert(DeletedAddrLabelsNeedingEmission.empty() && "Some labels for deleted blocks never got emitted"); @@ -93,7 +103,8 @@ class MMIAddrLabelMap { void UpdateForDeletedBlock(BasicBlock *BB); void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New); }; -} + +} // end namespace llvm ArrayRef MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { assert(BB->hasAddressTaken() && @@ -119,7 +130,7 @@ ArrayRef MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) { /// If we have any deleted symbols for F, return them. void MMIAddrLabelMap:: takeDeletedSymbolsForFunction(Function *F, std::vector &Result) { - DenseMap, std::vector >::iterator I = + DenseMap, std::vector>::iterator I = DeletedAddrLabelsNeedingEmission.find(F); // If there are no entries for the function, just return. @@ -130,7 +141,6 @@ takeDeletedSymbolsForFunction(Function *F, std::vector &Result) { DeletedAddrLabelsNeedingEmission.erase(I); } - void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) { // If the block got deleted, there is no need for the symbol. If the symbol // was already emitted, we can just forget about it, otherwise we need to @@ -177,7 +187,6 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { OldEntry.Symbols.end()); } - void MMIAddrLabelMapCallbackPtr::deleted() { Map->UpdateForDeletedBlock(cast(getValPtr())); } @@ -186,9 +195,6 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { Map->UpdateForRAUWBlock(cast(getValPtr()), cast(V2)); } - -//===----------------------------------------------------------------------===// - MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) : ImmutablePass(ID), TM(*TM), Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(), @@ -196,11 +202,9 @@ MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM) initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); } -MachineModuleInfo::~MachineModuleInfo() { -} +MachineModuleInfo::~MachineModuleInfo() = default; bool MachineModuleInfo::doInitialization(Module &M) { - ObjFileMMI = nullptr; CurCallSite = 0; DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false; @@ -211,7 +215,6 @@ bool MachineModuleInfo::doInitialization(Module &M) { } bool MachineModuleInfo::doFinalization(Module &M) { - Personalities.clear(); delete AddrLabelSymbols; @@ -290,10 +293,12 @@ void MachineModuleInfo::deleteMachineFunctionFor(Function &F) { } namespace { + /// This pass frees the MachineFunction object associated with a Function. 
class FreeMachineFunction : public FunctionPass { public: static char ID; + FreeMachineFunction() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -311,14 +316,14 @@ class FreeMachineFunction : public FunctionPass { return "Free MachineFunction"; } }; -char FreeMachineFunction::ID; + } // end anonymous namespace -namespace llvm { -FunctionPass *createFreeMachineFunctionPass() { +char FreeMachineFunction::ID; + +FunctionPass *llvm::createFreeMachineFunctionPass() { return new FreeMachineFunction(); } -} // end namespace llvm //===- MMI building helpers -----------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ab36bc1417ae..fb51a4eb1421 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -280,6 +280,7 @@ namespace { SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSETCCE(SDNode *N); + SDValue visitSETCCCARRY(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); @@ -1457,6 +1458,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SETCCE: return visitSETCCE(N); + case ISD::SETCCCARRY: return visitSETCCCARRY(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); @@ -1958,7 +1960,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && - VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1)) + DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); if (SDValue Combined = visitADDLike(N0, N1, N)) @@ -1970,6 +1972,44 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return SDValue(); } +static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { + bool Masked = false; + + // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. + while (true) { + if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { + V = V.getOperand(0); + continue; + } + + if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { + Masked = true; + V = V.getOperand(0); + continue; + } + + break; + } + + // If this is not a carry, return. + if (V.getResNo() != 1) + return SDValue(); + + if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && + V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) + return SDValue(); + + // If the result is masked, then no matter what kind of bool it is we can + // return. If it isn't, then we need to make sure the bool type is either 0 or + // 1 and not other values. 
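+  // ADDCARRY consumes the carry operand as an arithmetic 0-or-1 value, as in
+  // this plain C++ model of one limb of a multi-word add (illustrative only):
+  //
+  //   // CarryIn must be 0 or 1; each addition can then wrap at most once.
+  //   static uint64_t addcarry64(uint64_t A, uint64_t B, uint64_t CarryIn,
+  //                              uint64_t &CarryOut) {
+  //     uint64_t S = A + B;
+  //     uint64_t R = S + CarryIn;
+  //     CarryOut = (S < A) | (R < S);
+  //     return R;
+  //   }
+  //
+  // A bool represented as 0/-1 (all ones) would corrupt such a sum, hence the
+  // ZeroOrOneBooleanContent requirement when the value was not masked.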
+ if (Masked || + TLI.getBooleanContents(V.getValueType()) == + TargetLoweringBase::ZeroOrOneBooleanContent) + return V; + + return SDValue(); +} + SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { EVT VT = N0.getValueType(); SDLoc DL(LocReference); @@ -2017,6 +2057,13 @@ SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0, N1.getOperand(0), N1.getOperand(2)); + // (add X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, DL, + DAG.getVTList(VT, Carry.getValueType()), N0, + DAG.getConstant(0, DL, VT), Carry); + return SDValue(); } @@ -2090,6 +2137,8 @@ SDValue DAGCombiner::visitUADDO(SDNode *N) { } SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { + auto VT = N0.getValueType(); + // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) // If Y + 1 cannot overflow. if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { @@ -2100,6 +2149,12 @@ SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { N1.getOperand(2)); } + // (uaddo X, Carry) -> (addcarry X, 0, Carry) + if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) + if (SDValue Carry = getAsCarry(TLI, N1)) + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), VT), Carry); + return SDValue(); } @@ -2167,6 +2222,41 @@ SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0.getOperand(0), N0.getOperand(1), CarryIn); + /** + * When one of the addcarry argument is itself a carry, we may be facing + * a diamond carry propagation. In which case we try to transform the DAG + * to ensure linear carry propagation if that is possible. + * + * We are trying to get: + * (addcarry X, 0, (addcarry A, B, Z):Carry) + */ + if (auto Y = getAsCarry(TLI, N1)) { + /** + * (uaddo A, B) + * / \ + * Carry Sum + * | \ + * | (addcarry *, 0, Z) + * | / + * \ Carry + * | / + * (addcarry X, *, *) + */ + if (Y.getOpcode() == ISD::UADDO && + CarryIn.getResNo() == 1 && + CarryIn.getOpcode() == ISD::ADDCARRY && + isNullConstant(CarryIn.getOperand(1)) && + CarryIn.getOperand(0) == Y.getValue(0)) { + auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(), + Y.getOperand(0), Y.getOperand(1), + CarryIn.getOperand(2)); + AddToWorklist(NewY.getNode()); + return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, + DAG.getConstant(0, SDLoc(N), N0.getValueType()), + NewY.getValue(1)); + } + } + return SDValue(); } @@ -6754,6 +6844,19 @@ SDValue DAGCombiner::visitSETCCE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + + // If Carry is false, fold to a regular SETCC. + if (isNullConstant(Carry)) + return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); + + return SDValue(); +} + /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. 
/// This function is called by the DAGCombiner when visiting sext/zext/aext @@ -7124,12 +7227,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); + return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! } } @@ -7185,10 +7287,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); - CombineTo(N, And); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); + return CombineTo(N, And); // Return N so it doesn't get rechecked! } } } @@ -7427,12 +7528,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), - ISD::ZERO_EXTEND); - CombineTo(N, ExtLoad); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! } } @@ -7482,11 +7580,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); - CombineTo(N, And); + ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, - ISD::ZERO_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return CombineTo(N, And); // Return N so it doesn't get rechecked! } } } @@ -12777,10 +12873,10 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { } // If we have load/store pair instructions and we only have two values, - // don't bother. + // don't bother merging. 
unsigned RequiredAlignment; if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && - St->getAlignment() >= RequiredAlignment) { + StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); continue; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 92b0d2ae4015..0d5e07ded25c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2875,6 +2875,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; + case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; @@ -3009,14 +3010,16 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, return; } - // Lower with SETCCE if the target supports it. + // Lower with SETCCE or SETCCCARRY if the target supports it. + EVT HiVT = LHSHi.getValueType(); + EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT); + bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT); + // FIXME: Make all targets support this, then remove the other lowering. - if (TLI.getOperationAction( - ISD::SETCCE, - TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == - TargetLowering::Custom) { - // SETCCE can detect < and >= directly. For > and <=, flip operands and - // condition code. + if (HasSETCCCARRY || + TLI.getOperationAction(ISD::SETCCE, ExpandVT) == TargetLowering::Custom) { + // SETCCE/SETCCCARRY can detect < and >= directly. For > and <=, flip + // operands and condition code. bool FlipOperands = false; switch (CCCode) { case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; @@ -3030,27 +3033,28 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, std::swap(LHSHi, RHSHi); } // Perform a wide subtraction, feeding the carry from the low part into - // SETCCE. The SETCCE operation is essentially looking at the high part of - // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or - // positive iff LHS >= RHS. - SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); - SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); - SDValue Res = - DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), - LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); + // SETCCE/SETCCCARRY. The SETCCE/SETCCCARRY operation is essentially + // looking at the high part of the result of LHS - RHS. It is negative + // iff LHS < RHS. It is zero or positive iff LHS >= RHS. + EVT LoVT = LHSLo.getValueType(); + SDVTList VTList = DAG.getVTList( + LoVT, HasSETCCCARRY ? getSetCCResultType(LoVT) : MVT::Glue); + SDValue LowCmp = DAG.getNode(HasSETCCCARRY ? ISD::USUBO : ISD::SUBC, dl, + VTList, LHSLo, RHSLo); + SDValue Res = DAG.getNode(HasSETCCCARRY ? 
ISD::SETCCCARRY : ISD::SETCCE, dl, + getSetCCResultType(HiVT), LHSHi, RHSHi, + LowCmp.getValue(1), DAG.getCondCode(CCCode)); NewLHS = Res; NewRHS = SDValue(); return; } - NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ, false, - DagCombineInfo, dl); + NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ, + false, DagCombineInfo, dl); if (!NewLHS.getNode()) - NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), - LHSHi, RHSHi, ISD::SETEQ); - NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), - NewLHS, LoCmp, HiCmp); + NewLHS = + DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ); + NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp); NewRHS = SDValue(); } @@ -3103,8 +3107,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, - DAG.getCondCode(CCCode)), 0); + return SDValue( + DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { @@ -3125,6 +3129,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { LowCmp.getValue(1), Cond); } +SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue Carry = N->getOperand(2); + SDValue Cond = N->getOperand(3); + SDLoc dl = SDLoc(N); + + SDValue LHSLo, LHSHi, RHSLo, RHSHi; + GetExpandedInteger(LHS, LHSLo, LHSHi); + GetExpandedInteger(RHS, RHSLo, RHSHi); + + // Expand to a SUBE for the low part and a smaller SETCCCARRY for the high. + SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType()); + SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry); + return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi, + LowCmp.getValue(1), Cond); +} + SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4c3b514856b7..8e999188d8e1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -381,6 +381,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ExpandIntOp_SELECT_CC(SDNode *N); SDValue ExpandIntOp_SETCC(SDNode *N); SDValue ExpandIntOp_SETCCE(SDNode *N); + SDValue ExpandIntOp_SETCCCARRY(SDNode *N); SDValue ExpandIntOp_Shift(SDNode *N); SDValue ExpandIntOp_SINT_TO_FP(SDNode *N); SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 4f4025d8ae6a..579112c9bfc8 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -226,6 +226,7 @@ class ScheduleDAGRRList : public ScheduleDAGSDNodes { void UnscheduleNodeBottomUp(SUnit*); void RestoreHazardCheckerBottomUp(); void BacktrackBottomUp(SUnit*, SUnit*); + SUnit *TryUnfoldSU(SUnit *); SUnit *CopyAndMoveSuccessors(SUnit*); void InsertCopiesAndMoveSuccs(SUnit*, unsigned, const TargetRegisterClass*, @@ -780,7 +781,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } /// CapturePred - This does the opposite of ReleasePred. 
Since SU is being -/// unscheduled, incrcease the succ left count of its predecessors. Remove +/// unscheduled, increase the succ left count of its predecessors. Remove /// them from AvailableQueue if necessary. void ScheduleDAGRRList::CapturePred(SDep *PredEdge) { SUnit *PredSU = PredEdge->getSUnit(); @@ -934,6 +935,146 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) { return false; } +/// TryUnfold - Attempt to unfold +SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) { + SDNode *N = SU->getNode(); + // Use while over if to ease fall through. + SmallVector NewNodes; + if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + return nullptr; + + // unfolding an x86 DEC64m operation results in store, dec, load which + // can't be handled here so quit + if (NewNodes.size() == 3) + return nullptr; + + assert(NewNodes.size() == 2 && "Expected a load folding node!"); + + N = NewNodes[1]; + SDNode *LoadNode = NewNodes[0]; + unsigned NumVals = N->getNumValues(); + unsigned OldNumVals = SU->getNode()->getNumValues(); + + // LoadNode may already exist. This can happen when there is another + // load from the same location and producing the same type of value + // but it has different alignment or volatileness. + bool isNewLoad = true; + SUnit *LoadSU; + if (LoadNode->getNodeId() != -1) { + LoadSU = &SUnits[LoadNode->getNodeId()]; + // If LoadSU has already been scheduled, we should clone it but + // this would negate the benefit to unfolding so just return SU. + if (LoadSU->isScheduled) + return SU; + isNewLoad = false; + } else { + LoadSU = CreateNewSUnit(LoadNode); + LoadNode->setNodeId(LoadSU->NodeNum); + + InitNumRegDefsLeft(LoadSU); + computeLatency(LoadSU); + } + + DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); + + // Now that we are committed to unfolding replace DAG Uses. + for (unsigned i = 0; i != NumVals; ++i) + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); + DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1), + SDValue(LoadNode, 1)); + + SUnit *NewSU = CreateNewSUnit(N); + assert(N->getNodeId() == -1 && "Node already inserted!"); + N->setNodeId(NewSU->NodeNum); + + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { + if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { + NewSU->isTwoAddress = true; + break; + } + } + if (MCID.isCommutable()) + NewSU->isCommutable = true; + + InitNumRegDefsLeft(NewSU); + computeLatency(NewSU); + + // Record all the edges to and from the old SU, by category. + SmallVector ChainPreds; + SmallVector ChainSuccs; + SmallVector LoadPreds; + SmallVector NodePreds; + SmallVector NodeSuccs; + for (SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) + ChainPreds.push_back(Pred); + else if (isOperandOf(Pred.getSUnit(), LoadNode)) + LoadPreds.push_back(Pred); + else + NodePreds.push_back(Pred); + } + for (SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) + ChainSuccs.push_back(Succ); + else + NodeSuccs.push_back(Succ); + } + + // Now assign edges to the newly-created nodes. 
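+  // Each category goes to the node that now owns it: chain and load-address
+  // edges move to LoadSU (when the load is new), register-data edges move to
+  // the unfolded operation NewSU, and each successor's edge is removed and
+  // re-added pointing at the node that now produces its value.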
+ for (const SDep &Pred : ChainPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : LoadPreds) { + RemovePred(SU, Pred); + if (isNewLoad) + AddPred(LoadSU, Pred); + } + for (const SDep &Pred : NodePreds) { + RemovePred(SU, Pred); + AddPred(NewSU, Pred); + } + for (SDep D : NodeSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + D.setSUnit(NewSU); + AddPred(SuccDep, D); + // Balance register pressure. + if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled && + !D.isCtrl() && NewSU->NumRegDefsLeft > 0) + --NewSU->NumRegDefsLeft; + } + for (SDep D : ChainSuccs) { + SUnit *SuccDep = D.getSUnit(); + D.setSUnit(SU); + RemovePred(SuccDep, D); + if (isNewLoad) { + D.setSUnit(LoadSU); + AddPred(SuccDep, D); + } + } + + // Add a data dependency to reflect that NewSU reads the value defined + // by LoadSU. + SDep D(LoadSU, SDep::Data, 0); + D.setLatency(LoadSU->Latency); + AddPred(NewSU, D); + + if (isNewLoad) + AvailableQueue->addNode(LoadSU); + AvailableQueue->addNode(NewSU); + + ++NumUnfolds; + + if (NewSU->NumSuccsLeft == 0) + NewSU->isAvailable = true; + + return NewSU; +} + /// CopyAndMoveSuccessors - Clone the specified node and move its scheduled /// successors to the newly created node. SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { @@ -959,135 +1100,16 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { return nullptr; } + // If possible unfold instruction. if (TryUnfold) { - SmallVector NewNodes; - if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) + SUnit *UnfoldSU = TryUnfoldSU(SU); + if (!UnfoldSU) return nullptr; - - // unfolding an x86 DEC64m operation results in store, dec, load which - // can't be handled here so quit - if (NewNodes.size() == 3) - return nullptr; - - DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); - assert(NewNodes.size() == 2 && "Expected a load folding node!"); - - N = NewNodes[1]; - SDNode *LoadNode = NewNodes[0]; - unsigned NumVals = N->getNumValues(); - unsigned OldNumVals = SU->getNode()->getNumValues(); - for (unsigned i = 0; i != NumVals; ++i) - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i)); - DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1), - SDValue(LoadNode, 1)); - - // LoadNode may already exist. This can happen when there is another - // load from the same location and producing the same type of value - // but it has different alignment or volatileness. - bool isNewLoad = true; - SUnit *LoadSU; - if (LoadNode->getNodeId() != -1) { - LoadSU = &SUnits[LoadNode->getNodeId()]; - isNewLoad = false; - } else { - LoadSU = CreateNewSUnit(LoadNode); - LoadNode->setNodeId(LoadSU->NodeNum); - - InitNumRegDefsLeft(LoadSU); - computeLatency(LoadSU); - } - - SUnit *NewSU = CreateNewSUnit(N); - assert(N->getNodeId() == -1 && "Node already inserted!"); - N->setNodeId(NewSU->NodeNum); - - const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); - for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { - if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { - NewSU->isTwoAddress = true; - break; - } - } - if (MCID.isCommutable()) - NewSU->isCommutable = true; - - InitNumRegDefsLeft(NewSU); - computeLatency(NewSU); - - // Record all the edges to and from the old SU, by category. 
- SmallVector ChainPreds; - SmallVector ChainSuccs; - SmallVector LoadPreds; - SmallVector NodePreds; - SmallVector NodeSuccs; - for (SDep &Pred : SU->Preds) { - if (Pred.isCtrl()) - ChainPreds.push_back(Pred); - else if (isOperandOf(Pred.getSUnit(), LoadNode)) - LoadPreds.push_back(Pred); - else - NodePreds.push_back(Pred); - } - for (SDep &Succ : SU->Succs) { - if (Succ.isCtrl()) - ChainSuccs.push_back(Succ); - else - NodeSuccs.push_back(Succ); - } - - // Now assign edges to the newly-created nodes. - for (const SDep &Pred : ChainPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : LoadPreds) { - RemovePred(SU, Pred); - if (isNewLoad) - AddPred(LoadSU, Pred); - } - for (const SDep &Pred : NodePreds) { - RemovePred(SU, Pred); - AddPred(NewSU, Pred); - } - for (SDep D : NodeSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - D.setSUnit(NewSU); - AddPred(SuccDep, D); - // Balance register pressure. - if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled - && !D.isCtrl() && NewSU->NumRegDefsLeft > 0) - --NewSU->NumRegDefsLeft; - } - for (SDep D : ChainSuccs) { - SUnit *SuccDep = D.getSUnit(); - D.setSUnit(SU); - RemovePred(SuccDep, D); - if (isNewLoad) { - D.setSUnit(LoadSU); - AddPred(SuccDep, D); - } - } - - // Add a data dependency to reflect that NewSU reads the value defined - // by LoadSU. - SDep D(LoadSU, SDep::Data, 0); - D.setLatency(LoadSU->Latency); - AddPred(NewSU, D); - - if (isNewLoad) - AvailableQueue->addNode(LoadSU); - AvailableQueue->addNode(NewSU); - - ++NumUnfolds; - - if (NewSU->NumSuccsLeft == 0) { - NewSU->isAvailable = true; - return NewSU; - } - SU = NewSU; + SU = UnfoldSU; + N = SU->getNode(); + // If this can be scheduled don't bother duplicating and just return + if (SU->NumSuccsLeft == 0) + return SU; } DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index c37d7080f2c5..0dbd9e846aa6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -214,6 +214,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; case ISD::SETCCE: return "setcce"; + case ISD::SETCCCARRY: return "setcccarry"; case ISD::SELECT: return "select"; case ISD::VSELECT: return "vselect"; case ISD::SELECT_CC: return "select_cc"; diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 0def5ae6d0d0..900c0318b179 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -842,9 +842,10 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { initActions(); // Perform these initializations only once. 
- MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxLoadsPerMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; @@ -926,6 +927,7 @@ void TargetLoweringBase::initActions() { // ADDCARRY operations default to expand setOperationAction(ISD::ADDCARRY, VT, Expand); setOperationAction(ISD::SUBCARRY, VT, Expand); + setOperationAction(ISD::SETCCCARRY, VT, Expand); // These default to Expand so they will be expanded to CTLZ/CTTZ by default. setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 83348058eca9..72d5e995ac22 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -261,9 +261,9 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. -TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) +TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false), - AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false), + AddingMachinePasses(false), TM(&TM), Impl(nullptr), Initialized(false), DisableVerify(false), EnableTailMerge(true), RequireCodeGenSCCOrder(false) { @@ -282,9 +282,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) substitutePass(&PostRAMachineLICMID, &MachineLICMID); if (StringRef(PrintMachineInstrs.getValue()).equals("")) - TM->Options.PrintMachineCode = true; + TM.Options.PrintMachineCode = true; - if (TM->Options.EnableIPRA) + if (TM.Options.EnableIPRA) setRequiresCodeGenSCCOrder(); } @@ -310,7 +310,7 @@ void TargetPassConfig::insertPass(AnalysisID TargetPassID, /// /// Targets may override this to extend TargetPassConfig. 
 TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new TargetPassConfig(this, PM);
+  return new TargetPassConfig(*this, PM);
 }
 
 TargetPassConfig::TargetPassConfig()
@@ -430,7 +430,12 @@ void TargetPassConfig::addPrintPass(const std::string &Banner) {
 }
 
 void TargetPassConfig::addVerifyPass(const std::string &Banner) {
-  if (VerifyMachineCode)
+  bool Verify = VerifyMachineCode;
+#ifdef EXPENSIVE_CHECKS
+  if (VerifyMachineCode == cl::BOU_UNSET)
+    Verify = TM->isMachineVerifierClean();
+#endif
+  if (Verify)
     PM->add(createMachineVerifierPass(Banner));
 }
 
diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
index 4c78caf03477..d058f4864975 100644
--- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
@@ -46,7 +46,7 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) {
   }
 #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
   SYMBOL_RECORD(EnumVal, EnumVal, AliasName)
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
   }
 
   if (auto EC = Callbacks.visitSymbolEnd(Record))
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 705b548141b0..f0debd9e9702 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -71,7 +71,7 @@ static Error visitMemberRecord(CVMemberRecord &Record,
   MEMBER_RECORD(EnumVal, EnumVal, AliasName)
 #define TYPE_RECORD(EnumName, EnumVal, Name)
 #define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
   }
 
   if (auto EC = Callbacks.visitMemberEnd(Record))
@@ -155,7 +155,7 @@ Error CVTypeVisitor::finishVisitation(CVType &Record) {
   TYPE_RECORD(EnumVal, EnumVal, AliasName)
 #define MEMBER_RECORD(EnumName, EnumVal, Name)
 #define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
   }
 
   if (auto EC = Callbacks.visitTypeEnd(Record))
diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp
index 0441110c85ef..01d8ccf2d31e 100644
--- a/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -20,13 +20,13 @@ using namespace codeview;
 
 static const EnumEntry<SymbolKind> SymbolTypeNames[] = {
 #define CV_SYMBOL(enum, val) {#enum, enum},
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
 #undef CV_SYMBOL
 };
 
 static const EnumEntry<TypeLeafKind> TypeLeafNames[] = {
 #define CV_TYPE(name, val) {#name, name},
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
 #undef CV_TYPE
 };
 
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index 2f5a7d256c60..3d49a7198d1a 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -41,7 +41,7 @@ class CVSymbolDumperImpl : public SymbolVisitorCallbacks {
 #define SYMBOL_RECORD(EnumName, EnumVal, Name) \
   Error visitKnownRecord(CVSymbol &CVR, Name &Record) override;
 #define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
 
   Error visitSymbolBegin(CVSymbol &Record) override;
   Error visitSymbolEnd(CVSymbol &Record) override;
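The .def renames above (CVSymbolTypes.def -> CodeViewSymbols.def and TypeRecords.def -> CodeViewTypes.def) must be mirrored at every inclusion site, because these headers are X-macro lists: each consumer defines the record macros it cares about and then textually includes the .def file. A minimal sketch of such a consumer, assuming only the CV_SYMBOL(enum, val) entry shape visible in the EnumTables.cpp hunk above:

    // Expands the renamed X-macro header into a table of kind names; any
    // consumer still including the old CVSymbolTypes.def path now fails to
    // build, which makes the rename straightforward to verify mechanically.
    static const char *SymbolKindNames[] = {
    #define CV_SYMBOL(enum, val) #enum,
    #include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
    #undef CV_SYMBOL
    };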
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 84f52a055815..04b0384d8190 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -26,7 +26,7 @@ using namespace llvm::codeview; static const EnumEntry LeafTypeNames[] = { #define CV_TYPE(enum, val) {#enum, enum}, -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" }; #define ENUM_ENTRY(enum_class, enum) \ @@ -155,7 +155,7 @@ static StringRef getLeafTypeName(TypeLeafKind LT) { #define TYPE_RECORD(ename, value, name) \ case ename: \ return #name; -#include "llvm/DebugInfo/CodeView/TypeRecords.def" +#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" default: break; } diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 5ed55ce4c0dc..1be156d6ea9b 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -84,8 +84,12 @@ static void dumpAccelSection(raw_ostream &OS, StringRef Name, Accel.dump(OS); } -void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, - bool SummarizeTypes) { +void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts){ + + DIDumpType DumpType = DumpOpts.DumpType; + bool DumpEH = DumpOpts.DumpEH; + bool SummarizeTypes = DumpOpts.SummarizeTypes; + if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) { OS << ".debug_abbrev contents:\n"; getDebugAbbrev()->dump(OS); diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index 2a1d12e82390..7c6069652da6 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -79,6 +79,7 @@ Error InfoStream::reload() { break; case uint32_t(PdbRaw_FeatureSig::MinimalDebugInfo): Features |= PdbFeatureMinimalDebugInfo; + break; default: continue; } diff --git a/lib/DebugInfo/PDB/PDBContext.cpp b/lib/DebugInfo/PDB/PDBContext.cpp index 94b81ecf561e..f6b6b951ebe1 100644 --- a/lib/DebugInfo/PDB/PDBContext.cpp +++ b/lib/DebugInfo/PDB/PDBContext.cpp @@ -29,8 +29,7 @@ PDBContext::PDBContext(const COFFObjectFile &Object, Session->setLoadAddress(ImageBase.get()); } -void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, - bool SummarizeTypes) {} +void PDBContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts){} DILineInfo PDBContext::getLineInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier) { diff --git a/lib/Fuzzer/test/dump_coverage.test b/lib/Fuzzer/test/dump_coverage.test index 8acc8304fc60..bd85ed718e19 100644 --- a/lib/Fuzzer/test/dump_coverage.test +++ b/lib/Fuzzer/test/dump_coverage.test @@ -4,11 +4,11 @@ RUN: sancov -covered-functions LLVMFuzzer-NullDerefTest* %t_workdir/*.sancov | F RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV -CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written +CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov: {{.*}} PCs written SANCOV: LLVMFuzzerTestOneInput -DSO: SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written -DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov {{.*}} PCs written -DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov {{.*}} PCs written +DSO: 
SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov: {{.*}} PCs written +DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov: {{.*}} PCs written +DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov: {{.*}} PCs written NOCOV-NOT: SanitizerCoverage: {{.*}} PCs written diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 19b7c3027232..a76c944f0005 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -1006,6 +1006,10 @@ AttributeList AttributeList::get(LLVMContext &C, for (AttributeList List : Attrs) MaxSize = std::max(MaxSize, List.getNumAttrSets()); + // If every list was empty, there is no point in merging the lists. + if (MaxSize == 0) + return AttributeList(); + SmallVector NewAttrSets(MaxSize); for (unsigned I = 0; I < MaxSize; ++I) { AttrBuilder CurBuilder; @@ -1033,24 +1037,11 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, return addAttributes(C, Index, B); } -AttributeList AttributeList::addAttribute(LLVMContext &C, - ArrayRef Indices, +AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, Attribute A) const { - assert(std::is_sorted(Indices.begin(), Indices.end())); - - SmallVector AttrSets(this->begin(), this->end()); - unsigned MaxIndex = attrIdxToArrayIdx(Indices.back()); - if (MaxIndex >= AttrSets.size()) - AttrSets.resize(MaxIndex + 1); - - for (unsigned Index : Indices) { - Index = attrIdxToArrayIdx(Index); - AttrBuilder B(AttrSets[Index]); - B.addAttribute(A); - AttrSets[Index] = AttributeSet::get(C, B); - } - - return getImpl(C, AttrSets); + AttrBuilder B; + B.addAttribute(A); + return addAttributes(C, Index, B); } AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, @@ -1082,6 +1073,26 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, return getImpl(C, AttrSets); } +AttributeList AttributeList::addParamAttribute(LLVMContext &C, + ArrayRef ArgNos, + Attribute A) const { + assert(std::is_sorted(ArgNos.begin(), ArgNos.end())); + + SmallVector AttrSets(this->begin(), this->end()); + unsigned MaxIndex = attrIdxToArrayIdx(ArgNos.back() + FirstArgIndex); + if (MaxIndex >= AttrSets.size()) + AttrSets.resize(MaxIndex + 1); + + for (unsigned ArgNo : ArgNos) { + unsigned Index = attrIdxToArrayIdx(ArgNo + FirstArgIndex); + AttrBuilder B(AttrSets[Index]); + B.addAttribute(A); + AttrSets[Index] = AttributeSet::get(C, B); + } + + return getImpl(C, AttrSets); +} + AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Kind) const { if (!hasAttribute(Index, Kind)) return *this; diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 39de4b0a97fa..fc68c0e3cad9 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -118,15 +118,13 @@ unsigned Argument::getParamAlignment() const { uint64_t Argument::getDereferenceableBytes() const { assert(getType()->isPointerTy() && "Only pointers have dereferenceable bytes"); - return getParent()->getDereferenceableBytes(getArgNo() + - AttributeList::FirstArgIndex); + return getParent()->getParamDereferenceableBytes(getArgNo()); } uint64_t Argument::getDereferenceableOrNullBytes() const { assert(getType()->isPointerTy() && "Only pointers have dereferenceable bytes"); - return getParent()->getDereferenceableOrNullBytes( - getArgNo() + AttributeList::FirstArgIndex); + return getParent()->getParamDereferenceableOrNullBytes(getArgNo()); } bool Argument::hasNestAttr() const { @@ -169,21 +167,20 @@ bool Argument::onlyReadsMemory() const { void 
Argument::addAttrs(AttrBuilder &B) { AttributeList AL = getParent()->getAttributes(); - AL = AL.addAttributes(Parent->getContext(), - getArgNo() + AttributeList::FirstArgIndex, B); + AL = AL.addParamAttributes(Parent->getContext(), getArgNo(), B); getParent()->setAttributes(AL); } void Argument::addAttr(Attribute::AttrKind Kind) { - getParent()->addAttribute(getArgNo() + AttributeList::FirstArgIndex, Kind); + getParent()->addParamAttr(getArgNo(), Kind); } void Argument::addAttr(Attribute Attr) { - getParent()->addAttribute(getArgNo() + AttributeList::FirstArgIndex, Attr); + getParent()->addParamAttr(getArgNo(), Attr); } void Argument::removeAttr(Attribute::AttrKind Kind) { - getParent()->removeAttribute(getArgNo() + AttributeList::FirstArgIndex, Kind); + getParent()->removeParamAttr(getArgNo(), Kind); } bool Argument::hasAttribute(Attribute::AttrKind Kind) const { @@ -365,6 +362,24 @@ void Function::addAttributes(unsigned i, const AttrBuilder &Attrs) { setAttributes(PAL); } +void Function::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::addParamAttr(unsigned ArgNo, Attribute Attr) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); + setAttributes(PAL); +} + +void Function::addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttributes(getContext(), ArgNo, Attrs); + setAttributes(PAL); +} + void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) { AttributeList PAL = getAttributes(); PAL = PAL.removeAttribute(getContext(), i, Kind); @@ -383,18 +398,49 @@ void Function::removeAttributes(unsigned i, const AttrBuilder &Attrs) { setAttributes(PAL); } +void Function::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttributes(getContext(), ArgNo, Attrs); + setAttributes(PAL); +} + void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) { AttributeList PAL = getAttributes(); PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes); setAttributes(PAL); } +void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableParamAttr(getContext(), ArgNo, Bytes); + setAttributes(PAL); +} + void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { AttributeList PAL = getAttributes(); PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes); setAttributes(PAL); } +void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo, + uint64_t Bytes) { + AttributeList PAL = getAttributes(); + PAL = PAL.addDereferenceableOrNullParamAttr(getContext(), ArgNo, Bytes); + setAttributes(PAL); +} + const std::string &Function::getGC() const { assert(hasGC() && "Function has no collector"); return getContext().getGC(*this); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index d7baa9ebc223..46c27331ff95 100644 --- a/lib/IR/Instructions.cpp +++ 
b/lib/IR/Instructions.cpp @@ -393,7 +393,17 @@ void CallInst::addAttribute(unsigned i, Attribute Attr) { } void CallInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void CallInst::addParamAttr(unsigned ArgNo, Attribute Attr) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr); + setAttributes(PAL); } void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) { @@ -409,7 +419,17 @@ void CallInst::removeAttribute(unsigned i, StringRef Kind) { } void CallInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); +} + +void CallInst::removeParamAttr(unsigned ArgNo, StringRef Kind) { + assert(ArgNo < getNumArgOperands() && "Out of bounds"); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) { @@ -808,7 +828,9 @@ void InvokeInst::addAttribute(unsigned i, Attribute Attr) { } void InvokeInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - addAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + AttributeList PAL = getAttributes(); + PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) { @@ -824,7 +846,9 @@ void InvokeInst::removeAttribute(unsigned i, StringRef Kind) { } void InvokeInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) { - removeAttribute(ArgNo + AttributeList::FirstArgIndex, Kind); + AttributeList PAL = getAttributes(); + PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind); + setAttributes(PAL); } void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) { diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 668667a53562..f9c41f5c9744 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -136,7 +136,8 @@ createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) { Conf.CodeModel, Conf.CGOptLevel)); } -static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel) { +static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel, + bool IsThinLTO) { PassBuilder PB(TM); AAManager AA; @@ -180,7 +181,10 @@ static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel) { break; } - MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */); + if (IsThinLTO) + MPM = PB.buildThinLTODefaultPipeline(OL, false /* DebugLogging */); + else + MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */); MPM.run(Mod, MAM); // FIXME (davide): verify the output. 
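The IsThinLTO parameter threaded into runNewPMPasses above lets the new pass manager path pick between the Thin and full LTO default pipelines instead of rejecting ThinLTO outright (see the report_fatal_error removed in the next hunk). For orientation, a condensed sketch of how the function drives the new pass manager end to end; the analysis-manager setup mirrors the unchanged lines elided from this hunk:

    // Standard new-PM boilerplate: build the four analysis managers, let the
    // PassBuilder register its analyses, then cross-wire the proxies.
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    // The only LTO-mode-specific step is which default pipeline is built.
    ModulePassManager MPM =
        IsThinLTO ? PB.buildThinLTODefaultPipeline(OL, /*DebugLogging=*/false)
                  : PB.buildLTODefaultPipeline(OL, /*DebugLogging=*/false);
    MPM.run(Mod, MAM);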
@@ -258,17 +262,12 @@ static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) { - // There's still no ThinLTO pipeline hooked up in the new pass manager, - // once there is one, we can just remove this. - if (LTOUseNewPM && IsThinLTO) - report_fatal_error("ThinLTO not supported with the new PM yet!"); - // FIXME: Plumb the combined index into the new pass manager. if (!Conf.OptPipeline.empty()) runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline, Conf.DisableVerify); else if (LTOUseNewPM) - runNewPMPasses(Mod, TM, Conf.OptLevel); + runNewPMPasses(Mod, TM, Conf.OptLevel, IsThinLTO); else runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary); return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod); diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp index a0a0ef312276..6c9a4f9f982d 100644 --- a/lib/MC/MCCodeView.cpp +++ b/lib/MC/MCCodeView.cpp @@ -12,11 +12,11 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCCodeView.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCValue.h" diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index bfb8875f47d4..084159a61f55 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -3269,7 +3269,6 @@ void MachOBindEntry::moveNext() { if (ImmValue) { SignExtended = MachO::BIND_OPCODE_MASK | ImmValue; Ordinal = SignExtended; - LibraryOrdinalSet = true; if (Ordinal < MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP) { *E = malformedError("for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown " "special ordinal: " + Twine((int)Ordinal) + " for opcode at: " @@ -3279,6 +3278,7 @@ void MachOBindEntry::moveNext() { } } else Ordinal = 0; + LibraryOrdinalSet = true; DEBUG_WITH_TYPE( "mach-o-bind", dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: " diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index b52563469094..e46d38e466a0 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -12,20 +12,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/WindowsResource.h" -#include "llvm/Object/Error.h" +#include "llvm/Support/COFF.h" +#include #include namespace llvm { namespace object { -static const size_t ResourceMagicSize = 16; - -static const size_t NullEntrySize = 16; - #define RETURN_IF_ERROR(X) \ if (auto EC = X) \ return EC; +const uint32_t MIN_HEADER_SIZE = 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t); + +static const size_t ResourceMagicSize = 16; + +static const size_t NullEntrySize = 16; + WindowsResource::WindowsResource(MemoryBufferRef Source) : Binary(Binary::ID_WinRes, Source) { size_t LeadingSize = ResourceMagicSize + NullEntrySize; @@ -33,8 +36,6 @@ WindowsResource::WindowsResource(MemoryBufferRef Source) support::little); } -WindowsResource::~WindowsResource() = default; - Expected> WindowsResource::createWindowsResource(MemoryBufferRef Source) { if (Source.getBufferSize() < ResourceMagicSize + NullEntrySize) @@ -72,19 +73,150 @@ Error ResourceEntryRef::moveNext(bool 
&End) { return Error::success(); } +static Error readStringOrId(BinaryStreamReader &Reader, uint16_t &ID, + ArrayRef &Str, bool &IsString) { + uint16_t IDFlag; + RETURN_IF_ERROR(Reader.readInteger(IDFlag)); + IsString = IDFlag != 0xffff; + + if (IsString) { + Reader.setOffset( + Reader.getOffset() - + sizeof(uint16_t)); // Re-read the bytes which we used to check the flag. + RETURN_IF_ERROR(Reader.readWideString(Str)); + } else + RETURN_IF_ERROR(Reader.readInteger(ID)); + + return Error::success(); +} + Error ResourceEntryRef::loadNext() { uint32_t DataSize; RETURN_IF_ERROR(Reader.readInteger(DataSize)); uint32_t HeaderSize; RETURN_IF_ERROR(Reader.readInteger(HeaderSize)); - // The data and header size ints are themselves part of the header, so we must - // subtract them from the size. - RETURN_IF_ERROR( - Reader.readStreamRef(HeaderBytes, HeaderSize - 2 * sizeof(uint32_t))); - RETURN_IF_ERROR(Reader.readStreamRef(DataBytes, DataSize)); + + if (HeaderSize < MIN_HEADER_SIZE) + return make_error("Header size is too small.", + object_error::parse_failed); + + RETURN_IF_ERROR(readStringOrId(Reader, TypeID, Type, IsStringType)); + + RETURN_IF_ERROR(readStringOrId(Reader, NameID, Name, IsStringName)); + RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + + RETURN_IF_ERROR(Reader.readObject(Suffix)); + + RETURN_IF_ERROR(Reader.readArray(Data, DataSize)); + + RETURN_IF_ERROR(Reader.padToAlignment(sizeof(uint32_t))); + return Error::success(); } +WindowsResourceParser::WindowsResourceParser() {} + +Error WindowsResourceParser::parse(WindowsResource *WR) { + auto EntryOrErr = WR->getHeadEntry(); + if (!EntryOrErr) + return EntryOrErr.takeError(); + + ResourceEntryRef Entry = EntryOrErr.get(); + bool End = false; + + while (!End) { + + Root.addEntry(Entry); + + RETURN_IF_ERROR(Entry.moveNext(End)); + } + + return Error::success(); +} + +void WindowsResourceParser::printTree() const { + ScopedPrinter Writer(outs()); + Root.print(Writer, "Resource Tree"); +} + +void WindowsResourceParser::TreeNode::addEntry(const ResourceEntryRef &Entry) { + TreeNode &TypeNode = addTypeNode(Entry); + TreeNode &NameNode = TypeNode.addNameNode(Entry); + NameNode.addLanguageNode(Entry); +} + +WindowsResourceParser::TreeNode::TreeNode(ArrayRef NameRef) + : Name(NameRef) {} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addTypeNode(const ResourceEntryRef &Entry) { + if (Entry.checkTypeString()) + return addChild(Entry.getTypeString()); + else + return addChild(Entry.getTypeID()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addNameNode(const ResourceEntryRef &Entry) { + if (Entry.checkNameString()) + return addChild(Entry.getNameString()); + else + return addChild(Entry.getNameID()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addLanguageNode( + const ResourceEntryRef &Entry) { + return addChild(Entry.getLanguage()); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addChild(uint32_t ID) { + auto Child = IDChildren.find(ID); + if (Child == IDChildren.end()) { + auto NewChild = llvm::make_unique(ID); + WindowsResourceParser::TreeNode &Node = *NewChild; + IDChildren.emplace(ID, std::move(NewChild)); + return Node; + } else + return *(Child->second); +} + +WindowsResourceParser::TreeNode & +WindowsResourceParser::TreeNode::addChild(ArrayRef NameRef) { + std::string NameString; + ArrayRef CorrectedName; + std::vector EndianCorrectedName; + if (llvm::sys::IsBigEndianHost) { + 
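+    // The on-disk resource name is little-endian UTF-16; on a big-endian
+    // host the units read above are byte-swapped, so prepend a swapped BOM
+    // and let convertUTF16ToUTF8String detect and undo the swap.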
EndianCorrectedName.resize(NameRef.size() + 1); + std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1); + EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED; + CorrectedName = makeArrayRef(EndianCorrectedName); + } else + CorrectedName = NameRef; + llvm::convertUTF16ToUTF8String(CorrectedName, NameString); + + auto Child = StringChildren.find(NameString); + if (Child == StringChildren.end()) { + auto NewChild = llvm::make_unique(NameRef); + WindowsResourceParser::TreeNode &Node = *NewChild; + StringChildren.emplace(NameString, std::move(NewChild)); + return Node; + } else + return *(Child->second); +} + +void WindowsResourceParser::TreeNode::print(ScopedPrinter &Writer, + StringRef Name) const { + ListScope NodeScope(Writer, Name); + for (auto const &Child : StringChildren) { + Child.second->print(Writer, Child.first); + } + for (auto const &Child : IDChildren) { + Child.second->print(Writer, to_string(Child.first)); + } +} + } // namespace object } // namespace llvm diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt index 37f8fd7bce1a..7af0b9c194e6 100644 --- a/lib/ObjectYAML/CMakeLists.txt +++ b/lib/ObjectYAML/CMakeLists.txt @@ -1,4 +1,7 @@ add_llvm_library(LLVMObjectYAML + CodeViewYAMLTypes.cpp + CodeViewYAMLSymbols.cpp + CodeViewYAMLDebugSections.cpp COFFYAML.cpp DWARFEmitter.cpp DWARFVisitor.cpp diff --git a/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp new file mode 100644 index 000000000000..f652ff57f30d --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp @@ -0,0 +1,127 @@ +//===- CodeViewYAMLDebugSections.cpp - CodeView YAMLIO debug sections -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h" + +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/SymbolRecord.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceFileChecksumEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceColumnEntry) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineBlock) +LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineInfo) +LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeSite) +LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo) +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) + +LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false) +LLVM_YAML_DECLARE_ENUM_TRAITS(FileChecksumKind) +LLVM_YAML_DECLARE_BITSET_TRAITS(LineFlags) + +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceColumnEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceFileChecksumEntry) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineInfo) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::SourceLineBlock) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeInfo) +LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::InlineeSite) + +void ScalarBitSetTraits::bitset(IO &io, LineFlags &Flags) { + io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns); + io.enumFallback(Flags); +} + +void ScalarEnumerationTraits::enumeration( + IO &io, FileChecksumKind &Kind) { + io.enumCase(Kind, "None", FileChecksumKind::None); + io.enumCase(Kind, "MD5", FileChecksumKind::MD5); + io.enumCase(Kind, "SHA1", FileChecksumKind::SHA1); + io.enumCase(Kind, "SHA256", FileChecksumKind::SHA256); +} + +void ScalarTraits::output(const HexFormattedString &Value, + void *ctx, raw_ostream &Out) { + StringRef Bytes(reinterpret_cast(Value.Bytes.data()), + Value.Bytes.size()); + Out << toHex(Bytes); +} + +StringRef ScalarTraits::input(StringRef Scalar, void *ctxt, + HexFormattedString &Value) { + std::string H = fromHex(Scalar); + Value.Bytes.assign(H.begin(), H.end()); + return StringRef(); +} + +void MappingTraits::mapping(IO &IO, SourceLineEntry &Obj) { + IO.mapRequired("Offset", Obj.Offset); + IO.mapRequired("LineStart", Obj.LineStart); + IO.mapRequired("IsStatement", Obj.IsStatement); + IO.mapRequired("EndDelta", Obj.EndDelta); +} + +void MappingTraits::mapping(IO &IO, SourceColumnEntry &Obj) { + IO.mapRequired("StartColumn", Obj.StartColumn); + IO.mapRequired("EndColumn", Obj.EndColumn); +} + +void MappingTraits::mapping(IO &IO, SourceLineBlock &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("Lines", Obj.Lines); + IO.mapRequired("Columns", Obj.Columns); +} + +void MappingTraits::mapping( + IO &IO, SourceFileChecksumEntry &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("Kind", Obj.Kind); + IO.mapRequired("Checksum", Obj.ChecksumBytes); +} + +void MappingTraits::mapping(IO &IO, SourceLineInfo &Obj) { + IO.mapRequired("CodeSize", Obj.CodeSize); + + IO.mapRequired("Flags", Obj.Flags); + IO.mapRequired("RelocOffset", Obj.RelocOffset); + IO.mapRequired("RelocSegment", Obj.RelocSegment); + IO.mapRequired("Blocks", Obj.Blocks); +} + +void MappingTraits::mapping(IO &IO, SourceFileInfo &Obj) { + 
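+  // Unlike the fragment mappings above, every key here is mapOptional: a
+  // SourceFileInfo may carry any subset of checksums, lines, and inlinees.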
IO.mapOptional("Checksums", Obj.FileChecksums); + IO.mapOptional("Lines", Obj.LineFragments); + IO.mapOptional("InlineeLines", Obj.Inlinees); +} + +void MappingTraits::mapping(IO &IO, InlineeSite &Obj) { + IO.mapRequired("FileName", Obj.FileName); + IO.mapRequired("LineNum", Obj.SourceLineNum); + IO.mapRequired("Inlinee", Obj.Inlinee); + IO.mapOptional("ExtraFiles", Obj.ExtraFiles); +} + +void MappingTraits::mapping(IO &IO, InlineeInfo &Obj) { + IO.mapRequired("HasExtraFiles", Obj.HasExtraFiles); + IO.mapRequired("Sites", Obj.Sites); +} diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp new file mode 100644 index 000000000000..6e8bb5c7372c --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -0,0 +1,496 @@ +//===- CodeViewYAMLSymbols.cpp - CodeView YAMLIO Symbol implementation ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLSymbols.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" +#include "llvm/DebugInfo/CodeView/SymbolSerializer.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) + +// We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp +LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false) +LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false) + +LLVM_YAML_DECLARE_ENUM_TRAITS(SymbolKind) + +LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym2Flags) +LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym3Flags) +LLVM_YAML_DECLARE_BITSET_TRAITS(ExportFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(LocalSymFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(ProcSymFlags) +LLVM_YAML_DECLARE_BITSET_TRAITS(FrameProcedureOptions) +LLVM_YAML_DECLARE_ENUM_TRAITS(CPUType) +LLVM_YAML_DECLARE_ENUM_TRAITS(RegisterId) +LLVM_YAML_DECLARE_ENUM_TRAITS(TrampolineType) +LLVM_YAML_DECLARE_ENUM_TRAITS(ThunkOrdinal) + +void ScalarEnumerationTraits::enumeration(IO &io, + SymbolKind &Value) { + auto SymbolNames = getSymbolTypeNames(); + for (const auto &E : SymbolNames) + io.enumCase(Value, E.Name.str().c_str(), E.Value); +} + +void ScalarBitSetTraits::bitset(IO &io, + CompileSym2Flags &Flags) { + auto FlagNames = getCompileSym2FlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarBitSetTraits::bitset(IO &io, + CompileSym3Flags &Flags) { + auto FlagNames = getCompileSym3FlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarBitSetTraits::bitset(IO &io, ExportFlags &Flags) { + auto FlagNames = getExportSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void 
ScalarBitSetTraits::bitset(IO &io, LocalSymFlags &Flags) { + auto FlagNames = getLocalFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarBitSetTraits::bitset(IO &io, ProcSymFlags &Flags) { + auto FlagNames = getProcSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarBitSetTraits::bitset( + IO &io, FrameProcedureOptions &Flags) { + auto FlagNames = getFrameProcSymFlagNames(); + for (const auto &E : FlagNames) { + io.bitSetCase(Flags, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarEnumerationTraits::enumeration(IO &io, CPUType &Cpu) { + auto CpuNames = getCPUTypeNames(); + for (const auto &E : CpuNames) { + io.enumCase(Cpu, E.Name.str().c_str(), static_cast(E.Value)); + } +} + +void ScalarEnumerationTraits::enumeration(IO &io, RegisterId &Reg) { + auto RegNames = getRegisterNames(); + for (const auto &E : RegNames) { + io.enumCase(Reg, E.Name.str().c_str(), static_cast(E.Value)); + } + io.enumFallback(Reg); +} + +void ScalarEnumerationTraits::enumeration( + IO &io, TrampolineType &Tramp) { + auto TrampNames = getTrampolineNames(); + for (const auto &E : TrampNames) { + io.enumCase(Tramp, E.Name.str().c_str(), + static_cast(E.Value)); + } +} + +void ScalarEnumerationTraits::enumeration(IO &io, + ThunkOrdinal &Ord) { + auto ThunkNames = getThunkOrdinalNames(); + for (const auto &E : ThunkNames) { + io.enumCase(Ord, E.Name.str().c_str(), static_cast(E.Value)); + } +} + +namespace llvm { +namespace CodeViewYAML { +namespace detail { + +struct SymbolRecordBase { + codeview::SymbolKind Kind; + explicit SymbolRecordBase(codeview::SymbolKind K) : Kind(K) {} + + virtual ~SymbolRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual codeview::CVSymbol + toCodeViewSymbol(BumpPtrAllocator &Allocator) const = 0; + virtual Error fromCodeViewSymbol(codeview::CVSymbol Type) = 0; +}; + +template struct SymbolRecordImpl : public SymbolRecordBase { + explicit SymbolRecordImpl(codeview::SymbolKind K) + : SymbolRecordBase(K), Symbol(static_cast(K)) {} + + void map(yaml::IO &io) override; + + codeview::CVSymbol + toCodeViewSymbol(BumpPtrAllocator &Allocator) const override { + return SymbolSerializer::writeOneSymbol(Symbol, Allocator); + } + Error fromCodeViewSymbol(codeview::CVSymbol CVS) override { + return SymbolDeserializer::deserializeAs(CVS, Symbol); + } + + mutable T Symbol; +}; + +template <> void SymbolRecordImpl::map(IO &IO) {} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Parent", Symbol.Parent); + IO.mapRequired("End", Symbol.End); + IO.mapRequired("Next", Symbol.Next); + IO.mapRequired("Off", Symbol.Offset); + IO.mapRequired("Seg", Symbol.Segment); + IO.mapRequired("Len", Symbol.Length); + IO.mapRequired("Ordinal", Symbol.Thunk); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Size", Symbol.Size); + IO.mapRequired("ThunkOff", Symbol.ThunkOffset); + IO.mapRequired("TargetOff", Symbol.TargetOffset); + IO.mapRequired("ThunkSection", Symbol.ThunkSection); + IO.mapRequired("TargetSection", Symbol.TargetSection); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("SectionNumber", Symbol.SectionNumber); + IO.mapRequired("Alignment", Symbol.Alignment); + IO.mapRequired("Rva", Symbol.Rva); + IO.mapRequired("Length", Symbol.Length); + IO.mapRequired("Characteristics", Symbol.Characteristics); + 
IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Size", Symbol.Size); + IO.mapRequired("Characteristics", Symbol.Characteristics); + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Ordinal", Symbol.Ordinal); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the linkage name + + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("PtrNext", Symbol.Next); + IO.mapRequired("CodeSize", Symbol.CodeSize); + IO.mapRequired("DbgStart", Symbol.DbgStart); + IO.mapRequired("DbgEnd", Symbol.DbgEnd); + IO.mapRequired("FunctionType", Symbol.FunctionType); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Index); + IO.mapRequired("Seg", Symbol.Register); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Index); + IO.mapRequired("Seg", Symbol.Segment); + IO.mapRequired("Off", Symbol.Offset); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("SumName", Symbol.SumName); + IO.mapRequired("SymOffset", Symbol.SymOffset); + IO.mapRequired("Mod", Symbol.Module); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Entries", Symbol.Fields); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("Inlinee", Symbol.Inlinee); + // TODO: The binary annotations +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> +void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the subfields +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the linkage name + IO.mapRequired("PtrParent", Symbol.Parent); + IO.mapRequired("PtrEnd", Symbol.End); + IO.mapRequired("CodeSize", Symbol.CodeSize); + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("BlockName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Print the linkage name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Signature", Symbol.Signature); + IO.mapRequired("ObjectName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + 
IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Machine", Symbol.Machine); + IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor); + IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor); + IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild); + IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor); + IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor); + IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild); + IO.mapRequired("Version", Symbol.Version); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Machine", Symbol.Machine); + IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor); + IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor); + IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild); + IO.mapRequired("FrontendQFE", Symbol.VersionFrontendQFE); + IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor); + IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor); + IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild); + IO.mapRequired("BackendQFE", Symbol.VersionBackendQFE); + IO.mapRequired("Version", Symbol.Version); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("TotalFrameBytes", Symbol.TotalFrameBytes); + IO.mapRequired("PaddingFrameBytes", Symbol.PaddingFrameBytes); + IO.mapRequired("OffsetToPadding", Symbol.OffsetToPadding); + IO.mapRequired("BytesOfCalleeSavedRegisters", + Symbol.BytesOfCalleeSavedRegisters); + IO.mapRequired("OffsetOfExceptionHandler", Symbol.OffsetOfExceptionHandler); + IO.mapRequired("SectionIdOfExceptionHandler", + Symbol.SectionIdOfExceptionHandler); + IO.mapRequired("Flags", Symbol.Flags); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("Type", Symbol.Type); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Index", Symbol.Index); + IO.mapRequired("ModFilenameOffset", Symbol.ModFilenameOffset); + IO.mapRequired("Flags", Symbol.Flags); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Segment", Symbol.Segment); + IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize); + IO.mapRequired("Type", Symbol.Type); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Map Linkage Name + IO.mapRequired("Register", Symbol.Register); + IO.mapRequired("CookieKind", Symbol.CookieKind); + IO.mapRequired("Flags", Symbol.Flags); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("FuncID", Symbol.Indices); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("UDTName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("BuildId", Symbol.BuildId); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Offset", Symbol.Offset); + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Register", Symbol.Register); + IO.mapRequired("VarName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("Value", Symbol.Value); + IO.mapRequired("Name", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO 
&IO) { + // TODO: Map linkage name + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("DisplayName", Symbol.Name); +} + +template <> void SymbolRecordImpl::map(IO &IO) { + // TODO: Map linkage name + IO.mapRequired("Type", Symbol.Type); + IO.mapRequired("DisplayName", Symbol.Name); +} +} +} +} + +CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol( + BumpPtrAllocator &Allocator) const { + return Symbol->toCodeViewSymbol(Allocator); +} + +namespace llvm { +namespace yaml { +template <> struct MappingTraits { + static void mapping(IO &io, SymbolRecordBase &Record) { Record.map(io); } +}; +} +} + +template +static inline Expected +fromCodeViewSymbolImpl(CVSymbol Symbol) { + CodeViewYAML::SymbolRecord Result; + + auto Impl = std::make_shared>(Symbol.kind()); + if (auto EC = Impl->fromCodeViewSymbol(Symbol)) + return std::move(EC); + Result.Symbol = Impl; + return Result; +} + +Expected +CodeViewYAML::SymbolRecord::fromCodeViewSymbol(CVSymbol Symbol) { +#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + return fromCodeViewSymbolImpl(Symbol); +#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + SYMBOL_RECORD(EnumName, EnumVal, ClassName) + switch (Symbol.kind()) { +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" + default: { llvm_unreachable("Unknown symbol kind!"); } + } + return make_error(cv_error_code::corrupt_record); +} + +template +static void mapSymbolRecordImpl(IO &IO, const char *Class, SymbolKind Kind, + CodeViewYAML::SymbolRecord &Obj) { + if (!IO.outputting()) + Obj.Symbol = std::make_shared>(Kind); + + IO.mapRequired(Class, *Obj.Symbol); +} + +void MappingTraits::mapping( + IO &IO, CodeViewYAML::SymbolRecord &Obj) { + SymbolKind Kind; + if (IO.outputting()) + Kind = Obj.Symbol->Kind; + IO.mapRequired("Kind", Kind); + +#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \ + case EnumName: \ + mapSymbolRecordImpl(IO, #ClassName, Kind, Obj); \ + break; +#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \ + SYMBOL_RECORD(EnumName, EnumVal, ClassName) + switch (Kind) { +#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def" + default: { llvm_unreachable("Unknown symbol kind!"); } + } +} diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp new file mode 100644 index 000000000000..4e82a299a672 --- /dev/null +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -0,0 +1,712 @@ +//===- CodeViewYAMLTypes.cpp - CodeView YAMLIO types implementation -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines classes for handling the YAML representation of CodeView +// Debug Info. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ObjectYAML/CodeViewYAMLTypes.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/CodeViewError.h" +#include "llvm/DebugInfo/CodeView/EnumTables.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" + +using namespace llvm; +using namespace llvm::codeview; +using namespace llvm::CodeViewYAML; +using namespace llvm::CodeViewYAML::detail; +using namespace llvm::yaml; + +LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord) +LLVM_YAML_IS_SEQUENCE_VECTOR(StringRef) +LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex) + +LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false) +LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false) + +LLVM_YAML_DECLARE_ENUM_TRAITS(TypeLeafKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerToMemberRepresentation) +LLVM_YAML_DECLARE_ENUM_TRAITS(VFTableSlotKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(CallingConvention) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(PointerMode) +LLVM_YAML_DECLARE_ENUM_TRAITS(HfaKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(MemberAccess) +LLVM_YAML_DECLARE_ENUM_TRAITS(MethodKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(WindowsRTClassKind) +LLVM_YAML_DECLARE_ENUM_TRAITS(LabelType) + +LLVM_YAML_DECLARE_BITSET_TRAITS(PointerOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(ModifierOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(FunctionOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(ClassOptions) +LLVM_YAML_DECLARE_BITSET_TRAITS(MethodOptions) + +LLVM_YAML_DECLARE_MAPPING_TRAITS(OneMethodRecord) +LLVM_YAML_DECLARE_MAPPING_TRAITS(MemberPointerInfo) + +namespace llvm { +namespace CodeViewYAML { +namespace detail { + +struct LeafRecordBase { + TypeLeafKind Kind; + explicit LeafRecordBase(TypeLeafKind K) : Kind(K) {} + + virtual ~LeafRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const = 0; + virtual Error fromCodeViewRecord(CVType Type) = 0; +}; + +template struct LeafRecordImpl : public LeafRecordBase { + explicit LeafRecordImpl(TypeLeafKind K) + : LeafRecordBase(K), Record(static_cast(K)) {} + + void map(yaml::IO &io) override; + + Error fromCodeViewRecord(CVType Type) override { + return TypeDeserializer::deserializeAs(Type, Record); + } + + CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const override { + TypeTableBuilder Table(Allocator); + Table.writeKnownType(Record); + return CVType(Kind, Table.records().front()); + } + + mutable T Record; +}; + +template <> struct LeafRecordImpl : public LeafRecordBase { + explicit LeafRecordImpl(TypeLeafKind K) : LeafRecordBase(K) {} + + void map(yaml::IO &io) override; + CVType toCodeViewRecord(BumpPtrAllocator &Allocator) const override; + Error fromCodeViewRecord(CVType Type) override; + + std::vector Members; +}; + +struct MemberRecordBase { + TypeLeafKind Kind; + explicit MemberRecordBase(TypeLeafKind K) : Kind(K) {} + + virtual ~MemberRecordBase() {} + virtual void map(yaml::IO &io) = 0; + virtual void writeTo(FieldListRecordBuilder &FLRB) = 0; +}; + +template struct MemberRecordImpl : public MemberRecordBase { + explicit MemberRecordImpl(TypeLeafKind K) + : MemberRecordBase(K), Record(static_cast(K)) {} + void map(yaml::IO &io) override; + + void writeTo(FieldListRecordBuilder &FLRB) override { + FLRB.writeMemberType(Record); + } + + mutable T Record; 
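+  // Parsed record payload. The mutable mirrors SymbolRecordImpl and
+  // LeafRecordImpl, whose const toCodeView* paths hand the record to
+  // serializer APIs that take non-const references.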
+};
+}
+}
+}
+
+void ScalarTraits<TypeIndex>::output(const TypeIndex &S, void *,
+                                     llvm::raw_ostream &OS) {
+  OS << S.getIndex();
+}
+
+StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx,
+                                         TypeIndex &S) {
+  uint32_t I;
+  StringRef Result = ScalarTraits<uint32_t>::input(Scalar, Ctx, I);
+  S.setIndex(I);
+  return Result;
+}
+
+void ScalarTraits<APSInt>::output(const APSInt &S, void *,
+                                  llvm::raw_ostream &OS) {
+  S.print(OS, true);
+}
+
+StringRef ScalarTraits<APSInt>::input(StringRef Scalar, void *Ctx, APSInt &S) {
+  S = APSInt(Scalar);
+  return "";
+}
+
+void ScalarEnumerationTraits<TypeLeafKind>::enumeration(IO &io,
+                                                        TypeLeafKind &Value) {
+#define CV_TYPE(name, val) io.enumCase(Value, #name, name);
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+#undef CV_TYPE
+}
+
+void ScalarEnumerationTraits<PointerToMemberRepresentation>::enumeration(
+    IO &IO, PointerToMemberRepresentation &Value) {
+  IO.enumCase(Value, "Unknown", PointerToMemberRepresentation::Unknown);
+  IO.enumCase(Value, "SingleInheritanceData",
+              PointerToMemberRepresentation::SingleInheritanceData);
+  IO.enumCase(Value, "MultipleInheritanceData",
+              PointerToMemberRepresentation::MultipleInheritanceData);
+  IO.enumCase(Value, "VirtualInheritanceData",
+              PointerToMemberRepresentation::VirtualInheritanceData);
+  IO.enumCase(Value, "GeneralData", PointerToMemberRepresentation::GeneralData);
+  IO.enumCase(Value, "SingleInheritanceFunction",
+              PointerToMemberRepresentation::SingleInheritanceFunction);
+  IO.enumCase(Value, "MultipleInheritanceFunction",
+              PointerToMemberRepresentation::MultipleInheritanceFunction);
+  IO.enumCase(Value, "VirtualInheritanceFunction",
+              PointerToMemberRepresentation::VirtualInheritanceFunction);
+  IO.enumCase(Value, "GeneralFunction",
+              PointerToMemberRepresentation::GeneralFunction);
+}
+
+void ScalarEnumerationTraits<VFTableSlotKind>::enumeration(
+    IO &IO, VFTableSlotKind &Kind) {
+  IO.enumCase(Kind, "Near16", VFTableSlotKind::Near16);
+  IO.enumCase(Kind, "Far16", VFTableSlotKind::Far16);
+  IO.enumCase(Kind, "This", VFTableSlotKind::This);
+  IO.enumCase(Kind, "Outer", VFTableSlotKind::Outer);
+  IO.enumCase(Kind, "Meta", VFTableSlotKind::Meta);
+  IO.enumCase(Kind, "Near", VFTableSlotKind::Near);
+  IO.enumCase(Kind, "Far", VFTableSlotKind::Far);
+}
+
+void ScalarEnumerationTraits<CallingConvention>::enumeration(
+    IO &IO, CallingConvention &Value) {
+  IO.enumCase(Value, "NearC", CallingConvention::NearC);
+  IO.enumCase(Value, "FarC", CallingConvention::FarC);
+  IO.enumCase(Value, "NearPascal", CallingConvention::NearPascal);
+  IO.enumCase(Value, "FarPascal", CallingConvention::FarPascal);
+  IO.enumCase(Value, "NearFast", CallingConvention::NearFast);
+  IO.enumCase(Value, "FarFast", CallingConvention::FarFast);
+  IO.enumCase(Value, "NearStdCall", CallingConvention::NearStdCall);
+  IO.enumCase(Value, "FarStdCall", CallingConvention::FarStdCall);
+  IO.enumCase(Value, "NearSysCall", CallingConvention::NearSysCall);
+  IO.enumCase(Value, "FarSysCall", CallingConvention::FarSysCall);
+  IO.enumCase(Value, "ThisCall", CallingConvention::ThisCall);
+  IO.enumCase(Value, "MipsCall", CallingConvention::MipsCall);
+  IO.enumCase(Value, "Generic", CallingConvention::Generic);
+  IO.enumCase(Value, "AlphaCall", CallingConvention::AlphaCall);
+  IO.enumCase(Value, "PpcCall", CallingConvention::PpcCall);
+  IO.enumCase(Value, "SHCall", CallingConvention::SHCall);
+  IO.enumCase(Value, "ArmCall", CallingConvention::ArmCall);
+  IO.enumCase(Value, "AM33Call", CallingConvention::AM33Call);
+  IO.enumCase(Value, "TriCall", CallingConvention::TriCall);
+  IO.enumCase(Value, "SH5Call", CallingConvention::SH5Call);
+  IO.enumCase(Value, "M32RCall", CallingConvention::M32RCall);
+  IO.enumCase(Value, "ClrCall", CallingConvention::ClrCall);
+  IO.enumCase(Value, "Inline", CallingConvention::Inline);
+  IO.enumCase(Value, "NearVector", CallingConvention::NearVector);
+}
+
+void ScalarEnumerationTraits<PointerKind>::enumeration(IO &IO,
+                                                       PointerKind &Kind) {
+  IO.enumCase(Kind, "Near16", PointerKind::Near16);
+  IO.enumCase(Kind, "Far16", PointerKind::Far16);
+  IO.enumCase(Kind, "Huge16", PointerKind::Huge16);
+  IO.enumCase(Kind, "BasedOnSegment", PointerKind::BasedOnSegment);
+  IO.enumCase(Kind, "BasedOnValue", PointerKind::BasedOnValue);
+  IO.enumCase(Kind, "BasedOnSegmentValue", PointerKind::BasedOnSegmentValue);
+  IO.enumCase(Kind, "BasedOnAddress", PointerKind::BasedOnAddress);
+  IO.enumCase(Kind, "BasedOnSegmentAddress",
+              PointerKind::BasedOnSegmentAddress);
+  IO.enumCase(Kind, "BasedOnType", PointerKind::BasedOnType);
+  IO.enumCase(Kind, "BasedOnSelf", PointerKind::BasedOnSelf);
+  IO.enumCase(Kind, "Near32", PointerKind::Near32);
+  IO.enumCase(Kind, "Far32", PointerKind::Far32);
+  IO.enumCase(Kind, "Near64", PointerKind::Near64);
+}
+
+void ScalarEnumerationTraits<PointerMode>::enumeration(IO &IO,
+                                                       PointerMode &Mode) {
+  IO.enumCase(Mode, "Pointer", PointerMode::Pointer);
+  IO.enumCase(Mode, "LValueReference", PointerMode::LValueReference);
+  IO.enumCase(Mode, "PointerToDataMember", PointerMode::PointerToDataMember);
+  IO.enumCase(Mode, "PointerToMemberFunction",
+              PointerMode::PointerToMemberFunction);
+  IO.enumCase(Mode, "RValueReference", PointerMode::RValueReference);
+}
+
+void ScalarEnumerationTraits<HfaKind>::enumeration(IO &IO, HfaKind &Value) {
+  IO.enumCase(Value, "None", HfaKind::None);
+  IO.enumCase(Value, "Float", HfaKind::Float);
+  IO.enumCase(Value, "Double", HfaKind::Double);
+  IO.enumCase(Value, "Other", HfaKind::Other);
+}
+
+void ScalarEnumerationTraits<MemberAccess>::enumeration(IO &IO,
+                                                        MemberAccess &Access) {
+  IO.enumCase(Access, "None", MemberAccess::None);
+  IO.enumCase(Access, "Private", MemberAccess::Private);
+  IO.enumCase(Access, "Protected", MemberAccess::Protected);
+  IO.enumCase(Access, "Public", MemberAccess::Public);
+}
+
+void ScalarEnumerationTraits<MethodKind>::enumeration(IO &IO,
+                                                      MethodKind &Kind) {
+  IO.enumCase(Kind, "Vanilla", MethodKind::Vanilla);
+  IO.enumCase(Kind, "Virtual", MethodKind::Virtual);
+  IO.enumCase(Kind, "Static", MethodKind::Static);
+  IO.enumCase(Kind, "Friend", MethodKind::Friend);
+  IO.enumCase(Kind, "IntroducingVirtual", MethodKind::IntroducingVirtual);
+  IO.enumCase(Kind, "PureVirtual", MethodKind::PureVirtual);
+  IO.enumCase(Kind, "PureIntroducingVirtual",
+              MethodKind::PureIntroducingVirtual);
+}
+
+void ScalarEnumerationTraits<WindowsRTClassKind>::enumeration(
+    IO &IO, WindowsRTClassKind &Value) {
+  IO.enumCase(Value, "None", WindowsRTClassKind::None);
+  IO.enumCase(Value, "Ref", WindowsRTClassKind::RefClass);
+  IO.enumCase(Value, "Value", WindowsRTClassKind::ValueClass);
+  IO.enumCase(Value, "Interface", WindowsRTClassKind::Interface);
+}
+
+void ScalarEnumerationTraits<LabelType>::enumeration(IO &IO, LabelType &Value) {
+  IO.enumCase(Value, "Near", LabelType::Near);
+  IO.enumCase(Value, "Far", LabelType::Far);
+}
+
+void ScalarBitSetTraits<PointerOptions>::bitset(IO &IO,
+                                                PointerOptions &Options) {
+  IO.bitSetCase(Options, "None", PointerOptions::None);
+  IO.bitSetCase(Options, "Flat32", PointerOptions::Flat32);
+  IO.bitSetCase(Options, "Volatile", PointerOptions::Volatile);
+  IO.bitSetCase(Options, "Const", PointerOptions::Const);
+  IO.bitSetCase(Options, "Unaligned", PointerOptions::Unaligned);
+  IO.bitSetCase(Options, "Restrict", PointerOptions::Restrict);
+  IO.bitSetCase(Options, "WinRTSmartPointer",
+                PointerOptions::WinRTSmartPointer);
+}
+
+void ScalarBitSetTraits<ModifierOptions>::bitset(IO &IO,
+                                                 ModifierOptions &Options) {
+  IO.bitSetCase(Options, "None", ModifierOptions::None);
+  IO.bitSetCase(Options, "Const", ModifierOptions::Const);
+  IO.bitSetCase(Options, "Volatile", ModifierOptions::Volatile);
+  IO.bitSetCase(Options, "Unaligned", ModifierOptions::Unaligned);
+}
+
+void ScalarBitSetTraits<FunctionOptions>::bitset(IO &IO,
+                                                 FunctionOptions &Options) {
+  IO.bitSetCase(Options, "None", FunctionOptions::None);
+  IO.bitSetCase(Options, "CxxReturnUdt", FunctionOptions::CxxReturnUdt);
+  IO.bitSetCase(Options, "Constructor", FunctionOptions::Constructor);
+  IO.bitSetCase(Options, "ConstructorWithVirtualBases",
+                FunctionOptions::ConstructorWithVirtualBases);
+}
+
+void ScalarBitSetTraits<ClassOptions>::bitset(IO &IO, ClassOptions &Options) {
+  IO.bitSetCase(Options, "None", ClassOptions::None);
+  IO.bitSetCase(Options, "HasConstructorOrDestructor",
+                ClassOptions::HasConstructorOrDestructor);
+  IO.bitSetCase(Options, "HasOverloadedOperator",
+                ClassOptions::HasOverloadedOperator);
+  IO.bitSetCase(Options, "Nested", ClassOptions::Nested);
+  IO.bitSetCase(Options, "ContainsNestedClass",
+                ClassOptions::ContainsNestedClass);
+  IO.bitSetCase(Options, "HasOverloadedAssignmentOperator",
+                ClassOptions::HasOverloadedAssignmentOperator);
+  IO.bitSetCase(Options, "HasConversionOperator",
+                ClassOptions::HasConversionOperator);
+  IO.bitSetCase(Options, "ForwardReference", ClassOptions::ForwardReference);
+  IO.bitSetCase(Options, "Scoped", ClassOptions::Scoped);
+  IO.bitSetCase(Options, "HasUniqueName", ClassOptions::HasUniqueName);
+  IO.bitSetCase(Options, "Sealed", ClassOptions::Sealed);
+  IO.bitSetCase(Options, "Intrinsic", ClassOptions::Intrinsic);
+}
+
+void ScalarBitSetTraits<MethodOptions>::bitset(IO &IO, MethodOptions &Options) {
+  IO.bitSetCase(Options, "None", MethodOptions::None);
+  IO.bitSetCase(Options, "Pseudo", MethodOptions::Pseudo);
+  IO.bitSetCase(Options, "NoInherit", MethodOptions::NoInherit);
+  IO.bitSetCase(Options, "NoConstruct", MethodOptions::NoConstruct);
+  IO.bitSetCase(Options, "CompilerGenerated", MethodOptions::CompilerGenerated);
+  IO.bitSetCase(Options, "Sealed", MethodOptions::Sealed);
+}
+
+void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) {
+  IO.mapRequired("ContainingType", MPI.ContainingType);
+  IO.mapRequired("Representation", MPI.Representation);
+}
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+template <> void LeafRecordImpl<ModifierRecord>::map(IO &IO) {
+  IO.mapRequired("ModifiedType", Record.ModifiedType);
+  IO.mapRequired("Modifiers", Record.Modifiers);
+}
+
+template <> void LeafRecordImpl<ProcedureRecord>::map(IO &IO) {
+  IO.mapRequired("ReturnType", Record.ReturnType);
+  IO.mapRequired("CallConv", Record.CallConv);
+  IO.mapRequired("Options", Record.Options);
+  IO.mapRequired("ParameterCount", Record.ParameterCount);
+  IO.mapRequired("ArgumentList", Record.ArgumentList);
+}
+
+template <> void LeafRecordImpl<MemberFunctionRecord>::map(IO &IO) {
+  IO.mapRequired("ReturnType", Record.ReturnType);
+  IO.mapRequired("ClassType", Record.ClassType);
+  IO.mapRequired("ThisType", Record.ThisType);
+  IO.mapRequired("CallConv", Record.CallConv);
+  IO.mapRequired("Options", Record.Options);
+  IO.mapRequired("ParameterCount", Record.ParameterCount);
+  IO.mapRequired("ArgumentList", Record.ArgumentList);
+  IO.mapRequired("ThisPointerAdjustment", Record.ThisPointerAdjustment);
+}
+
+template <> void LeafRecordImpl<LabelRecord>::map(IO &IO) {
+  IO.mapRequired("Mode", Record.Mode);
+}
+
+template <> void LeafRecordImpl<MemberFuncIdRecord>::map(IO &IO) {
+  IO.mapRequired("ClassType", Record.ClassType);
+  IO.mapRequired("FunctionType", Record.FunctionType);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<ArgListRecord>::map(IO &IO) {
+  IO.mapRequired("ArgIndices", Record.ArgIndices);
+}
+
+template <> void LeafRecordImpl<StringListRecord>::map(IO &IO) {
+  IO.mapRequired("StringIndices", Record.StringIndices);
+}
+
+template <> void LeafRecordImpl<PointerRecord>::map(IO &IO) {
+  IO.mapRequired("ReferentType", Record.ReferentType);
+  IO.mapRequired("Attrs", Record.Attrs);
+  IO.mapOptional("MemberInfo", Record.MemberInfo);
+}
+
+template <> void LeafRecordImpl<ArrayRecord>::map(IO &IO) {
+  IO.mapRequired("ElementType", Record.ElementType);
+  IO.mapRequired("IndexType", Record.IndexType);
+  IO.mapRequired("Size", Record.Size);
+  IO.mapRequired("Name", Record.Name);
+}
+
+void LeafRecordImpl<FieldListRecord>::map(IO &IO) {
+  IO.mapRequired("FieldList", Members);
+}
+}
+}
+}
+
+namespace {
+class MemberRecordConversionVisitor : public TypeVisitorCallbacks {
+public:
+  explicit MemberRecordConversionVisitor(std::vector<MemberRecord> &Records)
+      : Records(Records) {}
+
+#define TYPE_RECORD(EnumName, EnumVal, Name)
+#define MEMBER_RECORD(EnumName, EnumVal, Name)                                 \
+  Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override { \
+    return visitKnownMemberImpl(Record);                                       \
+  }
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+private:
+  template <typename T> Error visitKnownMemberImpl(T &Record) {
+    TypeLeafKind K = static_cast<TypeLeafKind>(Record.getKind());
+    auto Impl = std::make_shared<MemberRecordImpl<T>>(K);
+    Impl->Record = Record;
+    Records.push_back(MemberRecord{Impl});
+    return Error::success();
+  }
+
+  std::vector<MemberRecord> &Records;
+};
+}
+
+Error LeafRecordImpl<FieldListRecord>::fromCodeViewRecord(CVType Type) {
+  MemberRecordConversionVisitor V(Members);
+  return visitMemberRecordStream(Type.content(), V);
+}
+
+CVType LeafRecordImpl<FieldListRecord>::toCodeViewRecord(
+    BumpPtrAllocator &Allocator) const {
+  TypeTableBuilder TTB(Allocator);
+  FieldListRecordBuilder FLRB(TTB);
+  FLRB.begin();
+  for (const auto &Member : Members) {
+    Member.Member->writeTo(FLRB);
+  }
+  FLRB.end(true);
+  return CVType(Kind, TTB.records().front());
+}
+
+void MappingTraits<OneMethodRecord>::mapping(IO &io, OneMethodRecord &Record) {
+  io.mapRequired("Type", Record.Type);
+  io.mapRequired("Attrs", Record.Attrs.Attrs);
+  io.mapRequired("VFTableOffset", Record.VFTableOffset);
+  io.mapRequired("Name", Record.Name);
+}
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+template <> void LeafRecordImpl<ClassRecord>::map(IO &IO) {
+  IO.mapRequired("MemberCount", Record.MemberCount);
+  IO.mapRequired("Options", Record.Options);
+  IO.mapRequired("FieldList", Record.FieldList);
+  IO.mapRequired("Name", Record.Name);
+  IO.mapRequired("UniqueName", Record.UniqueName);
+
+  IO.mapRequired("DerivationList", Record.DerivationList);
+  IO.mapRequired("VTableShape", Record.VTableShape);
+  IO.mapRequired("Size", Record.Size);
+}
+
+template <> void LeafRecordImpl<UnionRecord>::map(IO &IO) {
+  IO.mapRequired("MemberCount", Record.MemberCount);
+  IO.mapRequired("Options", Record.Options);
+  IO.mapRequired("FieldList", Record.FieldList);
+  IO.mapRequired("Name", Record.Name);
+  IO.mapRequired("UniqueName", Record.UniqueName);
+
+  IO.mapRequired("Size", Record.Size);
+}
+
+template <> void LeafRecordImpl<EnumRecord>::map(IO &IO) {
+  IO.mapRequired("NumEnumerators", Record.MemberCount);
+  IO.mapRequired("Options", Record.Options);
+  IO.mapRequired("FieldList", Record.FieldList);
+  IO.mapRequired("Name", Record.Name);
+  IO.mapRequired("UniqueName", Record.UniqueName);
+
+  IO.mapRequired("UnderlyingType", Record.UnderlyingType);
+}
+
+template <> void LeafRecordImpl<BitFieldRecord>::map(IO &IO) {
+  IO.mapRequired("Type", Record.Type);
+  IO.mapRequired("BitSize", Record.BitSize);
+  IO.mapRequired("BitOffset", Record.BitOffset);
+}
+
+template <> void LeafRecordImpl<VFTableShapeRecord>::map(IO &IO) {
+  IO.mapRequired("Slots", Record.Slots);
+}
+
+template <> void LeafRecordImpl<TypeServer2Record>::map(IO &IO) {
+  IO.mapRequired("Guid", Record.Guid);
+  IO.mapRequired("Age", Record.Age);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<StringIdRecord>::map(IO &IO) {
+  IO.mapRequired("Id", Record.Id);
+  IO.mapRequired("String", Record.String);
+}
+
+template <> void LeafRecordImpl<FuncIdRecord>::map(IO &IO) {
+  IO.mapRequired("ParentScope", Record.ParentScope);
+  IO.mapRequired("FunctionType", Record.FunctionType);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<UdtSourceLineRecord>::map(IO &IO) {
+  IO.mapRequired("UDT", Record.UDT);
+  IO.mapRequired("SourceFile", Record.SourceFile);
+  IO.mapRequired("LineNumber", Record.LineNumber);
+}
+
+template <> void LeafRecordImpl<UdtModSourceLineRecord>::map(IO &IO) {
+  IO.mapRequired("UDT", Record.UDT);
+  IO.mapRequired("SourceFile", Record.SourceFile);
+  IO.mapRequired("LineNumber", Record.LineNumber);
+  IO.mapRequired("Module", Record.Module);
+}
+
+template <> void LeafRecordImpl<BuildInfoRecord>::map(IO &IO) {
+  IO.mapRequired("ArgIndices", Record.ArgIndices);
+}
+
+template <> void LeafRecordImpl<VFTableRecord>::map(IO &IO) {
+  IO.mapRequired("CompleteClass", Record.CompleteClass);
+  IO.mapRequired("OverriddenVFTable", Record.OverriddenVFTable);
+  IO.mapRequired("VFPtrOffset", Record.VFPtrOffset);
+  IO.mapRequired("MethodNames", Record.MethodNames);
+}
+
+template <> void LeafRecordImpl<MethodOverloadListRecord>::map(IO &IO) {
+  IO.mapRequired("Methods", Record.Methods);
+}
+
+template <> void MemberRecordImpl<OneMethodRecord>::map(IO &IO) {
+  MappingTraits<OneMethodRecord>::mapping(IO, Record);
+}
+
+template <> void MemberRecordImpl<OverloadedMethodRecord>::map(IO &IO) {
+  IO.mapRequired("NumOverloads", Record.NumOverloads);
+  IO.mapRequired("MethodList", Record.MethodList);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<NestedTypeRecord>::map(IO &IO) {
+  IO.mapRequired("Type", Record.Type);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<DataMemberRecord>::map(IO &IO) {
+  IO.mapRequired("Attrs", Record.Attrs.Attrs);
+  IO.mapRequired("Type", Record.Type);
+  IO.mapRequired("FieldOffset", Record.FieldOffset);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<StaticDataMemberRecord>::map(IO &IO) {
+  IO.mapRequired("Attrs", Record.Attrs.Attrs);
+  IO.mapRequired("Type", Record.Type);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<EnumeratorRecord>::map(IO &IO) {
+  IO.mapRequired("Attrs", Record.Attrs.Attrs);
+  IO.mapRequired("Value", Record.Value);
+  IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<VFPtrRecord>::map(IO &IO) {
+  IO.mapRequired("Type", Record.Type);
+}
+
+template <> void MemberRecordImpl<BaseClassRecord>::map(IO &IO) {
+  IO.mapRequired("Attrs", Record.Attrs.Attrs);
+  IO.mapRequired("Type", Record.Type);
+  IO.mapRequired("Offset", Record.Offset);
+}
+
+template <> void MemberRecordImpl<VirtualBaseClassRecord>::map(IO &IO) {
+  IO.mapRequired("Attrs", Record.Attrs.Attrs);
+  IO.mapRequired("BaseType", Record.BaseType);
+  IO.mapRequired("VBPtrType", Record.VBPtrType);
+  IO.mapRequired("VBPtrOffset", Record.VBPtrOffset);
+  IO.mapRequired("VTableIndex", Record.VTableIndex);
+}
+
+template <> void MemberRecordImpl<ListContinuationRecord>::map(IO &IO) {
+  IO.mapRequired("ContinuationIndex", Record.ContinuationIndex);
+}
+}
+}
+}
+
+template <typename T>
+static inline Expected<LeafRecord> fromCodeViewRecordImpl(CVType Type) {
+  LeafRecord Result;
+
+  auto Impl = std::make_shared<LeafRecordImpl<T>>(Type.kind());
+  if (auto EC = Impl->fromCodeViewRecord(Type))
+    return std::move(EC);
+  Result.Leaf = Impl;
+  return Result;
+}
+
+Expected<LeafRecord> LeafRecord::fromCodeViewRecord(CVType Type) {
+#define TYPE_RECORD(EnumName, EnumVal, ClassName)                              \
+  case EnumName:                                                               \
+    return fromCodeViewRecordImpl<ClassName##Record>(Type);
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)             \
+  TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+  switch (Type.kind()) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+  default: { llvm_unreachable("Unknown leaf kind!"); }
+  }
+  return make_error<CodeViewError>(cv_error_code::corrupt_record);
+}
+
+CVType LeafRecord::toCodeViewRecord(BumpPtrAllocator &Allocator) const {
+  return Leaf->toCodeViewRecord(Allocator);
+}
+
+namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<LeafRecordBase> {
+  static void mapping(IO &io, LeafRecordBase &Record) { Record.map(io); }
+};
+
+template <> struct MappingTraits<MemberRecordBase> {
+  static void mapping(IO &io, MemberRecordBase &Record) { Record.map(io); }
+};
+}
+}
+
+template <typename ConcreteType>
+static void mapLeafRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind,
+                              LeafRecord &Obj) {
+  if (!IO.outputting())
+    Obj.Leaf = std::make_shared<LeafRecordImpl<ConcreteType>>(Kind);
+
+  if (Kind == LF_FIELDLIST)
+    Obj.Leaf->map(IO);
+  else
+    IO.mapRequired(Class, *Obj.Leaf);
+}
+
+void MappingTraits<LeafRecord>::mapping(IO &IO, LeafRecord &Obj) {
+  TypeLeafKind Kind;
+  if (IO.outputting())
+    Kind = Obj.Leaf->Kind;
+  IO.mapRequired("Kind", Kind);
+
+#define TYPE_RECORD(EnumName, EnumVal, ClassName)                              \
+  case EnumName:                                                               \
+    mapLeafRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj);           \
+    break;
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)             \
+  TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+  switch (Kind) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+  default: { llvm_unreachable("Unknown leaf kind!"); }
+  }
+}
+
+template <typename ConcreteType>
+static void mapMemberRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind,
+                                MemberRecord &Obj) {
+  if (!IO.outputting())
+    Obj.Member = std::make_shared<MemberRecordImpl<ConcreteType>>(Kind);
+
+  IO.mapRequired(Class, *Obj.Member);
+}
+
+void MappingTraits<MemberRecord>::mapping(IO &IO, MemberRecord &Obj) {
+  TypeLeafKind Kind;
+  if (IO.outputting())
+    Kind = Obj.Member->Kind;
+  IO.mapRequired("Kind", Kind);
+
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName)                            \
+  case EnumName:                                                               \
+    mapMemberRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj);         \
+    break;
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)           \
+  MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+  switch (Kind) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+  default: { llvm_unreachable("Unknown member kind!"); }
+  }
+}
diff --git a/lib/ObjectYAML/LLVMBuild.txt b/lib/ObjectYAML/LLVMBuild.txt
index b8d1d2f1779e..44657e916a91 100644
--- a/lib/ObjectYAML/LLVMBuild.txt
+++ b/lib/ObjectYAML/LLVMBuild.txt
@@ -11,4 +11,4 @@ type = Library
 name = ObjectYAML
 parent = Libraries
 
-required_libraries = Support
+required_libraries = Support DebugInfoCodeView
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index abc53e97aa72..eb81e58b9b0e 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -164,7 +164,8 @@ static cl::opt<bool> EnableGVNHoist(
     "enable-npm-gvn-hoist", cl::init(false), cl::Hidden,
     cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
 
-static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$");
+static Regex DefaultAliasRegex(
+    "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
 
 static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
   switch (Level) {
@@ -345,6 +346,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   LPM2.addPass(IndVarSimplifyPass());
   LPM2.addPass(LoopIdiomRecognizePass());
   LPM2.addPass(LoopDeletionPass());
+  // FIXME: The old pass manager has a hack to disable loop unrolling during
+  // ThinLTO when using sample PGO. Need to either fix it or port some
+  // workaround.
   LPM2.addPass(LoopUnrollPass::createFull(Level));
 
   // We provide the opt remark emitter pass for LICM to use. We only need to do
@@ -454,14 +458,10 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
 }
 
 ModulePassManager
-PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
-                                           bool DebugLogging) {
-  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
+                                               bool DebugLogging) {
   ModulePassManager MPM(DebugLogging);
 
-  // Force any function attributes we want the rest of the pipeline te observe.
-  MPM.addPass(ForceFunctionAttrsPass());
-
   // Do basic inference of function attributes from known properties of system
   // libraries and other oracles.
   MPM.addPass(InferFunctionAttrsPass());
@@ -504,16 +504,16 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   GlobalCleanupPM.addPass(SimplifyCFGPass());
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM)));
 
-  // Add all the requested passes for PGO Instrumentation, if requested.
+  // Add all the requested passes for PGO, if requested.
   if (PGOOpt) {
     assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO ||
            !PGOOpt->ProfileUseFile.empty());
     addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
                       PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
-  }
 
-  // Indirect call promotion that promotes intra-module targes only.
-  MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+    // Indirect call promotion that promotes intra-module targes only.
+    MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+  }
 
   // Require the GlobalsAA analysis for the module so we can query it within
   // the CGSCC pipeline.
@@ -562,17 +562,30 @@
       createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
           std::move(MainCGPipeline), MaxDevirtIterations, DebugLogging)));
 
-  // This ends the canonicalization and simplification phase of the pipeline.
-  // At this point, we expect to have canonical and simple IR which we begin
-  // *optimizing* for efficient execution going forward.
+  return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
+                                             bool DebugLogging) {
+  ModulePassManager MPM(DebugLogging);
+
+  // Optimize globals now that the module is fully simplified.
+  MPM.addPass(GlobalOptPass());
 
   // Run partial inlining pass to partially inline functions that have
   // large bodies.
   if (RunPartialInlining)
     MPM.addPass(PartialInlinerPass());
 
-  // Eliminate externally available functions now that inlining is over -- we
-  // won't emit these anyways.
+  // Remove avail extern fns and globals definitions since we aren't compiling
+  // an object file for later LTO. For LTO we want to preserve these so they
+  // are eligible for inlining at link-time. Note if they are unreferenced they
+  // will be removed by GlobalDCE later, so this only impacts referenced
+  // available externally globals. Eventually they will be suppressed during
+  // codegen, but eliminating here enables more opportunity for GlobalDCE as it
+  // may make globals referenced by available external functions dead and saves
+  // running remaining passes on the eliminated functions.
   MPM.addPass(EliminateAvailableExternallyPass());
 
   // Do RPO function attribute inference across the module to forward-propagate
@@ -670,6 +683,87 @@
   return MPM;
 }
 
+ModulePassManager
+PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
+                                           bool DebugLogging) {
+  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+  ModulePassManager MPM(DebugLogging);
+
+  // Force any function attributes we want the rest of the pipeline to observe.
+  MPM.addPass(ForceFunctionAttrsPass());
+
+  // Add the core simplification pipeline.
+  MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging));
+
+  // Now add the optimization pipeline.
+  MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
+
+  return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
+                                                bool DebugLogging) {
+  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+  ModulePassManager MPM(DebugLogging);
+
+  // Force any function attributes we want the rest of the pipeline to observe.
+  MPM.addPass(ForceFunctionAttrsPass());
+
+  // If we are planning to perform ThinLTO later, we don't bloat the code with
+  // unrolling/vectorization/... now. Just simplify the module as much as we
+  // can.
+  MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging));
+
+  // Run partial inlining pass to partially inline functions that have
+  // large bodies.
+  // FIXME: It isn't clear whether this is really the right place to run this
+  // in ThinLTO. Because there is another canonicalization and simplification
+  // phase that will run after the thin link, running this here ends up with
+  // less information than will be available later and it may grow functions in
+  // ways that aren't beneficial.
+  if (RunPartialInlining)
+    MPM.addPass(PartialInlinerPass());
+
+  // Reduce the size of the IR as much as possible.
+  MPM.addPass(GlobalOptPass());
+
+  // Rename anon globals to be able to export them in the summary.
+  MPM.addPass(NameAnonGlobalPass());
+
+  return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level,
+                                         bool DebugLogging) {
+  // FIXME: The summary index is not hooked in the new pass manager yet.
+  // When it's going to be hooked, enable WholeProgramDevirt and LowerTypeTest
+  // here.
+
+  ModulePassManager MPM(DebugLogging);
+
+  // Force any function attributes we want the rest of the pipeline to observe.
+  MPM.addPass(ForceFunctionAttrsPass());
+
+  // During the ThinLTO backend phase we perform early indirect call promotion
+  // here, before globalopt. Otherwise imported available_externally functions
+  // look unreferenced and are removed.
+  MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */,
+                                       PGOOpt && PGOOpt->SamplePGO &&
+                                           !PGOOpt->ProfileUseFile.empty()));
+
+  // Add the core simplification pipeline.
+  MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging));
+
+  // Now add the optimization pipeline.
+  MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
+
+  return MPM;
+}
+
 ModulePassManager
 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level,
                                             bool DebugLogging) {
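
The simplification/optimization split above is what makes these pipelines
composable: the pre-link pipeline stops after simplification, and the backend
pipeline re-runs simplification on the post-import module before optimizing.
Roughly, a client would drive the two phases like this (a sketch only;
analysis-manager registration and the thin link itself are elided, names as of
this revision):

  #include "llvm/Passes/PassBuilder.h"
  using namespace llvm;

  void thinLTOCompile(Module &M, ModuleAnalysisManager &MAM) {
    PassBuilder PB;
    // Compile step: simplify only, then emit bitcode plus summary.
    ModulePassManager MPM =
        PB.buildThinLTOPreLinkDefaultPipeline(PassBuilder::O2,
                                              /*DebugLogging=*/false);
    MPM.run(M, MAM);
  }

  void thinLTOBackend(Module &M, ModuleAnalysisManager &MAM) {
    PassBuilder PB;
    // Backend step, after the thin link and function importing.
    ModulePassManager MPM =
        PB.buildThinLTODefaultPipeline(PassBuilder::O2,
                                       /*DebugLogging=*/false);
    MPM.run(M, MAM);
  }
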
@@ -893,9 +987,16 @@ static Optional<int> parseDevirtPassName(StringRef Name) {
   return Count;
 }
 
+/// Tests whether a pass name starts with a valid prefix for a default pipeline
+/// alias.
+static bool startsWithDefaultPipelineAliasPrefix(StringRef Name) {
+  return Name.startswith("default") || Name.startswith("thinlto") ||
+         Name.startswith("lto");
+}
+
 static bool isModulePassName(StringRef Name) {
   // Manually handle aliases for pre-configured pipeline fragments.
-  if (Name.startswith("default") || Name.startswith("lto"))
+  if (startsWithDefaultPipelineAliasPrefix(Name))
     return DefaultAliasRegex.match(Name);
 
   // Explicitly handle pass manager names.
@@ -1090,7 +1191,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
   }
 
   // Manually handle aliases for pre-configured pipeline fragments.
-  if (Name.startswith("default") || Name.startswith("lto")) {
+  if (startsWithDefaultPipelineAliasPrefix(Name)) {
     SmallVector<StringRef, 3> Matches;
     if (!DefaultAliasRegex.match(Name, &Matches))
      return false;
@@ -1109,6 +1210,10 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
 
     if (Matches[1] == "default") {
       MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
+    } else if (Matches[1] == "thinlto-pre-link") {
+      MPM.addPass(buildThinLTOPreLinkDefaultPipeline(L, DebugLogging));
+    } else if (Matches[1] == "thinlto") {
+      MPM.addPass(buildThinLTODefaultPipeline(L, DebugLogging));
     } else if (Matches[1] == "lto-pre-link") {
       MPM.addPass(buildLTOPreLinkDefaultPipeline(L, DebugLogging));
     } else {
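
With the regex and dispatch above, the new pipelines are reachable from any
textual pipeline description such as "thinlto-pre-link<O2>" or "thinlto<O2>"
(for example via opt's -passes option). A hedged sketch of the programmatic
equivalent; parsePassPipeline returns false on a parse error in this revision,
and the helper below is illustrative:

  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Support/ErrorHandling.h"
  using namespace llvm;

  ModulePassManager buildFromText(StringRef Text) {
    PassBuilder PB;
    ModulePassManager MPM(/*DebugLogging=*/false);
    // e.g. Text == "thinlto-pre-link<O2>" or "thinlto<O2>".
    if (!PB.parsePassPipeline(MPM, Text,
                              /*VerifyEachPass=*/false,
                              /*DebugLogging=*/false))
      report_fatal_error("invalid pipeline text: " + Text);
    return MPM;
  }
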
diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp
index 862232971162..bfb658cfa0b7 100644
--- a/lib/Support/BinaryStreamReader.cpp
+++ b/lib/Support/BinaryStreamReader.cpp
@@ -69,6 +69,26 @@ Error BinaryStreamReader::readCString(StringRef &Dest) {
   return Error::success();
 }
 
+Error BinaryStreamReader::readWideString(ArrayRef<UTF16> &Dest) {
+  uint32_t Length = 0;
+  uint32_t OriginalOffset = getOffset();
+  const UTF16 *C;
+  while (true) {
+    if (auto EC = readObject(C))
+      return EC;
+    if (*C == 0x0000)
+      break;
+    ++Length;
+  }
+  uint32_t NewOffset = getOffset();
+  setOffset(OriginalOffset);
+
+  if (auto EC = readArray(Dest, Length))
+    return EC;
+  setOffset(NewOffset);
+  return Error::success();
+}
+
 Error BinaryStreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
   ArrayRef<uint8_t> Bytes;
   if (auto EC = readBytes(Bytes, Length))
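
readWideString makes two passes over the stream: a first scan that reads one
UTF16 code unit at a time until the null terminator, then a rewind and a
single readArray over the measured length, leaving the reader positioned past
the terminator. The extra scan buys a copy-free ArrayRef view of the
underlying stream. Typical use looks like this sketch (the stream setup is
illustrative, not from this patch):

  #include "llvm/Support/BinaryByteStream.h"
  #include "llvm/Support/BinaryStreamReader.h"
  using namespace llvm;

  Error demo() {
    // L"hi" followed by a UTF-16 null terminator, little endian.
    uint8_t Bytes[] = {0x68, 0x00, 0x69, 0x00, 0x00, 0x00};
    BinaryByteStream Stream(Bytes, support::little);
    BinaryStreamReader Reader(Stream);
    ArrayRef<UTF16> Wide;
    if (auto EC = Reader.readWideString(Wide))
      return EC;
    // Here Wide.size() == 2 and Reader sits just past the terminator.
    return Error::success();
  }
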
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index fa28ba1b6ab6..ce638d453c19 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -381,6 +381,11 @@ static bool is_local_impl(struct STATVFS &Vfs) {
 #elif defined(__CYGWIN__)
   // Cygwin doesn't expose this information; would need to use Win32 API.
   return false;
+#elif defined(__sun)
+  // statvfs::f_basetype contains a null-terminated FSType name of the mounted target
+  StringRef fstype(Vfs.f_basetype);
+  // NFS is the only non-local fstype??
+  return !fstype.equals("nfs");
 #else
   return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
 #endif
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index f07208b1fb90..83f7147dc9f6 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -1572,12 +1572,6 @@ RecordVal::RecordVal(Init *N, RecTy *T, bool P)
   assert(Value && "Cannot create unset value for current type!");
 }
 
-RecordVal::RecordVal(StringRef N, RecTy *T, bool P)
-  : Name(StringInit::get(N)), TyAndPrefix(T, P) {
-  Value = UnsetInit::get()->convertInitializerTo(T);
-  assert(Value && "Cannot create unset value for current type!");
-}
-
 StringRef RecordVal::getName() const {
   return cast<StringInit>(getNameInit())->getValue();
 }
@@ -1603,8 +1597,7 @@ void Record::init() {
 
   // Every record potentially has a def at the top.  This value is
   // replaced with the top-level def name at instantiation time.
-  RecordVal DN("NAME", StringRecTy::get(), false);
-  addValue(DN);
+  addValue(RecordVal(StringInit::get("NAME"), StringRecTy::get(), false));
 }
 
 void Record::checkName() {
@@ -1640,10 +1633,6 @@ void Record::setName(Init *NewName) {
   //  this.  See TGParser::ParseDef and TGParser::ParseDefm.
 }
 
-void Record::setName(StringRef Name) {
-  setName(StringInit::get(Name));
-}
-
 void Record::resolveReferencesTo(const RecordVal *RV) {
   for (RecordVal &Value : Values) {
     if (RV == &Value) // Skip resolve the same field as the given one
@@ -1714,7 +1703,7 @@ Init *Record::getValueInit(StringRef FieldName) const {
   return R->getValue();
 }
 
-std::string Record::getValueAsString(StringRef FieldName) const {
+StringRef Record::getValueAsString(StringRef FieldName) const {
   const RecordVal *R = getValue(FieldName);
   if (!R || !R->getValue())
     PrintFatalError(getLoc(), "Record `" + getName() +
@@ -1793,10 +1782,10 @@ Record::getValueAsListOfInts(StringRef FieldName) const {
   return Ints;
 }
 
-std::vector<std::string>
+std::vector<StringRef>
 Record::getValueAsListOfStrings(StringRef FieldName) const {
   ListInit *List = getValueAsListInit(FieldName);
-  std::vector<std::string> Strings;
+  std::vector<StringRef> Strings;
   for (Init *I : List->getValues()) {
     if (StringInit *SI = dyn_cast<StringInit>(I))
       Strings.push_back(SI->getValue());
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 96015b06d798..b492cf9495c0 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -339,7 +339,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
     if (!IVal)
       return Error(Loc, "foreach iterator value is untyped");
 
-    IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
+    IterRec->addValue(RecordVal(IterVar->getNameInit(), IVal->getType(), false));
 
     if (SetValue(IterRec.get(), Loc, IterVar->getNameInit(), None, IVal))
       return Error(Loc, "when instantiating this def");
@@ -378,8 +378,8 @@ static bool isObjectStart(tgtok::TokKind K) {
 
 /// GetNewAnonymousName - Generate a unique anonymous name that can be used as
 /// an identifier.
-std::string TGParser::GetNewAnonymousName() {
-  return "anonymous_" + utostr(AnonCounter++);
+Init *TGParser::GetNewAnonymousName() {
+  return StringInit::get("anonymous_" + utostr(AnonCounter++));
 }
 
 /// ParseObjectName - If an object name is specified, return it.  Otherwise,
@@ -2350,7 +2350,7 @@ Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
   bool IsAnonymous = false;
   if (!DefmPrefix) {
-    DefmPrefix = StringInit::get(GetNewAnonymousName());
+    DefmPrefix = GetNewAnonymousName();
     IsAnonymous = true;
   }
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 76f7d8fe5026..1b2966c9f6c9 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -110,7 +110,7 @@ class TGParser {
   bool AddSubMultiClass(MultiClass *CurMC,
                         SubMultiClassReference &SubMultiClass);
 
-  std::string GetNewAnonymousName();
+  Init *GetNewAnonymousName();
 
   // IterRecord: Map an iterator name to a value.
   struct IterRecord {
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 4af5fef4287c..abe28460c83a 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -190,6 +190,7 @@ def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    FeatureCrypto,
                                    FeatureCustomCheapAsMoveHandling,
                                    FeatureFPARMv8,
+                                   FeatureFuseAES,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
@@ -226,6 +227,7 @@ def ProcA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
                                    FeatureCRC,
                                    FeatureCrypto,
                                    FeatureFPARMv8,
+                                   FeatureFuseAES,
                                    FeatureNEON,
                                    FeaturePerfMon
                                    ]>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 3fbbc0be682d..3b71cf8399a0 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel {
   let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
   let LoadLatency = 4; // Optimistic load cases.
   let MispredictPenalty = 14; // Minimum branch misprediction penalty.
-  let CompleteModel = 0; // Use the default model otherwise.
+  let CompleteModel = 1; // Use the default model otherwise.
 }
 
 //===----------------------------------------------------------------------===//
@@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
 def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
 
 def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
-def M1WriteLA : SchedWriteVariant<[SchedVar, SchedVar]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
 def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
 def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
-def M1WriteSA : SchedWriteVariant<[SchedVar, SchedVar]>;
 
@@ -125,13 +125,13 @@ def : WriteRes { let Latency = 0; }
 // Load instructions.
 def : WriteRes { let Latency = 4; }
 def : WriteRes { let Latency = 4; }
-def : SchedAlias;
+def : SchedAlias;
 
 // Store instructions.
 def : WriteRes { let Latency = 1; }
 def : WriteRes { let Latency = 1; }
 def : WriteRes { let Latency = 1; }
-def : SchedAlias;
+def : SchedAlias;
 
 // FP data instructions.
def : WriteRes { let Latency = 3; } @@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]> { let Latency = 2; } +def M1WriteVLDA : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 6; } +def M1WriteVLDB : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 7; } +def M1WriteVLDC : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 8; } +def M1WriteVLDD : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDE : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDF : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 10; + let ResourceCycles = [5]; } +def M1WriteVLDG : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDH : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDI : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 12; + let ResourceCycles = [6]; } +def M1WriteVLDJ : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDK : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDL : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDM : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDN : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 14; + let ResourceCycles = [7]; } + +def M1WriteVSTA : WriteSequence<[WriteVST], 2>; +def M1WriteVSTB : WriteSequence<[WriteVST], 3>; +def M1WriteVSTC : WriteSequence<[WriteVST], 4>; +def M1WriteVSTD : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 7; + let ResourceCycles = [7]; } +def M1WriteVSTE : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 8; + let ResourceCycles = [8]; } +def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 15; + let ResourceCycles = [15]; } +def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 16; + let ResourceCycles = [16]; } +def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 14; + let ResourceCycles = [14]; } +def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 17; + let ResourceCycles = [17]; } // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; @@ -360,8 +465,233 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64 def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. 
+def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[M1WriteVLDD, + WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; +def : InstRW<[M1WriteVLDE, + WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>; +def : InstRW<[M1WriteVLDH, + WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>; +def : InstRW<[M1WriteVLDL, + WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[M1WriteVLDB], 
(instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>; +def : InstRW<[M1WriteVLDM, + WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; // ASIMD store instructions. 
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : 
InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index cb3f72a524f5..d4a8cecdb29f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -256,9 +256,9 @@ namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: - AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -317,7 +317,7 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new AArch64PassConfig(this, PM); + return new AArch64PassConfig(*this, PM); } void AArch64PassConfig::addIRPasses() { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index f473944cd528..0959014812d8 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -503,40 +503,37 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (!FrameInfo.hasCalls()) { - Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || - MRI.isPhysRegUsed(AMDGPU::VCC_HI); - // If there are no calls, MachineRegisterInfo can tell us the used register - // count easily. + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. + + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; } - - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; - - return Info; } - llvm_unreachable("calls not implemented"); + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 
0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 48827f463997..596f02ae4a64 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -456,7 +456,7 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // Exceptions and StackMaps are not supported, so these passes will never do // anything. @@ -487,7 +487,7 @@ class AMDGPUPassConfig : public TargetPassConfig { class R600PassConfig final : public AMDGPUPassConfig { public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} ScheduleDAGInstrs *createMachineScheduler( @@ -503,7 +503,7 @@ class R600PassConfig final : public AMDGPUPassConfig { class GCNPassConfig final : public AMDGPUPassConfig { public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) {} GCNTargetMachine &getGCNTargetMachine() const { @@ -682,7 +682,7 @@ void R600PassConfig::addPreEmitPass() { } TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); + return new R600PassConfig(*this, PM); } //===----------------------------------------------------------------------===// @@ -844,6 +844,6 @@ void GCNPassConfig::addPreEmitPass() { } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); + return new GCNPassConfig(*this, PM); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 934bf7f31bab..a3c7c1982d0a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -69,7 +69,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { return -1; return 0; } - }; //===----------------------------------------------------------------------===// @@ -89,6 +88,10 @@ class R600TargetMachine final : public AMDGPUTargetMachine { TargetPassConfig *createPassConfig(PassManagerBase &PM) override; const R600Subtarget *getSubtargetImpl(const Function &) const override; + + bool isMachineVerifierClean() const override { + return false; + } }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index f5541e08e1b7..cc68c971b249 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -161,7 +161,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSel, ImmTyOpSelHi, ImmTyNegLo, - ImmTyNegHi + ImmTyNegHi, + ImmTySwizzle }; struct TokOp { @@ -474,6 +475,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isSWaitCnt() const; bool isHwreg() const; bool isSendMsg() const; + bool isSwizzle() const; bool isSMRDOffset8() const; bool isSMRDOffset20() const; bool isSMRDLiteralOffset() const; @@ -659,6 +661,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyOpSelHi: OS << "OpSelHi"; 
break; case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; + case ImmTySwizzle: OS << "Swizzle"; break; } } @@ -994,6 +997,12 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; + bool trySkipId(const StringRef Id); + bool trySkipToken(const AsmToken::TokenKind Kind); + bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg); + bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string"); + bool parseExpr(int64_t &Imm); + public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1003,6 +1012,19 @@ class AMDGPUAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseInterpAttr(OperandVector &Operands); OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg); + OperandMatchResultTy parseSwizzleOp(OperandVector &Operands); + bool parseSwizzleOffset(int64_t &Imm); + bool parseSwizzleMacro(int64_t &Imm); + bool parseSwizzleQuadPerm(int64_t &Imm); + bool parseSwizzleBitmaskPerm(int64_t &Imm); + bool parseSwizzleBroadcast(int64_t &Imm); + bool parseSwizzleSwap(int64_t &Imm); + bool parseSwizzleReverse(int64_t &Imm); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } @@ -2785,7 +2807,13 @@ void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, OptionalIdx[Op.getImmTy()] = i; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + AMDGPUOperand::ImmTy OffsetType = + (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si || + Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? 
AMDGPUOperand::ImmTySwizzle : + AMDGPUOperand::ImmTyOffset; + + addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); + if (!IsGdsHardcoded) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } @@ -3383,6 +3411,298 @@ bool AMDGPUOperand::isSendMsg() const { return isImmTy(ImmTySendMsg); } +//===----------------------------------------------------------------------===// +// parser helpers +//===----------------------------------------------------------------------===// + +bool +AMDGPUAsmParser::trySkipId(const StringRef Id) { + if (getLexer().getKind() == AsmToken::Identifier && + Parser.getTok().getString() == Id) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) { + if (getLexer().getKind() == Kind) { + Parser.Lex(); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind, + const StringRef ErrMsg) { + if (!trySkipToken(Kind)) { + Error(Parser.getTok().getLoc(), ErrMsg); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseExpr(int64_t &Imm) { + return !getParser().parseAbsoluteExpression(Imm); +} + +bool +AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) { + SMLoc S = Parser.getTok().getLoc(); + if (getLexer().getKind() == AsmToken::String) { + Val = Parser.getTok().getStringContents(); + Parser.Lex(); + return true; + } else { + Error(S, ErrMsg); + return false; + } +} + +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +LLVM_READNONE +static unsigned +encodeBitmaskPerm(const unsigned AndMask, + const unsigned OrMask, + const unsigned XorMask) { + using namespace llvm::AMDGPU::Swizzle; + + return BITMASK_PERM_ENC | + (AndMask << BITMASK_AND_SHIFT) | + (OrMask << BITMASK_OR_SHIFT) | + (XorMask << BITMASK_XOR_SHIFT); +} + +bool +AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op, + const unsigned MinVal, + const unsigned MaxVal, + const StringRef ErrMsg) { + for (unsigned i = 0; i < OpNum; ++i) { + if (!skipToken(AsmToken::Comma, "expected a comma")){ + return false; + } + SMLoc ExprLoc = Parser.getTok().getLoc(); + if (!parseExpr(Op[i])) { + return false; + } + if (Op[i] < MinVal || Op[i] > MaxVal) { + Error(ExprLoc, ErrMsg); + return false; + } + } + + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + int64_t Lane[LANE_NUM]; + if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX, + "expected a 2-bit lane id")) { + Imm = QUAD_PERM_ENC; + for (auto i = 0; i < LANE_NUM; ++i) { + Imm |= Lane[i] << (LANE_SHIFT * i); + } + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + int64_t LaneIdx; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, + "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + if (parseSwizzleOperands(1, &LaneIdx, + 0, GroupSize - 1, + "lane id must be in the interval [0,group size - 1]")) { + Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0); + return true; + } + return false; +} + +bool +AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) { + using namespace 
llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 2, 32, "group size must be in the interval [2,32]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + SMLoc S = Parser.getTok().getLoc(); + int64_t GroupSize; + + if (!parseSwizzleOperands(1, &GroupSize, + 1, 16, "group size must be in the interval [1,16]")) { + return false; + } + if (!isPowerOf2_64(GroupSize)) { + Error(S, "group size must be a power of two"); + return false; + } + + Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (!skipToken(AsmToken::Comma, "expected a comma")) { + return false; + } + + StringRef Ctl; + SMLoc StrLoc = Parser.getTok().getLoc(); + if (!parseString(Ctl)) { + return false; + } + if (Ctl.size() != BITMASK_WIDTH) { + Error(StrLoc, "expected a 5-character mask"); + return false; + } + + unsigned AndMask = 0; + unsigned OrMask = 0; + unsigned XorMask = 0; + + for (size_t i = 0; i < Ctl.size(); ++i) { + unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i); + switch(Ctl[i]) { + default: + Error(StrLoc, "invalid mask"); + return false; + case '0': + break; + case '1': + OrMask |= Mask; + break; + case 'p': + AndMask |= Mask; + break; + case 'i': + AndMask |= Mask; + XorMask |= Mask; + break; + } + } + + Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask); + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) { + + SMLoc OffsetLoc = Parser.getTok().getLoc(); + + if (!parseExpr(Imm)) { + return false; + } + if (!isUInt<16>(Imm)) { + Error(OffsetLoc, "expected a 16-bit offset"); + return false; + } + return true; +} + +bool +AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { + using namespace llvm::AMDGPU::Swizzle; + + if (skipToken(AsmToken::LParen, "expected a left parentheses")) { + + SMLoc ModeLoc = Parser.getTok().getLoc(); + bool Ok = false; + + if (trySkipId(IdSymbolic[ID_QUAD_PERM])) { + Ok = parseSwizzleQuadPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) { + Ok = parseSwizzleBitmaskPerm(Imm); + } else if (trySkipId(IdSymbolic[ID_BROADCAST])) { + Ok = parseSwizzleBroadcast(Imm); + } else if (trySkipId(IdSymbolic[ID_SWAP])) { + Ok = parseSwizzleSwap(Imm); + } else if (trySkipId(IdSymbolic[ID_REVERSE])) { + Ok = parseSwizzleReverse(Imm); + } else { + Error(ModeLoc, "expected a swizzle mode"); + } + + return Ok && skipToken(AsmToken::RParen, "expected a closing parentheses"); + } + + return false; +} + +OperandMatchResultTy +AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Imm = 0; + + if (trySkipId("offset")) { + + bool Ok = false; + if (skipToken(AsmToken::Colon, "expected a colon")) { + if (trySkipId("swizzle")) { + Ok = parseSwizzleMacro(Imm); + } else { + Ok = parseSwizzleOffset(Imm); + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle)); + + return Ok? 
MatchOperand_Success : MatchOperand_ParseFail; + } else { + return MatchOperand_NoMatch; + } +} + +bool +AMDGPUOperand::isSwizzle() const { + return isImmTy(ImmTySwizzle); +} + //===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index 357e18108e7e..fc516c3b39c2 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -145,10 +145,10 @@ class DS_1A2D_Off8_RET +class DS_1A_RET : DS_Pseudo { let has_data0 = 0; @@ -440,7 +440,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; } let mayStore = 0 in { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index a817ff3cbaf0..523eea41897e 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -1160,6 +1160,112 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, O << SImm16; // Unknown simm16 code. } +static void printSwizzleBitmask(const uint16_t AndMask, + const uint16_t OrMask, + const uint16_t XorMask, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask; + uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask; + + O << "\""; + + for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) { + uint16_t p0 = Probe0 & Mask; + uint16_t p1 = Probe1 & Mask; + + if (p0 == p1) { + if (p0 == 0) { + O << "0"; + } else { + O << "1"; + } + } else { + if (p0 == 0) { + O << "p"; + } else { + O << "i"; + } + } + } + + O << "\""; +} + +void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + using namespace llvm::AMDGPU::Swizzle; + + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm == 0) { + return; + } + + O << " offset:"; + + if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) { + + O << "swizzle(" << IdSymbolic[ID_QUAD_PERM]; + for (auto i = 0; i < LANE_NUM; ++i) { + O << ","; + O << formatDec(Imm & LANE_MASK); + Imm >>= LANE_SHIFT; + } + O << ")"; + + } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) { + + uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK; + uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK; + uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK; + + if (AndMask == BITMASK_MAX && + OrMask == 0 && + countPopulation(XorMask) == 1) { + + O << "swizzle(" << IdSymbolic[ID_SWAP]; + O << ","; + O << formatDec(XorMask); + O << ")"; + + } else if (AndMask == BITMASK_MAX && + OrMask == 0 && XorMask > 0 && + isPowerOf2_64(XorMask + 1)) { + + O << "swizzle(" << IdSymbolic[ID_REVERSE]; + O << ","; + O << formatDec(XorMask + 1); + O << ")"; + + } else { + + uint16_t GroupSize = BITMASK_MAX - AndMask + 1; + if (GroupSize > 1 && + isPowerOf2_64(GroupSize) && + OrMask < GroupSize && + XorMask == 0) { + + O << "swizzle(" << IdSymbolic[ID_BROADCAST]; + O << ","; + O << formatDec(GroupSize); + O << ","; + O << formatDec(OrMask); + O << ")"; + + } else { + O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM]; + O << ","; + 
printSwizzleBitmask(AndMask, OrMask, XorMask, O); + O << ")"; + } + } + } else { + printU16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index c0b8e5c51089..c8094c4b840a 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -193,6 +193,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 80967edee0ab..5cd90323ff67 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -281,6 +281,46 @@ enum WidthMinusOne { // WidthMinusOne, (5) [15:11] } // namespace Hwreg +namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32. + +enum Id { // id of symbolic names + ID_QUAD_PERM = 0, + ID_BITMASK_PERM, + ID_SWAP, + ID_REVERSE, + ID_BROADCAST +}; + +enum EncBits { + + // swizzle mode encodings + + QUAD_PERM_ENC = 0x8000, + QUAD_PERM_ENC_MASK = 0xFF00, + + BITMASK_PERM_ENC = 0x0000, + BITMASK_PERM_ENC_MASK = 0x8000, + + // QUAD_PERM encodings + + LANE_MASK = 0x3, + LANE_MAX = LANE_MASK, + LANE_SHIFT = 2, + LANE_NUM = 4, + + // BITMASK_PERM encodings + + BITMASK_MASK = 0x1F, + BITMASK_MAX = BITMASK_MASK, + BITMASK_WIDTH = 5, + + BITMASK_AND_SHIFT = 0, + BITMASK_OR_SHIFT = 5, + BITMASK_XOR_SHIFT = 10 +}; + +} // namespace Swizzle + namespace SDWA { enum SdwaSel { diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index b5e3ce3dfe3e..e22166d03e9a 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -826,7 +826,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { @@ -1149,8 +1150,10 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
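
The swizzle parser and printer above are inverses: the parser folds BROADCAST, SWAP and REVERSE into the single BITMASK_PERM encoding, and the printer recovers a readable form by probing each lane-id bit with 0 and 1 (the Probe0/Probe1 trick in printSwizzleBitmask). The round trip can be checked outside the compiler; the sketch below re-implements the encoding from the SIDefines.h constants and is illustrative, not the in-tree code.

    // Standalone sketch of the ds_swizzle_b32 BITMASK_PERM round trip,
    // mirroring the constants in llvm::AMDGPU::Swizzle (SIDefines.h).
    #include <cassert>
    #include <cstdio>
    #include <string>

    namespace {
    const unsigned BITMASK_WIDTH = 5;
    const unsigned BITMASK_MASK  = 0x1F;
    const unsigned AND_SHIFT = 0, OR_SHIFT = 5, XOR_SHIFT = 10;

    // offset = and | (or << 5) | (xor << 10); bit 15 clear = bitmask mode.
    unsigned encode(unsigned And, unsigned Or, unsigned Xor) {
      return (And << AND_SHIFT) | (Or << OR_SHIFT) | (Xor << XOR_SHIFT);
    }

    // Each lane-id bit becomes ((b & and) | or) ^ xor. Probing with b = 0
    // and b = 1 classifies every position: equal probes mean a forced
    // constant ('0'/'1'); unequal probes mean preserved ('p') or
    // inverted ('i') -- the same trick printSwizzleBitmask uses.
    std::string decode(unsigned Imm) {
      unsigned And = (Imm >> AND_SHIFT) & BITMASK_MASK;
      unsigned Or  = (Imm >> OR_SHIFT)  & BITMASK_MASK;
      unsigned Xor = (Imm >> XOR_SHIFT) & BITMASK_MASK;
      unsigned P0 = ((0 & And) | Or) ^ Xor;
      unsigned P1 = ((BITMASK_MASK & And) | Or) ^ Xor;
      std::string S;
      for (unsigned M = 1u << (BITMASK_WIDTH - 1); M != 0; M >>= 1)
        S += (P0 & M) == (P1 & M) ? ((P0 & M) ? '1' : '0')
                                  : ((P0 & M) ? 'i' : 'p');
      return S;
    }
    } // namespace

    int main() {
      // swizzle(BITMASK_PERM, "00p11"): 'p' sets the AND bit, '1' the OR bit.
      assert(decode(encode(0x04, 0x03, 0x00)) == "00p11");
      // swizzle(BROADCAST, 8, 3) as parseSwizzleBroadcast encodes it:
      // and = 31 - 8 + 1 keeps the group bits, or = 3 picks the lane.
      assert(decode(encode(24, 3, 0)) == "pp011");
      std::puts("round trip ok");
    }
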
- if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) { - if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + uint64_t TSFlags = Inst.getDesc().TSFlags; + if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { + if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && + TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { @@ -1183,7 +1186,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && - (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) { + (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { @@ -1715,6 +1718,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); MLI = &getAnalysis(); IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + const SIMachineFunctionInfo *MFI = MF.getInfo(); AMDGPUASI = ST->getAMDGPUAS(); HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1859,5 +1863,19 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + Modified = true; + } + return Modified; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c5287c7f64ba..445bf79a7814 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -383,6 +383,14 @@ def SendMsgMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +def SwizzleMatchClass : AsmOperandClass { + let Name = "Swizzle"; + let PredicateMethod = "isSwizzle"; + let ParserMethod = "parseSwizzleOp"; + let RenderMethod = "addImmOperands"; + let IsOptional = 1; +} + def ExpTgtMatchClass : AsmOperandClass { let Name = "ExpTgt"; let PredicateMethod = "isExpTgt"; @@ -395,6 +403,11 @@ def SendMsgImm : Operand { let ParserMatchClass = SendMsgMatchClass; } +def SwizzleImm : Operand { + let PrintMethod = "printSwizzle"; + let ParserMatchClass = SwizzleMatchClass; +} + def SWaitMatchClass : AsmOperandClass { let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index b91cdddc5520..a648c178101a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -66,6 +66,12 @@ class SIRegisterInfo final : public AMDGPURegisterInfo { const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; + // Stack access is very expensive. CSRs are also the high registers, and we + // want to minimize the number of used registers. 
+ unsigned getCSRFirstUseCost() const override { + return 100; + } + unsigned getFrameRegister(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index b6868de6a74e..03b11ae80500 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -65,5 +65,18 @@ const char* const IdSymbolic[] = { }; } // namespace Hwreg + +namespace Swizzle { + +// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + "QUAD_PERM", + "BITMASK_PERM", + "SWAP", + "REVERSE", + "BROADCAST", +}; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h index b2dc2c0e364c..ebb2be22b487 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax. extern const char* const IdSymbolic[]; } // namespace Hwreg + +namespace Swizzle { // Symbolic names for the swizzle(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace Swizzle } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 90baabcdb652..ec49f0d37af4 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -757,14 +757,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MI.eraseFromParent(); } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as -/// possible. This only gets used at -O0 so we don't care about efficiency of the -/// generated code. +/// possible. This only gets used at -O0 so we don't care about efficiency of +/// the generated code. bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, @@ -773,16 +768,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, bool IsThumb = STI->isThumb(); MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. 
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -795,25 +789,35 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, if (UxtOp) { MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg()) - .addReg(Desired.getReg(), RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg) + .addReg(DesiredReg, RegState::Kill); if (!IsThumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); } // .Lloadcmp: + // mov wStatus, #0 // ldrex rDest, [rAddr] // cmp rDest, rDesired // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); + if (!StatusDead) { + if (IsThumb) { + BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg) + .addDef(ARM::CPSR, RegState::Dead) + .addImm(0) + .add(predOps(ARMCC::AL)); + } else { + BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } + } MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg()); - MIB.addReg(Addr.getReg()); + MIB.addReg(AddrReg); if (LdrexOp == ARM::t2LDREX) MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); @@ -821,7 +825,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .add(predOps(ARMCC::AL)); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; BuildMI(LoadCmpBB, DL, TII->get(Bcc)) @@ -835,21 +839,16 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, // strex rStatus, rNew, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - - MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg); - MIB.add(New); - MIB.add(Addr); + MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); if (StrexOp == ARM::t2STREX) MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset. MIB.add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -861,12 +860,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. 
+ StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -894,19 +905,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DebugLoc DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + MachineOperand New = MI.getOperand(4); + New.setIsKill(false); unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0); unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1); - unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); - unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - - LivePhysRegs LiveRegs(TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0); + unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -922,26 +933,21 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // cmp rDestLo, rDesiredLo // sbcs rStatus, rDestHi, rDesiredHi // bne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD; MachineInstrBuilder MIB; MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD)); addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI); - MIB.addReg(Addr.getReg()).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr; BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestLo, getKillRegState(Dest.isDead())) - .addReg(DesiredLo, getKillRegState(Desired.isDead())) + .addReg(DesiredLo) .add(predOps(ARMCC::AL)); BuildMI(LoadCmpBB, DL, TII->get(CMPrr)) .addReg(DestHi, getKillRegState(Dest.isDead())) - .addReg(DesiredHi, getKillRegState(Desired.isDead())) + .addReg(DesiredHi) .addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill); unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc; @@ -956,18 +962,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, // strexd rStatus, rNewLo, rNewHi, [rAddr] // cmp rStatus, #0 // bne .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD; MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg); addExclusiveRegPair(MIB, New, 0, IsThumb, TRI); - MIB.add(Addr).add(predOps(ARMCC::AL)); + MIB.addReg(AddrReg).add(predOps(ARMCC::AL)); unsigned CMPri = IsThumb ? 
ARM::t2CMPri : ARM::CMPri; BuildMI(StoreBB, DL, TII->get(CMPri)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(StoreBB, DL, TII->get(Bcc)) @@ -979,12 +981,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 4f7a0ab4e220..c2b2502843c0 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -968,8 +968,9 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) continue; - bool isLiveIn = MF.getRegInfo().isLiveIn(Reg); - if (!isLiveIn) + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool isLiveIn = MRI.isLiveIn(Reg); + if (!isLiveIn && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); // If NoGap is true, push consecutive registers and then leave the rest // for other instructions. e.g. diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index bee83dfb6f63..423f97ccacd6 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -1413,7 +1413,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd), // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them // and make use of the same compressed jump table format as Thumb-2. -let Size = 2 in { +let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1, + isIndirectBranch = 1 in { def tTBB_JT : tPseudoInst<(outs), (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index bf3d820e7b7d..45471a4e95b3 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3494,7 +3494,8 @@ def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br, let AsmMatchConverter = "cvtThumbBranches"; } -let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in { +let Size = 4, isNotDuplicable = 1, isBranch = 1, isTerminator = 1, + isBarrier = 1, isIndirectBranch = 1 in { // available in both v8-M.Baseline and Thumb2 targets def t2BR_JT : t2basePseudoInst<(outs), diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index c4f23c66e4ea..f5e4043882ff 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -382,7 +382,7 @@ namespace { /// ARM Code Generator Pass Configuration Options. 
class ARMPassConfig : public TargetPassConfig { public: - ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM) + ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} ARMBaseTargetMachine &getARMTargetMachine() const { @@ -419,7 +419,7 @@ INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix", "ARM Execution Dependency Fix", false, false) TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARMPassConfig(this, PM); + return new ARMPassConfig(*this, PM); } void ARMPassConfig::addIRPasses() { diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index e5eb27114c72..2fcee73228fe 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -60,6 +60,10 @@ class ARMBaseTargetMachine : public LLVMTargetMachine { TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// ARM/Thumb little endian target machine. diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 40bf545e8322..b0d1d3fb9ef0 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -729,6 +729,15 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // linker can handle it. GNU AS produces an error in this case. if (Sym->isExternal() || Value >= 0x400004) IsResolved = false; + // When an ARM function is called from a Thumb function, produce a + // relocation so the linker will use the correct branch instruction for ELF + // binaries. + if (Sym->isELF()) { + unsigned Type = dyn_cast(Sym)->getType(); + if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) && + !Asm.isThumbFunc(Sym)) + IsResolved = false; + } } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index f917c35b9ceb..f10427e2ed57 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -698,13 +698,14 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, CopyRegs.insert(ArgReg); // Push the low registers and lr + const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!LoRegsToSave.empty()) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) { if (LoRegsToSave.count(Reg)) { - bool isKill = !MF.getRegInfo().isLiveIn(Reg); - if (isKill) + bool isKill = !MRI.isLiveIn(Reg); + if (isKill && !MRI.isReserved(Reg)) MBB.addLiveIn(Reg); MIB.addReg(Reg, getKillRegState(isKill)); @@ -746,8 +747,8 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, SmallVector RegsToPush; while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) { if (HiRegsToSave.count(*HiRegToSave)) { - bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave); - if (isKill) + bool isKill = !MRI.isLiveIn(*HiRegToSave); + if (isKill && !MRI.isReserved(*HiRegToSave)) MBB.addLiveIn(*HiRegToSave); // Emit a MOV from the high reg to the low reg. 
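
The rewritten ExpandCMP_SWAP above drops the manual live-register bookkeeping in favour of recomputing live-ins after the blocks are built, with a second computeLiveIns pass over StoreBB and LoadCmpBB because the loop carries registers; it also pre-zeroes the status register when it is live, so the early exit to .Ldone still defines it. At C level the emitted control flow is the usual load-exclusive/store-exclusive retry loop. A minimal model follows, with ldrex/strex emulated (always succeeding) so the sketch runs single-threaded; it shows the shape, not the real exclusive-monitor semantics.

    // C-level model of the ldrex/strex loop ExpandCMP_SWAP emits. The real
    // strex can fail (return 1) when the exclusive monitor is lost; the
    // emulation below always succeeds.
    #include <cassert>
    #include <cstdint>

    static uint32_t ldrex(volatile uint32_t *Addr) { return *Addr; }
    static uint32_t strex(uint32_t Val, volatile uint32_t *Addr) {
      *Addr = Val;
      return 0; // 0 = stored; hardware may return 1 = retry
    }

    // Returns the observed value; Status holds the last strex result and is
    // pre-zeroed so it is defined even on the early exit (the new
    // "mov wStatus, #0" emitted when the status register is not dead).
    static uint32_t cmp_swap(volatile uint32_t *Addr, uint32_t Desired,
                             uint32_t New, uint32_t &Status) {
      uint32_t Dest;
      do {
        Status = 0;                    //   mov   rStatus, #0
        Dest = ldrex(Addr);            // .Lloadcmp: ldrex rDest, [rAddr]
        if (Dest != Desired)           //   cmp   rDest, rDesired
          break;                       //   bne   .Ldone
        Status = strex(New, Addr);     // .Lstore: strex rStatus, rNew, [rAddr]
      } while (Status != 0);           //   cmp rStatus, #0; bne .Lloadcmp
      return Dest;                     // .Ldone:
    }

    int main() {
      volatile uint32_t X = 41;
      uint32_t Status;
      assert(cmp_swap(&X, 41, 42, Status) == 41 && X == 42); // exchanged
      assert(cmp_swap(&X, 0, 7, Status) == 42 && X == 42);   // compare failed
    }
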
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index ef9c00e4b784..7d3faac1dcc2 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -1500,9 +1500,9 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); // BB: - // cp 0, N + // cpi N, 0 // breq RemBB - BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0); + BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0); BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB); // LoopBB: diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index f10ca394f36c..5dd8b2c27b21 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -904,7 +904,7 @@ let Defs = [SREG] in // Compares a register with an 8 bit immediate. def CPIRdK : FRdK<0b0011, (outs), - (ins GPR8:$rd, imm_ldi8:$k), + (ins LD8:$rd, imm_ldi8:$k), "cpi\t$rd, $k", [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>; } diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index fb3262916b4f..2ab0b1080c6a 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -57,7 +57,7 @@ namespace { /// AVR Code Generator Pass Configuration Options. class AVRPassConfig : public TargetPassConfig { public: - AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM) + AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} AVRTargetMachine &getAVRTargetMachine() const { @@ -71,7 +71,7 @@ class AVRPassConfig : public TargetPassConfig { } // namespace TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) { - return new AVRPassConfig(this, PM); + return new AVRPassConfig(*this, PM); } extern "C" void LLVMInitializeAVRTarget() { diff --git a/lib/Target/AVR/AVRTargetMachine.h b/lib/Target/AVR/AVRTargetMachine.h index 10345193d14a..795e94e6af03 100644 --- a/lib/Target/AVR/AVRTargetMachine.h +++ b/lib/Target/AVR/AVRTargetMachine.h @@ -41,6 +41,10 @@ class AVRTargetMachine : public LLVMTargetMachine { TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool isMachineVerifierClean() const override { + return false; + } + private: std::unique_ptr TLOF; AVRSubtarget SubTarget; diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp index 897695633e46..cf8e73540904 100644 --- a/lib/Target/BPF/BPFTargetMachine.cpp +++ b/lib/Target/BPF/BPFTargetMachine.cpp @@ -58,7 +58,7 @@ namespace { // BPF Code Generator Pass Configuration Options. 
class BPFPassConfig : public TargetPassConfig { public: - BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM) + BPFPassConfig(BPFTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} BPFTargetMachine &getBPFTargetMachine() const { @@ -70,7 +70,7 @@ class BPFPassConfig : public TargetPassConfig { } TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { - return new BPFPassConfig(this, PM); + return new BPFPassConfig(*this, PM); } // Install an instruction selector pass using diff --git a/lib/Target/BPF/CMakeLists.txt b/lib/Target/BPF/CMakeLists.txt index e2654b0465df..4918653ff19d 100644 --- a/lib/Target/BPF/CMakeLists.txt +++ b/lib/Target/BPF/CMakeLists.txt @@ -4,7 +4,7 @@ tablegen(LLVM BPFGenRegisterInfo.inc -gen-register-info) tablegen(LLVM BPFGenInstrInfo.inc -gen-instr-info) tablegen(LLVM BPFGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM BPFGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM BPFGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM BPFGenDAGISel.inc -gen-dag-isel) tablegen(LLVM BPFGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM BPFGenCallingConv.inc -gen-callingconv) diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 8e10c521a77d..e4434136bf86 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -71,6 +71,9 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { return true; } + bool ComplexPatternFuncMutatesDAG() const override { + return true; + } void PreprocessISelDAG() override; void EmitFunctionEntryCode() override; @@ -81,6 +84,7 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { inline bool SelectAddrGP(SDValue &N, SDValue &R); bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP); bool SelectAddrFI(SDValue &N, SDValue &R); + bool DetectUseSxtw(SDValue &N, SDValue &R); StringRef getPassName() const override { return "Hexagon DAG->DAG Pattern Instruction Selection"; @@ -106,7 +110,6 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl); void SelectStore(SDNode *N); void SelectSHL(SDNode *N); - void SelectMul(SDNode *N); void SelectZeroExtend(SDNode *N); void SelectIntrinsicWChain(SDNode *N); void SelectIntrinsicWOChain(SDNode *N); @@ -118,7 +121,7 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { #include "HexagonGenDAGISel.inc" private: - bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src); + bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src); bool isOrEquivalentToAdd(const SDNode *N) const; bool isAlignedMemNode(const MemSDNode *N) const; bool isPositiveHalfWord(const SDNode *N) const; @@ -597,90 +600,6 @@ void HexagonDAGToDAGISel::SelectStore(SDNode *N) { SelectCode(ST); } -void HexagonDAGToDAGISel::SelectMul(SDNode *N) { - SDLoc dl(N); - - // %conv.i = sext i32 %tmp1 to i64 - // %conv2.i = sext i32 %add to i64 - // %mul.i = mul nsw i64 %conv2.i, %conv.i - // - // --- match with the following --- - // - // %mul.i = mpy (%tmp1, %add) - // - - if (N->getValueType(0) == MVT::i64) { - // Shifting a i64 signed multiply. - SDValue MulOp0 = N->getOperand(0); - SDValue MulOp1 = N->getOperand(1); - - SDValue OP0; - SDValue OP1; - - // Handle sign_extend and sextload. 
- if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext0 = MulOp0.getOperand(0); - if (Sext0.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP0 = Sext0; - } else if (MulOp0.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast(MulOp0.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Same goes for the second operand. - if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) { - SDValue Sext1 = MulOp1.getOperand(0); - if (Sext1.getNode()->getValueType(0) != MVT::i32) { - SelectCode(N); - return; - } - OP1 = Sext1; - } else if (MulOp1.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast(MulOp1.getNode()); - if (LD->getMemoryVT() != MVT::i32 || - LD->getExtensionType() != ISD::SEXTLOAD || - LD->getAddressingMode() != ISD::UNINDEXED) { - SelectCode(N); - return; - } - SDValue Chain = LD->getChain(); - SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); - OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32, - MVT::Other, - LD->getBasePtr(), TargetConst0, - Chain), 0); - } else { - SelectCode(N); - return; - } - - // Generate a mpy instruction. - SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl, - MVT::i64, OP0, OP1); - ReplaceNode(N, Result); - return; - } - - SelectCode(N); -} - void HexagonDAGToDAGISel::SelectSHL(SDNode *N) { SDLoc dl(N); SDValue Shl_0 = N->getOperand(0); @@ -843,7 +762,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { SDValue V = N->getOperand(1); SDValue U; - if (isValueExtension(V, Bits, U)) { + if (keepsLowBits(V, Bits, U)) { SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), N->getOperand(0), U); ReplaceNode(N, R.getNode()); @@ -949,7 +868,6 @@ void HexagonDAGToDAGISel::Select(SDNode *N) { case ISD::SHL: return SelectSHL(N); case ISD::LOAD: return SelectLoad(N); case ISD::STORE: return SelectStore(N); - case ISD::MUL: return SelectMul(N); case ISD::ZERO_EXTEND: return SelectZeroExtend(N); case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N); case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N); @@ -1327,7 +1245,7 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { } // Match a frame index that can be used in an addressing mode. -bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) { +bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) { if (N.getOpcode() != ISD::FrameIndex) return false; auto &HFI = *HST->getFrameLowering(); @@ -1388,16 +1306,83 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R, return false; } -bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, - unsigned FromBits, SDValue &Src) { +bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { + // This (complex pattern) function is meant to detect a sign-extension + // i32->i64 on a per-operand basis. This would allow writing single + // patterns that would cover a number of combinations of different ways + // a sign-extensions could be written. 
For example: + // (mul (DetectUseSxtw x) (DetectUseSxtw y)) -> (M2_dpmpyss_s0 x y) + // could match either one of these: + // (mul (sext x) (sext_inreg y)) + // (mul (sext-load *p) (sext_inreg y)) + // (mul (sext_inreg x) (sext y)) + // etc. + // + // The returned value will have type i64 and its low word will + // contain the value being extended. The high bits are not specified. + // The returned type is i64 because the original type of N was i64, + // but the users of this function should only use the low-word of the + // result, e.g. + // (mul sxtw:x, sxtw:y) -> (M2_dpmpyss_s0 (LoReg sxtw:x), (LoReg sxtw:y)) + + if (N.getValueType() != MVT::i64) + return false; + EVT SrcVT; + unsigned Opc = N.getOpcode(); + switch (Opc) { + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + // sext_inreg has the source type as a separate operand. + EVT T = Opc == ISD::SIGN_EXTEND + ? N.getOperand(0).getValueType() + : cast(N.getOperand(1))->getVT(); + if (T.getSizeInBits() != 32) + return false; + R = N.getOperand(0); + break; + } + case ISD::LOAD: { + LoadSDNode *L = cast(N); + if (L->getExtensionType() != ISD::SEXTLOAD) + return false; + // All extending loads extend to i32, so even if the value in + // memory is shorter than 32 bits, it will be i32 after the load. + if (L->getMemoryVT().getSizeInBits() > 32) + return false; + R = N; + break; + } + default: + return false; + } + EVT RT = R.getValueType(); + if (RT == MVT::i64) + return true; + assert(RT == MVT::i32); + // This is only to produce a value of type i64. Do not rely on the + // high bits produced by this. + const SDLoc &dl(N); + SDValue Ops[] = { + CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32), + R, CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32) + }; + SDNode *T = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, + MVT::i64, Ops); + R = SDValue(T, 0); + return true; +} + +bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits, + SDValue &Src) { unsigned Opc = Val.getOpcode(); switch (Opc) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: { - SDValue const &Op0 = Val.getOperand(0); + const SDValue &Op0 = Val.getOperand(0); EVT T = Op0.getValueType(); - if (T.isInteger() && T.getSizeInBits() == FromBits) { + if (T.isInteger() && T.getSizeInBits() == NumBits) { Src = Op0; return true; } @@ -1408,23 +1393,23 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, case ISD::AssertZext: if (Val.getOperand(0).getValueType().isInteger()) { VTSDNode *T = cast(Val.getOperand(1)); - if (T->getVT().getSizeInBits() == FromBits) { + if (T->getVT().getSizeInBits() == NumBits) { Src = Val.getOperand(0); return true; } } break; case ISD::AND: { - // Check if this is an AND with "FromBits" of lower bits set to 1. - uint64_t FromMask = (1 << FromBits) - 1; + // Check if this is an AND with NumBits of lower bits set to 1. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast(Val.getOperand(0))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast(Val.getOperand(1))) { - if (C->getZExtValue() == FromMask) { + if (C->getZExtValue() == Mask) { Src = Val.getOperand(0); return true; } @@ -1433,16 +1418,16 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val, } case ISD::OR: case ISD::XOR: { - // OR/XOR with the lower "FromBits" bits set to 0. 
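
DetectUseSxtw exists so that a single pattern covers every spelling of an i32->i64 sign-extension: once an operand is known to be sign-extended, M2_dpmpyss_s0 only ever reads its low word. The identity being relied on, as a checkable sketch (the helper name is illustrative, not LLVM API):

    // Invariant behind (mul Sext64:$Rs, Sext64:$Rt) -> M2_dpmpyss_s0
    // (LoReg ...): a widening signed multiply needs only the low 32 bits
    // of each sign-extended operand.
    #include <cassert>
    #include <cstdint>

    // What M2_dpmpyss_s0 computes: signed 32 x 32 -> 64 from the low words.
    static int64_t dpmpyss_s0(int64_t Rs, int64_t Rt) {
      int32_t LoS = static_cast<int32_t>(Rs); // (LoReg $Rs)
      int32_t LoT = static_cast<int32_t>(Rt); // (LoReg $Rt)
      return static_cast<int64_t>(LoS) * static_cast<int64_t>(LoT);
    }

    int main() {
      int32_t A = -100000, B = 300000;
      // Two of the forms the complex pattern folds: sext and sext_inreg.
      int64_t SextA = static_cast<int64_t>(A);               // (sext A)
      int64_t SextB = (static_cast<int64_t>(B) << 32) >> 32; // sext_inreg
      assert(dpmpyss_s0(SextA, SextB) == SextA * SextB);
      assert(dpmpyss_s0(SextA, SextB) == -30000000000LL);
    }
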
- uint64_t FromMask = (1 << FromBits) - 1; + // OR/XOR with the lower NumBits bits set to 0. + uint64_t Mask = (1 << NumBits) - 1; if (ConstantSDNode *C = dyn_cast(Val.getOperand(0))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(1); return true; } } if (ConstantSDNode *C = dyn_cast(Val.getOperand(1))) { - if ((C->getZExtValue() & FromMask) == 0) { + if ((C->getZExtValue() & Mask) == 0) { Src = Val.getOperand(0); return true; } diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 5ecf9320d5c2..4c6c6eeafbe0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1928,11 +1928,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BSWAP, MVT::i64, Legal); - - // We custom lower i64 to i64 mul, so that it is not considered as a legal - // operation. There is a pattern that will match i64 mul and transform it - // to a series of instructions. - setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MUL, MVT::i64, Legal); for (unsigned IntExpOp : { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 66e07c67958e..0fef91ec4d3e 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1769,161 +1769,6 @@ bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const { return getType(MI) == HexagonII::TypeCJ && MI.isBranch(); } -bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const { - return (MI.isBranch() && isPredicated(MI)) || - isConditionalTransfer(MI) || - isConditionalALU32(MI) || - isConditionalLoad(MI) || - // Predicated stores which don't have a .new on any operands. 
- (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) && - !isPredicatedNew(MI)); -} - -bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddif: - case Hexagon::A2_paddifnew: - case Hexagon::A2_paddit: - case Hexagon::A2_padditnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::C2_ccombinewf: - case Hexagon::C2_ccombinewt: - return true; - } - return false; -} - -// FIXME - Function name and it's functionality don't match. -// It should be renamed to hasPredNewOpcode() -bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const { - if (!MI.getDesc().mayLoad() || !isPredicated(MI)) - return false; - - int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode()); - // Instruction with valid predicated-new opcode can be promoted to .new. - return PNewOpcode >= 0; -} - -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. -bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. 
- case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - } -} - -bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - case Hexagon::A2_tfrpt: - case Hexagon::A2_tfrpf: - return true; - - default: - return false; - } - return false; -} - // TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle // isFPImm and later getFPImm as well. bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const { @@ -3474,6 +3319,8 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const { // Returns the opcode to use when converting MI, which is a conditional jump, // into a conditional instruction which uses the .new value of the predicate. // We also use branch probabilities to add a hint to the jump. +// If MBPI is null, all edges will be treated as equally likely for the +// purposes of establishing a predication hint. int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, const MachineBranchProbabilityInfo *MBPI) const { // We assume that block can have at most two successors. @@ -3482,9 +3329,16 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, bool Taken = false; const BranchProbability OneHalf(1, 2); + auto getEdgeProbability = [MBPI] (const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) { + if (MBPI) + return MBPI->getEdgeProbability(Src, Dst); + return BranchProbability(1, Src->succ_size()); + }; + if (BrTarget.isMBB()) { const MachineBasicBlock *Dst = BrTarget.getMBB(); - Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf; + Taken = getEdgeProbability(Src, Dst) >= OneHalf; } else { // The branch target is not a basic block (most likely a function). 
// Since BPI only gives probabilities for targets that are basic blocks, @@ -3521,7 +3375,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, for (const MachineBasicBlock *SB : B.successors()) { if (!B.isLayoutSuccessor(SB)) continue; - Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf; + Taken = getEdgeProbability(Src, SB) < OneHalf; break; } } else { @@ -3534,7 +3388,7 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI, BT = Op.getMBB(); break; } - Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf; + Taken = BT && getEdgeProbability(Src, BT) < OneHalf; } } // if (!Bad) } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 97b9bc954688..944d0161a7c8 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -314,11 +314,6 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { bool isAccumulator(const MachineInstr &MI) const; bool isComplex(const MachineInstr &MI) const; bool isCompoundBranchInstr(const MachineInstr &MI) const; - bool isCondInst(const MachineInstr &MI) const; - bool isConditionalALU32 (const MachineInstr &MI) const; - bool isConditionalLoad(const MachineInstr &MI) const; - bool isConditionalStore(const MachineInstr &MI) const; - bool isConditionalTransfer(const MachineInstr &MI) const; bool isConstExtended(const MachineInstr &MI) const; bool isDeallocRet(const MachineInstr &MI) const; bool isDependent(const MachineInstr &ProdMI, diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index e6ea67d55b43..9aa185fc85a6 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -59,6 +59,9 @@ cl::opt HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination.")); +static cl::opt SimplifyLimit("hlir-simplify-limit", cl::init(10000), + cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR")); + static const char *HexagonVolatileMemcpyName = "hexagon_memcpy_forward_vp4cp4n2"; @@ -477,7 +480,7 @@ Value *Simplifier::simplify(Context &C) { WorkListType Q; Q.push_back(C.Root); unsigned Count = 0; - const unsigned Limit = 100000; + const unsigned Limit = SimplifyLimit; while (!Q.empty()) { if (Count++ >= Limit) @@ -501,8 +504,7 @@ Value *Simplifier::simplify(Context &C) { Q.push_back(Op); } } - assert(Count < Limit && "Infinite loop in HLIR/simplify?"); - return C.Root; + return Count < Limit ? C.Root : nullptr; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 81b5e10c1173..70ed123bc898 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -382,48 +382,42 @@ def: T_MType_acc_pat3 ; def: T_MType_acc_pat3 ; def: T_MType_acc_pat3 ; +// This complex pattern is really only to detect various forms of +// sign-extension i32->i64. The selected value will be of type i64 +// whose low word is the value being extended. The high word is +// unspecified. +def Usxtw : ComplexPattern; + def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>; -def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>; def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>; +def Sext64: PatLeaf<(i64 Usxtw:$Rs)>; -// Return true if for a 32 to 64-bit sign-extended load. 
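
Two small robustness changes above share a theme: Simplifier::simplify now reports hitting its step limit by returning nullptr instead of asserting, and getDotNewPredJumpOp tolerates a null MBPI by treating every edge as equally likely. The latter guard pattern in isolation, with stand-in types rather than the LLVM classes:

    // Sketch of the "optional analysis" guard: when no branch-probability
    // info is supplied, fall back to a uniform distribution over successors.
    #include <cassert>
    #include <cstdint>

    struct Prob { // stand-in for llvm::BranchProbability
      unsigned N, D;
      bool operator>=(Prob O) const {
        return uint64_t(N) * O.D >= uint64_t(O.N) * D;
      }
    };

    struct Block { unsigned NumSucc; };

    struct Analysis { // stand-in for MachineBranchProbabilityInfo
      Prob edgeProb(const Block &, unsigned) const { return {9, 10}; }
    };

    static bool takenHint(const Analysis *MBPI, const Block &Src,
                          unsigned Dst) {
      auto getEdgeProbability = [MBPI](const Block &S, unsigned D) {
        if (MBPI)
          return MBPI->edgeProb(S, D);
        return Prob{1, S.NumSucc}; // no info: every edge equally likely
      };
      return getEdgeProbability(Src, Dst) >= Prob{1, 2};
    }

    int main() {
      Block B{2};
      Analysis A;
      assert(takenHint(&A, B, 0));      // 9/10 >= 1/2: hint "taken"
      assert(takenHint(nullptr, B, 0)); // 1/2 >= 1/2: still "taken"
    }
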
-def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{ - LoadSDNode *LD = dyn_cast(N); - if (!LD) - return false; - return LD->getExtensionType() == ISD::SEXTLOAD && - LD->getMemoryVT().getScalarType() == MVT::i32; -}]>; +def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)), + (M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>; -def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)), - (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>; - -def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)), - (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>; - -def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2), - (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>; +def: Pat<(mul Sext64:$Rs, Sext64:$Rt), + (M2_dpmpyss_s0 (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; // Multiply and accumulate, use full result. // Rxx[+-]=mpy(Rs,Rt) -def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_acc_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))), - (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)), + (M2_dpmpyss_nac_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>; -def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(add I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; -def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))), - (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>; +def: Pat<(sub I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))), + (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>; class Storepi_pat @@ -545,7 +539,8 @@ def: Storexm_simple_pat; def: Storexm_simple_pat; def: Storexm_simple_pat; -def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext I32:$src)), (A2_sxtw I32:$src)>; +def: Pat <(i64 (sext_inreg I64:$src, i32)), (A2_sxtw (LoReg I64:$src))>; def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src), (A2_abs IntRegs:$src)>; @@ -1159,8 +1154,8 @@ multiclass MinMax_pats_p { defm: T_MinMax_pats; } -def: Pat<(add (Sext64 I32:$Rs), I64:$Rt), - (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>; +def: Pat<(add Sext64:$Rs, I64:$Rt), + (A2_addsp (LoReg Sext64:$Rs), DoubleRegs:$Rt)>; let AddedComplexity = 200 in { defm: MinMax_pats_p; diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 8e93df6201ae..14ecf297d351 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -223,7 +223,7 @@ namespace { /// Hexagon Code Generator Pass Configuration Options. 
class HexagonPassConfig : public TargetPassConfig { public: - HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM) + HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} HexagonTargetMachine &getHexagonTargetMachine() const { @@ -245,7 +245,7 @@ class HexagonPassConfig : public TargetPassConfig { } // namespace TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { - return new HexagonPassConfig(this, PM); + return new HexagonPassConfig(*this, PM); } void HexagonPassConfig::addIRPasses() { diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cd474921d4bc..fa08afe4019d 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -273,25 +273,17 @@ bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI, if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - // Check if this is a predicate dependence. - const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) - return true; - - // Assumes that the first operand of the CALLr is the function address. - if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { - const MachineOperand MO = MI.getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) - return true; + // Call-like instructions can be packetized with preceding instructions + // that define registers implicitly used or modified by the call. Explicit + // uses are still prohibited, as in the case of indirect calls: + // r0 = ... + // J2_jumpr r0 + if (DepType == SDep::Data) { + for (const MachineOperand MO : MI.operands()) + if (MO.isReg() && MO.getReg() == DepReg && !MO.isImplicit()) + return true; } - if (HII->isJumpR(MI)) { - const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1) - : MI.getOperand(0); - assert(MO.isReg() && MO.isUse()); - if (MO.getReg() == DepReg) - return true; - } return false; } @@ -333,11 +325,13 @@ bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI, const TargetRegisterClass *NewRC) { // Vector stores can be predicated, and can be new-value stores, but // they cannot be predicated on a .new predicate value. - if (NewRC == &Hexagon::PredRegsRegClass) + if (NewRC == &Hexagon::PredRegsRegClass) { if (HII->isHVXVec(MI) && MI.mayStore()) return false; - return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() || - HII->mayBeNewStore(MI); + return HII->isPredicated(MI) && HII->getDotNewPredOp(MI, nullptr) > 0; + } + // If the class is not PredRegs, it could only apply to new-value stores. + return HII->mayBeNewStore(MI); } // Promote an instructiont to its .cur form. 
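
The PassConfig constructor change recurs for every target touched by this patch (ARM, AVR, BPF, Hexagon, Lanai, MSP430, Mips, NVPTX): TargetPassConfig and its subclasses now take the TargetMachine by reference, and each createPassConfig passes *this. A simplified sketch of the shape, using stand-in classes rather than the LLVM ones:

    // Passing the target machine by reference documents that it can never
    // be null and removes pointer dereferences at every use.
    #include <cassert>

    struct TargetMachineStub { int OptLevel = 2; };

    struct PassConfigBase {
      explicit PassConfigBase(TargetMachineStub &TM) : TM(TM) {}
      TargetMachineStub &TM; // previously: TargetMachineStub *TM
    };

    struct TargetPassConfigStub : PassConfigBase {
      // Old: TargetPassConfigStub(TargetMachineStub *TM) took a pointer.
      explicit TargetPassConfigStub(TargetMachineStub &TM)
          : PassConfigBase(TM) {}
      int optLevel() const { return TM.OptLevel; } // no null check needed
    };

    struct TargetMachineWithConfig : TargetMachineStub {
      // Matches the updated createPassConfig: new XXXPassConfig(*this, PM).
      TargetPassConfigStub *createPassConfig() {
        return new TargetPassConfigStub(*this);
      }
    };

    int main() {
      TargetMachineWithConfig TM;
      TargetPassConfigStub *PC = TM.createPassConfig();
      assert(PC->optLevel() == 2);
      delete PC;
    }
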
@@ -760,11 +754,14 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI, return false; } -static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) { +static bool isImplicitDependency(const MachineInstr &I, bool CheckDef, + unsigned DepReg) { for (auto &MO : I.operands()) { - if (MO.isRegMask() && MO.clobbersPhysReg(DepReg)) + if (CheckDef && MO.isRegMask() && MO.clobbersPhysReg(DepReg)) return true; - if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + if (!MO.isReg() || MO.getReg() != DepReg || !MO.isImplicit()) + continue; + if (CheckDef == MO.isDef()) return true; } return false; @@ -798,7 +795,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // If dependency is trough an implicitly defined register, we should not // newify the use. - if (isImplicitDependency(PI, DepReg)) + if (isImplicitDependency(PI, true, DepReg) || + isImplicitDependency(MI, false, DepReg)) return false; const MCInstrDesc& MCID = PI.getDesc(); @@ -808,8 +806,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, // predicate .new if (RC == &Hexagon::PredRegsRegClass) - if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn()) - return HII->predCanBeUsedAsDotNew(PI, DepReg); + return HII->predCanBeUsedAsDotNew(PI, DepReg); if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) return false; diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp index 2a9bc25d7fad..a2f005ce445a 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -76,7 +76,7 @@ namespace { // Lanai Code Generator Pass Configuration Options. class LanaiPassConfig : public TargetPassConfig { public: - LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager) + LanaiPassConfig(LanaiTargetMachine &TM, PassManagerBase *PassManager) : TargetPassConfig(TM, *PassManager) {} LanaiTargetMachine &getLanaiTargetMachine() const { @@ -91,7 +91,7 @@ class LanaiPassConfig : public TargetPassConfig { TargetPassConfig * LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) { - return new LanaiPassConfig(this, &PassManager); + return new LanaiPassConfig(*this, &PassManager); } // Install an instruction selector pass. diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h index 5278c70d909d..083ba6fdf841 100644 --- a/lib/Target/Lanai/LanaiTargetMachine.h +++ b/lib/Target/Lanai/LanaiTargetMachine.h @@ -49,6 +49,10 @@ class LanaiTargetMachine : public LLVMTargetMachine { TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // namespace llvm diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index bebe5fa35ad4..d8fdc8ba674e 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -52,7 +52,7 @@ namespace { /// MSP430 Code Generator Pass Configuration Options. 
class MSP430PassConfig : public TargetPassConfig { public: - MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM) + MSP430PassConfig(MSP430TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} MSP430TargetMachine &getMSP430TargetMachine() const { @@ -65,7 +65,7 @@ class MSP430PassConfig : public TargetPassConfig { } // namespace TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) { - return new MSP430PassConfig(this, PM); + return new MSP430PassConfig(*this, PM); } bool MSP430PassConfig::addInstSelector() { diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index e7ceca9612a9..a222080f6b81 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -1,4 +1,4 @@ -//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===// +//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,20 +11,29 @@ // //===----------------------------------------------------------------------===// -#include "Mips16FrameLowering.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "Mips16FrameLowering.h" #include "Mips16InstrInfo.h" #include "MipsInstrInfo.h" #include "MipsRegisterInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetOptions.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetFrameLowering.h" +#include <cassert> +#include <cstdint> +#include <vector> using namespace llvm; @@ -63,7 +72,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (CSI.size()) { + if (!CSI.empty()) { const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), @@ -80,7 +89,6 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, if (hasFP(MF)) BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0) .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup); - } void Mips16FrameLowering::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 092de216e9b8..a9d6ab055892 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -201,7 +201,7 @@ namespace { /// Mips Code Generator Pass Configuration Options. class MipsPassConfig : public TargetPassConfig { public: - MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM) + MipsPassConfig(MipsTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { // The current implementation of long branch pass requires a scratch // register ($at) to be available before branch instructions.
Tail merging @@ -227,7 +227,7 @@ class MipsPassConfig : public TargetPassConfig { } // end anonymous namespace TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) { - return new MipsPassConfig(this, PM); + return new MipsPassConfig(*this, PM); } void MipsPassConfig::addIRPasses() { diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 140d7133f879..a3462868cb11 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -66,6 +66,10 @@ class MipsTargetMachine : public LLVMTargetMachine { bool isLittleEndian() const { return isLittle; } const MipsABIInfo &getABI() const { return ABI; } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Mips32/64 big endian target machine. diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index ab5298d0dcfd..8dfbfece9b8e 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -132,7 +132,7 @@ namespace { class NVPTXPassConfig : public TargetPassConfig { public: - NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) + NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} NVPTXTargetMachine &getNVPTXTargetMachine() const { @@ -163,7 +163,7 @@ class NVPTXPassConfig : public TargetPassConfig { } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { - return new NVPTXPassConfig(this, PM); + return new NVPTXPassConfig(*this, PM); } void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h index 1ed8e3b1e935..2f3981be22f8 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -65,6 +65,9 @@ class NVPTXTargetMachine : public LLVMTargetMachine { TargetIRAnalysis getTargetIRAnalysis() override; + bool isMachineVerifierClean() const override { + return false; + } }; // NVPTXTargetMachine. class NVPTXTargetMachine32 : public NVPTXTargetMachine { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5fa7b2c6bfb1..54414457388d 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -77,6 +77,11 @@ STATISTIC(SignExtensionsAdded, "Number of sign extensions for compare inputs added."); STATISTIC(ZeroExtensionsAdded, "Number of zero extensions for compare inputs added."); +STATISTIC(NumLogicOpsOnComparison, + "Number of logical ops on i1 values calculated in GPR."); +STATISTIC(OmittedForNonExtendUses, + "Number of compares not eliminated as they have non-extending uses."); + // FIXME: Remove this once the bug has been fixed! 
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -275,6 +280,8 @@ namespace { bool trySETCC(SDNode *N); bool tryEXTEND(SDNode *N); + bool tryLogicOpOfCompares(SDNode *N); + SDValue computeLogicOpInGPR(SDValue LogicOp); SDValue signExtendInputIfNeeded(SDValue Input); SDValue zeroExtendInputIfNeeded(SDValue Input); SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv); @@ -282,6 +289,10 @@ namespace { int64_t RHSValue, SDLoc dl); SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, int64_t RHSValue, SDLoc dl); + SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); + SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC, + int64_t RHSValue, SDLoc dl); SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts); void PeepholePPC64(); @@ -2501,6 +2512,11 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { return true; } +// Is this opcode a bitwise logical operation? +static bool isLogicOp(unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; +} + /// If this node is a sign/zero extension of an integer comparison, /// it can usually be computed in GPR's rather than using comparison /// instructions and ISEL. We only do this on 64-bit targets for now @@ -2513,13 +2529,20 @@ bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) { N->getOpcode() == ISD::SIGN_EXTEND) && "Expecting a zero/sign extend node!"); - if (N->getOperand(0).getOpcode() != ISD::SETCC) + SDValue WideRes; + // If we are zero-extending the result of a logical operation on i1 + // values, we can keep the values in GPRs. + if (isLogicOp(N->getOperand(0).getOpcode()) && + N->getOperand(0).getValueType() == MVT::i1 && + N->getOpcode() == ISD::ZERO_EXTEND) + WideRes = computeLogicOpInGPR(N->getOperand(0)); + else if (N->getOperand(0).getOpcode() != ISD::SETCC) return false; - - SDValue WideRes = - getSETCCInGPR(N->getOperand(0), - N->getOpcode() == ISD::SIGN_EXTEND ? - SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); + else + WideRes = + getSETCCInGPR(N->getOperand(0), + N->getOpcode() == ISD::SIGN_EXTEND ? + SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig); if (!WideRes) return false; @@ -2540,6 +2563,159 @@ return true; } +// Lower a logical operation on i1 values into a GPR sequence if possible. +// The result can be kept in a GPR if requested. +// Three types of inputs can be handled: +// - SETCC +// - TRUNCATE +// - Logical operation (AND/OR/XOR) +// There is also a special case that is handled (namely a complement operation +// achieved with xor %a, -1). +SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) { + assert(isLogicOp(LogicOp.getOpcode()) && + "Can only handle logic operations here.") + assert(LogicOp.getValueType() == MVT::i1 && + "Can only handle logic operations on i1 values here.") + SDLoc dl(LogicOp); + SDValue LHS, RHS; + + // Special case: xor %a, -1 + bool IsBitwiseNegation = isBitwiseNot(LogicOp); + + // Produces a GPR sequence for each operand of the binary logic operation. + // For SETCC, it produces the respective comparison, for TRUNCATE it truncates + // the value in a GPR and for logic operations, it will recursively produce + // a GPR sequence for the operation.
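// A note on the TRUNCATE case handled below: an i1 already held in a GPR is
// re-materialized with rldicl rT, rS, 0, 63 (rotate by zero, mask from bit 63),
// which computes rT = rS & 1, leaving the boolean in the low bit and clearing
// every higher bit.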
+ auto getLogicOperand = [&] (SDValue Operand) -> SDValue { + unsigned OperandOpcode = Operand.getOpcode(); + if (OperandOpcode == ISD::SETCC) + return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig); + else if (OperandOpcode == ISD::TRUNCATE) { + SDValue InputOp = Operand.getOperand(0); + EVT InVT = InputOp.getValueType(); + return + SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 : + PPC::RLDICL, dl, InVT, InputOp, + getI64Imm(0, dl), getI64Imm(63, dl)), 0); + } else if (isLogicOp(OperandOpcode)) + return computeLogicOpInGPR(Operand); + return SDValue(); + }; + LHS = getLogicOperand(LogicOp.getOperand(0)); + RHS = getLogicOperand(LogicOp.getOperand(1)); + + // If a GPR sequence can't be produced for the LHS we can't proceed. + // Not producing a GPR sequence for the RHS is only a problem if this isn't + // a bitwise negation operation. + if (!LHS || (!RHS && !IsBitwiseNegation)) + return SDValue(); + + NumLogicOpsOnComparison++; + + // We will use the inputs as 64-bit values. + if (LHS.getValueType() == MVT::i32) + LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext); + if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32) + RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext); + + unsigned NewOpc; + switch (LogicOp.getOpcode()) { + default: llvm_unreachable("Unknown logic operation."); + case ISD::AND: NewOpc = PPC::AND8; break; + case ISD::OR: NewOpc = PPC::OR8; break; + case ISD::XOR: NewOpc = PPC::XOR8; break; + } + + if (IsBitwiseNegation) { + RHS = getI64Imm(1, dl); + NewOpc = PPC::XORI8; + } + + return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0); + +} + +/// Try performing logical operations on results of comparisons in GPRs. +/// It is typically preferred from a performance perspective over performing +/// the operations on individual bits in the CR. We only do this on 64-bit +/// targets for now as the code is specialized for 64-bit (it uses 64-bit +/// instructions and assumes 64-bit registers). +bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) { + if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) + return false; + if (N->getValueType(0) != MVT::i1) + return false; + assert(isLogicOp(N->getOpcode()) && + "Expected a logic operation on setcc results."); + SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0)); + if (!LoweredLogical) + return false; + + SDLoc dl(N); + bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8; + unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt; + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue LHS = LoweredLogical.getOperand(0); + SDValue RHS = LoweredLogical.getOperand(1); + SDValue WideOp; + SDValue OpToConvToRecForm; + + // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode + // that is input to the XORI. + if (IsBitwiseNegate && + LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG) + OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1); + else if (IsBitwiseNegate) + // If the input to the XORI isn't an extension, that's what we're after. + OpToConvToRecForm = LoweredLogical.getOperand(0); + else + // If this is not an XORI, it is a reg-reg logical op and we can convert it + // to record-form. + OpToConvToRecForm = LoweredLogical; + + // Get the record-form version of the node we're looking to use to get the + // CR result from. 
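// For context: a PPC record-form instruction (Rc = 1, written with a trailing
// dot, e.g. and., or.) also compares its result against zero and sets the
// LT/GT/EQ/SO bits of CR0, so a single CR0 bit can stand in for the i1 result.
// XORI has no record form, which is why the bitwise-negation path converts the
// XORI's input rather than the XORI itself.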
+ uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode(); + int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc); + + // Convert the right node to record-form. This is either the logical we're + // looking at or it is the input node to the negation (if we're looking at + // a bitwise negation). + if (NewOpc != -1 && IsBitwiseNegate) { + // The input to the XORI has a record-form. Use it. + assert(LoweredLogical.getConstantOperandVal(1) == 1 && + "Expected a PPC::XORI8 only for bitwise negation."); + // Emit the record-form instruction. + std::vector<SDValue> Ops; + for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++) + Ops.push_back(OpToConvToRecForm.getOperand(i)); + + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc, dl, + OpToConvToRecForm.getValueType(), + MVT::Glue, Ops), 0); + } else { + assert((NewOpc != -1 || !IsBitwiseNegate) && + "No record form available for AND8/OR8/XOR8?"); + WideOp = + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, + MVT::i64, MVT::Glue, LHS, RHS), 0); + } + + // Select this node to a single bit from CR0 set by the record-form node + // just created. For bitwise negation, use the EQ bit which is the equivalent + // of negating the result (i.e. it is a bit set when the result of the + // operation is zero). + SDValue SRIdxVal = + CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32); + SDValue CRBit = + SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i1, CR0Reg, SRIdxVal, + WideOp.getValue(1)), 0); + ReplaceNode(N, CRBit.getNode()); + return true; +} + /// If the value isn't guaranteed to be sign-extended to 64-bits, extend it. /// Useful when emitting comparison code for 32-bit values without using /// the compare instruction (which only considers the lower 32-bits). @@ -2677,6 +2853,77 @@ SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS, } } +/// Produces a zero-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6) + // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6) + SDValue Xor = IsRHSZero ? LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Clz = + SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0); + return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz, + getI64Imm(58, dl), getI64Imm(63, dl)), + 0); + } + } +} + +/// Produces a sign-extended result of comparing two 64-bit values according to +/// the passed condition code. +SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS, + ISD::CondCode CC, + int64_t RHSValue, SDLoc dl) { + bool IsRHSZero = RHSValue == 0; + switch (CC) { + default: return SDValue(); + case ISD::SETEQ: { + // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1) + // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA) + // {addcz.reg, addcz.CA} = (addcarry %a, -1) + // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA) + SDValue AddInput = IsRHSZero ?
LHS : + SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0); + SDValue Addic = + SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue, + AddInput, getI32Imm(~0U, dl)), 0); + return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic, + Addic, Addic.getValue(1)), 0); + } + } +} + +/// Does this SDValue have any uses for which keeping the value in a GPR is +/// appropriate. This is meant to be used on values that have type i1 since +/// it is somewhat meaningless to ask if values of other types can be kept in +/// GPR's. +static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { + assert(Compare.getOpcode() == ISD::SETCC && + "An ISD::SETCC node required here."); + + // For values that have a single use, the caller should obviously already have + // checked if that use is an extending use. We check the other uses here. + if (Compare.hasOneUse()) + return true; + // We want the value in a GPR if it is being extended, used for a select, or + // used in logical operations. + for (auto CompareUse : Compare.getNode()->uses()) + if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && + CompareUse->getOpcode() != ISD::ZERO_EXTEND && + CompareUse->getOpcode() != ISD::SELECT && + !isLogicOp(CompareUse->getOpcode())) { + OmittedForNonExtendUses++; + return false; + } + return true; +} + /// Returns an equivalent of a SETCC node but with the result the same width as /// the inputs. This can also be used for SELECT_CC if either of the true or false /// values is a power of two while the other is zero. @@ -2686,6 +2933,11 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, Compare.getOpcode() == ISD::SELECT_CC) && "An ISD::SETCC node required here."); + // Don't convert this comparison to a GPR sequence because there are uses + // of the i1 result (i.e. uses that require the result in the CR). + if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG)) + return SDValue(); + SDValue LHS = Compare.getOperand(0); SDValue RHS = Compare.getOperand(1); @@ -2694,30 +2946,35 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, ISD::CondCode CC = cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get(); EVT InputVT = LHS.getValueType(); - if (InputVT != MVT::i32) + if (InputVT != MVT::i32 && InputVT != MVT::i64) return SDValue(); - SDLoc dl(Compare); - ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); - int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX; - if (ConvOpts == SetccInGPROpts::ZExtInvert || ConvOpts == SetccInGPROpts::SExtInvert) CC = ISD::getSetCCInverse(CC, true); - if (ISD::isSignedIntSetCC(CC)) { + bool Inputs32Bit = InputVT == MVT::i32; + if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) { LHS = signExtendInputIfNeeded(LHS); RHS = signExtendInputIfNeeded(RHS); - } else if (ISD::isUnsignedIntSetCC(CC)) { + } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) { LHS = zeroExtendInputIfNeeded(LHS); RHS = zeroExtendInputIfNeeded(RHS); } + SDLoc dl(Compare); + ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS); + int64_t RHSValue = RHSConst ?
RHSConst->getSExtValue() : INT64_MAX; bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig || ConvOpts == SetccInGPROpts::SExtInvert; - if (IsSext) + + if (IsSext && Inputs32Bit) return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl); - return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (Inputs32Bit) + return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl); + else if (IsSext) + return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl); + return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { @@ -2906,6 +3163,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::AND: { + if (tryLogicOpOfCompares(N)) + return; + unsigned Imm, Imm2, SH, MB, ME; uint64_t Imm64; @@ -3025,6 +3285,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryBitfieldInsert(N)) return; + if (tryLogicOpOfCompares(N)) + return; + short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { @@ -3042,6 +3305,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::XOR: { + if (tryLogicOpOfCompares(N)) + return; + break; + } case ISD::ADD: { short Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 216efcc4a1ee..41ff9d903aa0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxLoadsPerMemcmp = 128; + } else { + MaxLoadsPerMemcmp = 8; + MaxLoadsPerMemcmpOptSize = 4; } } @@ -1112,6 +1116,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::XXINSERT: return "PPCISD::XXINSERT"; + case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; case PPCISD::Hi: return "PPCISD::Hi"; @@ -1593,17 +1598,25 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } - // Check that the mask is shuffling words -static bool isWordShuffleMask(ShuffleVectorSDNode *N) { - for (unsigned i = 0; i < 4; ++i) { - unsigned B0 = N->getMaskElt(i*4); - unsigned B1 = N->getMaskElt(i*4+1); - unsigned B2 = N->getMaskElt(i*4+2); - unsigned B3 = N->getMaskElt(i*4+3); - if (B0 % 4) - return false; - if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1) +// Check that the mask is shuffling N byte elements. 
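// For example, with Width == 4 the v16i8 mask <4,5,6,7, 16,17,18,19, 0,1,2,3,
// 28,29,30,31> qualifies: each group of four bytes is consecutive and starts
// at a multiple of four. A mask beginning <1,2,3,4, ...> is rejected by the
// first-element alignment test.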
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) { + assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) && + "Unexpected element width."); + + unsigned NumOfElem = 16 / Width; + unsigned MaskVal[16]; // Width is never greater than 16 + for (unsigned i = 0; i < NumOfElem; ++i) { + MaskVal[0] = N->getMaskElt(i * Width); + if (MaskVal[0] % Width) { return false; + } + + for (unsigned int j = 1; j < Width; ++j) { + MaskVal[j] = N->getMaskElt(i * Width + j); + if (MaskVal[j] != MaskVal[j-1] + 1) { + return false; + } + } } return true; @@ -1611,7 +1624,7 @@ static bool isWordShuffleMask(ShuffleVectorSDNode *N) { bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, unsigned &InsertAtByte, bool &Swap, bool IsLE) { - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12 @@ -1688,7 +1701,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE) { assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); // Ensure each byte index of the word is consecutive. - if (!isWordShuffleMask(N)) + if (!isNByteElemShuffleMask(N, 4)) return false; // Now we look at mask elements 0,4,8,12, which are the beginning of words. @@ -1746,6 +1759,66 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, } } +/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap +/// if the inputs to the instruction should be swapped and set \p DM to the +/// value for the immediate. +/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI +/// AND element 0 of the result comes from the first input (LE) or second input +/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered. +/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle +/// mask. +bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + + // Ensure each byte index of the double word is consecutive. + if (!isNByteElemShuffleMask(N, 8)) + return false; + + unsigned M0 = N->getMaskElt(0) / 8; + unsigned M1 = N->getMaskElt(8) / 8; + assert(((M0 | M1) < 4) && "A mask element out of bounds?"); + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + if ((M0 | M1) < 2) { + DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1); + Swap = false; + return true; + } else + return false; + } + + if (IsLE) { + if (M0 > 1 && M1 < 2) { + Swap = false; + } else if (M0 < 2 && M1 > 1) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (((~M1) & 1) << 1) + ((~M0) & 1); + return true; + } else { // BE + if (M0 < 2 && M1 > 1) { + Swap = false; + } else if (M0 > 1 && M1 < 2) { + M0 = (M0 + 2) % 4; + M1 = (M1 + 2) % 4; + Swap = true; + } else + return false; + + // Note: if control flow comes here that means Swap is already set above + DM = (M0 << 1) + (M1 & 1); + return true; + } +} + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
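As a worked example of the little-endian case in isXXPERMDIShuffleMask above (an illustration, not part of the patch): a v2i64 shuffle that takes doubleword 1 of the first input and doubleword 0 of the second yields M0 = 1 and M1 = 2. That combination only matches after swapping, so the operands are exchanged and remapped to M0 = 3, M1 = 0, giving Swap = true and DM = (((~0) & 1) << 1) + ((~3) & 1) = 2.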
@@ -7760,6 +7833,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); } + if (Subtarget.hasVSX() && + PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2); + + SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 2f9eb95f6de6..7982a4a9e9fb 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -90,6 +90,10 @@ namespace llvm { /// VECSHL, + /// XXPERMDI - The PPC XXPERMDI instruction + /// + XXPERMDI, + /// The CMPB instruction (takes two operands of i32 or i64). CMPB, @@ -454,6 +458,10 @@ namespace llvm { /// for a XXSLDWI instruction. bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, bool &Swap, bool IsLE); + /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXPERMDI instruction. + bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 165970f9678c..295590b2acf6 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -735,12 +735,12 @@ def RLDICL_32_64 : MDForm_1<30, 0, "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; // End fast-isel. 
-let isCodeGenOnly = 1 in -def RLDICL_32 : MDForm_1<30, 0, - (outs gprc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLDICL_32 : MDForm_1r<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index fd6785e963a6..f3c68c443b1b 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1983,3 +1983,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const { return &PPC::VSRCRegClass; return RC; } + +int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { + return PPC::getRecordFormOpcode(Opcode); +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index b30d09e03ec4..8dd4dbb60879 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -290,6 +290,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { return Reg >= PPC::V0 && Reg <= PPC::V31; } const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const; + static int getRecordFormOpcode(unsigned Opcode); }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 26b99eced23c..8223aa655e38 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; +def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>, + SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> +]>; + def SDT_PPCvcmp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> ]>; @@ -170,6 +174,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>; +def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 1589ab03e507..c4139ca8b7bd 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -843,7 +843,9 @@ let Uses = [RM] in { def XXPERMDI : XX3Form_2<60, 10, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), - "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, + [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB, + imm32SExt16:$DM))]>; let isCodeGenOnly = 1 in def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM), "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ddae5befee3e..b9004cc8a9f5 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -296,7 +296,7 @@ namespace { /// PPC Code Generator Pass Configuration Options. 
class PPCPassConfig : public TargetPassConfig { public: - PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} PPCTargetMachine &getPPCTargetMachine() const { @@ -316,7 +316,7 @@ class PPCPassConfig : public TargetPassConfig { } // end anonymous namespace TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { - return new PPCPassConfig(this, PM); + return new PPCPassConfig(*this, PM); } void PPCPassConfig::addIRPasses() { diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index f2838351cee5..b8f5a2083d80 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -55,6 +55,10 @@ class PPCTargetMachine : public LLVMTargetMachine { const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; + + bool isMachineVerifierClean() const override { + return false; + } }; /// PowerPC 32-bit target machine. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 7ee1317bf72f..5559cdc5fe46 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } @@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) { } unsigned PPCTTIImpl::getCacheLineSize() { - // This is currently only used for the data prefetch pass which is only - // enabled for BG/Q by default. - return CacheLineSize; + // Check first if the user specified a custom line size. + if (CacheLineSize.getNumOccurrences() > 0) + return CacheLineSize; + + // On P7, P8 or P9 we have a cache line size of 128. + unsigned Directive = ST->getDarwinDirective(); + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || + Directive == PPC::DIR_PWR9) + return 128; + + // On other processors return a default of 64 bytes. 
+ return 64; } unsigned PPCTTIImpl::getPrefetchDistance() { diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 6ce70fbd8778..2e0116fee04c 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp index a20331cd0a3e..efdde04c582d 100644 --- a/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -56,5 +56,5 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT, } TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) { - return new TargetPassConfig(this, PM); + return new TargetPassConfig(*this, PM); } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 1da4d3604304..49c67e0819f7 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -114,7 +114,7 @@ namespace { /// Sparc Code Generator Pass Configuration Options. class SparcPassConfig : public TargetPassConfig { public: - SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM) + SparcPassConfig(SparcTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SparcTargetMachine &getSparcTargetMachine() const { @@ -128,7 +128,7 @@ class SparcPassConfig : public TargetPassConfig { } // namespace TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SparcPassConfig(this, PM); + return new SparcPassConfig(*this, PM); } void SparcPassConfig::addIRPasses() { diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 48193fe095be..faf714cbe2c9 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -40,6 +40,10 @@ class SparcTargetMachine : public LLVMTargetMachine { TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; /// Sparc 32-bit target machine diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index ede5005fa491..f30d52f859d7 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -119,7 +119,7 @@ namespace { /// SystemZ Code Generator Pass Configuration Options.
class SystemZPassConfig : public TargetPassConfig { public: - SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM) + SystemZPassConfig(SystemZTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} SystemZTargetMachine &getSystemZTargetMachine() const { @@ -212,7 +212,7 @@ void SystemZPassConfig::addPreEmitPass() { } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { - return new SystemZPassConfig(this, PM); + return new SystemZPassConfig(*this, PM); } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index a10ca64fa632..eb2f17a2091c 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -51,6 +51,8 @@ class SystemZTargetMachine : public LLVMTargetMachine { } bool targetSchedulesPostRAScheduling() const override { return true; }; + + bool isMachineVerifierClean() const override { return false; } }; } // end namespace llvm diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b974681fb6af..d9b2b8743649 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -129,7 +129,7 @@ namespace { /// WebAssembly Code Generator Pass Configuration Options. class WebAssemblyPassConfig final : public TargetPassConfig { public: - WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM) + WebAssemblyPassConfig(WebAssemblyTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const { @@ -154,7 +154,7 @@ TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { TargetPassConfig * WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) { - return new WebAssemblyPassConfig(this, PM); + return new WebAssemblyPassConfig(*this, PM); } FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 6e062ec59347..b5a926f915af 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -587,6 +587,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSLLDQZ256rr: case X86::VPSLLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSLLDQZ128rm: case X86::VPSLLDQZ256rm: case X86::VPSLLDQZ512rm: @@ -604,6 +605,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSRLDQZ256rr: case X86::VPSRLDQZ512rr: Src1Name = getRegName(MI->getOperand(1).getReg()); + LLVM_FALLTHROUGH; case X86::VPSRLDQZ128rm: case X86::VPSRLDQZ256rm: case X86::VPSRLDQZ512rm: @@ -1091,6 +1093,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask); @@ -1099,6 +1102,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) Src1Name = 
getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 313920e02c3e..5582526541ba 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -123,18 +123,26 @@ namespace { EdgeBundles *Bundles; // Return a bitmask of FP registers in block's live-in list. - static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { unsigned Mask = 0; - for (const auto &LI : MBB->liveins()) { - if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) - continue; - Mask |= 1 << (LI.PhysReg - X86::FP0); + for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(); + I != MBB->livein_end(); ) { + MCPhysReg Reg = I->PhysReg; + static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums"); + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + Mask |= 1 << (Reg - X86::FP0); + if (RemoveFPs) { + I = MBB->removeLiveIn(I); + continue; + } + } + ++I; } return Mask; } // Partition all the CFG edges into LiveBundles. - void bundleCFG(MachineFunction &MF); + void bundleCFGRecomputeKillFlags(MachineFunction &MF); MachineBasicBlock *MBB; // Current basic block @@ -327,7 +335,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. - bundleCFG(MF); + bundleCFGRecomputeKillFlags(MF); StackTop = 0; @@ -375,13 +383,15 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { /// registers live-out from a block is identical to the live-in set of all /// successors. This is not enforced by the normal live-in lists since /// registers may be implicitly defined, or not used by all successors. -void FPS::bundleCFG(MachineFunction &MF) { +void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) { assert(LiveBundles.empty() && "Stale data in LiveBundles"); LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. for (MachineBasicBlock &MBB : MF) { - const unsigned Mask = calcLiveInMask(&MBB); + setKillFlags(MBB); + + const unsigned Mask = calcLiveInMask(&MBB, false); if (!Mask) continue; // Update MBB ingoing bundle mask. @@ -396,7 +406,6 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; - setKillFlags(BB); setupBlockStack(); for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { @@ -453,6 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { unsigned Reg = DeadRegs[i]; // Check if Reg is live on the stack. An inline-asm register operand that // is in the clobber list and marked dead might not be live on the stack. + static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers"); if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); freeStackSlotAfter(I, Reg-X86::FP0); @@ -506,7 +516,6 @@ void FPS::setupBlockStack() { // Push the fixed live-in registers. for (unsigned i = Bundle.FixCount; i > 0; --i) { - MBB->addLiveIn(X86::ST0+i-1); DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" << unsigned(Bundle.FixStack[i-1]) << '\n'); pushReg(Bundle.FixStack[i-1]); @@ -515,7 +524,8 @@ void FPS::setupBlockStack() { // Kill off unwanted live-ins. This can happen with a critical edge. 
// FIXME: We could keep these live registers around as zombies. They may need // to be revived at the end of a short block. It might save a few instrs. - adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true); + adjustLiveRegs(Mask, MBB->begin()); DEBUG(MBB->dump()); } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 331e56976db7..328a80304602 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1062,6 +1062,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { + assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved"); + // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. @@ -1124,13 +1126,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, nullptr, DwarfFramePtr)); } } - - // Mark the FramePtr as live-in in every block. Don't do this again for - // funclet prologues. - if (!IsFunclet) { - for (MachineBasicBlock &EveryMBB : MF) - EveryMBB.addLiveIn(MachineFramePtr); - } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index c899f0fd5100..2a1633de0a23 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -418,8 +418,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: - case ISD::ADDC: - case ISD::ADDE: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8d78308afe9d..0a41f35f9320 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1,3 +1,4 @@ + //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure @@ -80,12 +81,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -static cl::opt<bool> MulConstantOptimization( - "mul-constant-optimization", cl::init(true), - cl::desc("Replace 'mul x, Const' with more effective instructions like " - "SHIFT, LEA, etc."), - cl::Hidden); - /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// being fatal. @@ -317,16 +312,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UREM, VT, Expand); } - for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { - if (VT == MVT::i64 && !Subtarget.is64Bit()) - continue; - // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
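// The removals below track an in-progress migration: ISD::ADDC/ADDE/SUBC/SUBE
// model the carry as MVT::Glue, whereas their ADDCARRY/SUBCARRY replacements
// pass the carry as an ordinary value; the SETCCCARRY lowering added further
// down consumes that value form.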
- setOperationAction(ISD::ADDC, VT, Custom); - setOperationAction(ISD::ADDE, VT, Custom); - setOperationAction(ISD::SUBC, VT, Custom); - setOperationAction(ISD::SUBE, VT, Custom); - } - setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, @@ -428,7 +413,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SETCCE, VT, Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -1583,6 +1567,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); + setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { @@ -16304,6 +16289,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; + LLVM_FALLTHROUGH; default: NeedOF = true; break; @@ -17167,17 +17153,17 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; + case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; + case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; - case ISD::SETUGE: Swap = true; + case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } @@ -17398,19 +17384,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } -SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); - assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); - assert(Carry.getOpcode() != ISD::CARRY_FALSE); + // Recreate the carry if needed.
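// The incoming carry is a plain 0/1 value rather than glue, so adding -1 to it
// overflows exactly when the value is 1; the X86ISD::ADD below thus restores
// EFLAGS.CF from the value, and the SBB consumes the flag result via
// Carry.getValue(1).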
+ EVT CarryVT = Carry.getValueType(); + APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), + Carry, DAG.getConstant(NegOne, DL, CarryVT)); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); - SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); if (Op.getSimpleValueType() == MVT::i1) return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -23269,32 +23260,6 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getNode()->getSimpleValueType(0); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid code"); - case ISD::ADDC: Opc = X86ISD::ADD; break; - case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; - case ISD::SUBC: Opc = X86ISD::SUB; break; - case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1), Op.getOperand(2)); -} - static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); @@ -23785,7 +23750,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -23830,10 +23795,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: @@ -28946,12 +28907,118 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Try to match patterns such as +// (i16 bitcast (v16i1 x)) +// -> +// (i16 movmsk (16i8 sext (v16i1 x))) +// before the illegal vector is scalarized on subtargets that don't have legal +// vxi1 types. +static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, + const X86Subtarget &Subtarget) { + EVT VT = BitCast.getValueType(); + SDValue N0 = BitCast.getOperand(0); + EVT VecVT = N0->getValueType(0); + + if (!VT.isScalarInteger() || !VecVT.isSimple()) + return SDValue(); + + // With AVX512 vxi1 types are legal and we prefer using k-regs. + // MOVMSK is supported in SSE2 or later. + if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) + return SDValue(); + + // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and + // v8f64. 
So all legal 128-bit and 256-bit vectors are covered except for + // v8i16 and v16i16. + // For these two cases, we can shuffle the upper element bytes to a + // consecutive sequence at the start of the vector and treat the results as + // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, + // for v16i16 this is not the case, because the shuffle is expensive, so we + // avoid sign-extending to this type entirely. + // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: + // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) + MVT SExtVT; + MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; + switch (VecVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i1: + SExtVT = MVT::v2i64; + FPCastVT = MVT::v2f64; + break; + case MVT::v4i1: + SExtVT = MVT::v4i32; + FPCastVT = MVT::v4f32; + // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) + // sign-extend to a 256-bit operation to avoid truncation. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v4i64; + FPCastVT = MVT::v4f64; + } + break; + case MVT::v8i1: + SExtVT = MVT::v8i16; + // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), + // sign-extend to a 256-bit operation to match the compare. + // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over + // 256-bit because the shuffle is cheaper than sign extending the result of + // the compare. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v8i32; + FPCastVT = MVT::v8f32; + } + break; + case MVT::v16i1: + SExtVT = MVT::v16i8; + // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), + // it is not profitable to sign-extend to 256-bit because this will + // require an extra cross-lane shuffle which is more expensive than + // truncating the result of the compare to 128-bits. + break; + case MVT::v32i1: + // TODO: Handle pre-AVX2 cases by splitting to two v16i1's. + if (!Subtarget.hasInt256()) + return SDValue(); + SExtVT = MVT::v32i8; + break; + }; + + SDLoc DL(BitCast); + SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); + if (SExtVT == MVT::v8i16) { + V = DAG.getBitcast(MVT::v16i8, V); + V = DAG.getVectorShuffle( + MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), + {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + } else + assert(SExtVT.getScalarType() != MVT::i16 && + "Vectors of i16 must be shuffled"); + if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + V = DAG.getBitcast(FPCastVT, V); + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); + return DAG.getZExtOrTrunc(V, DL, VT); +} + static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); + // Try to match patterns such as + // (i16 bitcast (v16i1 x)) + // -> + // (i16 movmsk (v16i8 sext (v16i1 x))) + // before the setcc result is scalarized on subtargets that don't have legal + // vxi1 types. + if (DCI.isBeforeLegalize()) + if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) + return V; // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions.
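A user-level analogue of combineBitcastvxi1 may make the pattern concrete. The following is an illustrative SSE2 sketch using standard intrinsics, not code from this patch: the byte compare produces 0x00/0xFF lanes (the sext of each i1), and PMOVMSKB gathers the sign bits into a 16-bit scalar, mirroring (i16 movmsk (v16i8 sext (v16i1 x))).

#include <immintrin.h>
#include <cstdint>

// Pack a v16i1 compare result into an i16 bit mask without scalarizing.
uint16_t bitcast_v16i1_to_i16(__m128i a, __m128i b) {
  __m128i eq = _mm_cmpeq_epi8(a, b);       // 0x00 or 0xFF per byte: sext(v16i1)
  return (uint16_t)_mm_movemask_epi8(eq);  // sign bits -> i16 (X86ISD::MOVMSK)
}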
@@ -29944,6 +30011,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -29974,6 +30042,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30008,6 +30077,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -30036,6 +30106,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -30933,75 +31004,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, } } -static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, - EVT VT, SDLoc DL) { - - auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(Mult, DL, VT)); - Result = DAG.getNode(ISD::SHL, DL, VT, Result, - DAG.getConstant(Shift, DL, MVT::i8)); - Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - auto combineMulMulAddOrSub = [&](bool isAdd) { - SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(9, DL, VT)); - Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); - Result = DAG.getNode(isAdd ? 
ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0), - Result); - return Result; - }; - - switch (MulAmt) { - default: - break; - case 11: - // mul x, 11 => add ((shl (mul x, 5), 1), x) - return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); - case 21: - // mul x, 21 => add ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); - case 22: - // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); - case 19: - // mul x, 19 => sub ((shl (mul x, 5), 2), x) - return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); - case 13: - // mul x, 13 => add ((shl (mul x, 3), 2), x) - return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); - case 23: - // mul x, 13 => sub ((shl (mul x, 3), 3), x) - return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); - case 14: - // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); - case 26: - // mul x, 26 => sub ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ false); - case 28: - // mul x, 28 => add ((mul (mul x, 9), 3), x) - return combineMulMulAddOrSub(/*isAdd*/ true); - case 29: - // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - combineMulMulAddOrSub(/*isAdd*/ true)); - case 30: - // mul x, 30 => sub (sub ((shl x, 5), x), x) - return DAG.getNode( - ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(5, DL, MVT::i8)))); - } - return SDValue(); -} - /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, @@ -31011,8 +31013,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); - if (!MulConstantOptimization) - return SDValue(); // An imul is usually smaller than the alternative sequence. 
if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -31068,8 +31068,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); - } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); + } if (!NewMul) { assert(MulAmt != 0 && @@ -34558,8 +34557,7 @@ static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG, isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); - if (Carry.getOpcode() == ISD::SETCC || - Carry.getOpcode() == X86ISD::SETCC || + if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1)); @@ -35126,7 +35124,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); @@ -35510,6 +35508,7 @@ TargetLowering::ConstraintWeight switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': @@ -35861,6 +35860,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR64RegClass); break; } + LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 18106c2eb394..f51b6641db2f 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1163,7 +1163,7 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 33fbd41bb631..0aee30081a35 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -195,6 +195,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; + LLVM_FALLTHROUGH; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 53a8e83b36fc..cb21f1bd7706 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -323,7 +323,7 @@ namespace { /// X86 Code Generator Pass Configuration Options. 
class X86PassConfig : public TargetPassConfig { public: - X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} X86TargetMachine &getX86TargetMachine() const { @@ -369,7 +369,7 @@ INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", "X86 Execution Dependency Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - return new X86PassConfig(this, PM); + return new X86PassConfig(*this, PM); } void X86PassConfig::addIRPasses() { @@ -433,6 +433,7 @@ bool X86PassConfig::addPreISel() { void X86PassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { + addPass(&LiveRangeShrinkID); addPass(createX86FixupSetCC()); addPass(createX86OptimizeLEAs()); addPass(createX86CallFrameOptimization()); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index cf933f52604e..1bf267d34ec2 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -49,6 +49,10 @@ class X86TargetMachine final : public LLVMTargetMachine { TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool isMachineVerifierClean() const override { + return false; + } }; } // end namespace llvm diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 2950e2efbea3..1a1cbd474888 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -54,7 +54,7 @@ namespace { /// XCore Code Generator Pass Configuration Options. class XCorePassConfig : public TargetPassConfig { public: - XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM) + XCorePassConfig(XCoreTargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} XCoreTargetMachine &getXCoreTargetMachine() const { @@ -70,7 +70,7 @@ class XCorePassConfig : public TargetPassConfig { } // end anonymous namespace TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { - return new XCorePassConfig(this, PM); + return new XCorePassConfig(*this, PM); } void XCorePassConfig::addIRPasses() { diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index 5cf2a8c25d83..359876627fce 100644 --- a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -101,7 +101,9 @@ namespace { struct CoroCleanup : FunctionPass { static char ID; // Pass identification, replacement for typeid - CoroCleanup() : FunctionPass(ID) {} + CoroCleanup() : FunctionPass(ID) { + initializeCoroCleanupPass(*PassRegistry::getPassRegistry()); + } std::unique_ptr L; diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index b52989186165..ba05896af150 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -183,7 +183,9 @@ namespace { struct CoroEarly : public FunctionPass { static char ID; // Pass identification, replacement for typeid. 
- CoroEarly() : FunctionPass(ID) {} + CoroEarly() : FunctionPass(ID) { + initializeCoroEarlyPass(*PassRegistry::getPassRegistry()); + } std::unique_ptr L; diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp index acb22449142b..42fd6d746145 100644 --- a/lib/Transforms/Coroutines/CoroElide.cpp +++ b/lib/Transforms/Coroutines/CoroElide.cpp @@ -258,7 +258,9 @@ static bool replaceDevirtTrigger(Function &F) { namespace { struct CoroElide : FunctionPass { static char ID; - CoroElide() : FunctionPass(ID) {} + CoroElide() : FunctionPass(ID) { + initializeCoroElidePass(*PassRegistry::getPassRegistry()); + } std::unique_ptr L; diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index cd549e4be282..613b4a7f03e9 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -681,7 +681,9 @@ namespace { struct CoroSplit : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplit() : CallGraphSCCPass(ID) {} + CoroSplit() : CallGraphSCCPass(ID) { + initializeCoroSplitPass(*PassRegistry::getPassRegistry()); + } bool Run = false; diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 4c417f1c55eb..bc0967448cdd 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -652,12 +652,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { // only split block when necessary: PHINode *FirstPhi = getFirstPHI(PreReturn); unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size(); + auto IsTrivialPhi = [](PHINode *PN) -> Value * { + Value *CommonValue = PN->getIncomingValue(0); + if (all_of(PN->incoming_values(), + [&](Value *V) { return V == CommonValue; })) + return CommonValue; + return nullptr; + }; + if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) { NewReturnBlock = NewReturnBlock->splitBasicBlock( NewReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = PreReturn->begin(); Instruction *Ins = &NewReturnBlock->front(); + SmallVector DeadPhis; while (I != PreReturn->end()) { PHINode *OldPhi = dyn_cast(I); if (!OldPhi) @@ -674,8 +683,22 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) { RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE); OldPhi->removeIncomingValue(NewE); } + + // After splitting the incoming values, the old phi may become trivial. + // Keeping the trivial phi can introduce a definition inside the outlined + // region which is live-out, causing unnecessary overhead (load, store + // arg passing etc). + if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { + OldPhi->replaceAllUsesWith(OldPhiVal); + DeadPhis.push_back(OldPhi); + } + ++I; } + + for (auto *DP : DeadPhis) + DP->eraseFromParent(); + for (auto E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock); diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 659cb9df00a2..9dede4cedd1d 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -6,14 +6,8 @@ // License. See LICENSE.TXT for details.
// //===----------------------------------------------------------------------===// -// -// This pass prepares a module containing type metadata for ThinLTO by splitting -// it into regular and thin LTO parts if possible, and writing both parts to -// a multi-module bitcode file. Modules that do not contain type metadata are -// written unmodified as a single module. -// -//===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -436,3 +430,15 @@ ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, raw_ostream *ThinLinkOS) { return new WriteThinLTOBitcode(Str, ThinLinkOS); } + +PreservedAnalyses +llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(M).getManager(); + writeThinLTOBitcode(OS, ThinLinkOS, + [&FAM](Function &F) -> AAResults & { + return FAM.getResult(F); + }, + M, &AM.getResult(M)); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 92a38f26dde7..b44499ec4be9 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3838,24 +3838,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. - SmallVector Indices; + SmallVector ArgNos; unsigned ArgNo = 0; for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo, Attribute::NonNull) && isKnownNonNullAt(V, CS.getInstruction(), &DT)) - Indices.push_back(ArgNo + AttributeList::FirstArgIndex); + ArgNos.push_back(ArgNo); ArgNo++; } assert(ArgNo == CS.arg_size() && "sanity check"); - if (!Indices.empty()) { + if (!ArgNos.empty()) { AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); - AS = AS.addAttribute(Ctx, Indices, - Attribute::get(Ctx, Attribute::NonNull)); + AS = AS.addParamAttribute(Ctx, ArgNos, + Attribute::get(Ctx, Attribute::NonNull)); CS.setAttributes(AS); Changed = true; } diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1e30dbf6b55a..b2d95271479c 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -182,6 +182,14 @@ static cl::opt cl::desc("Use this option to turn on/off " "memory intrinsic size profiling.")); +// Emit branch probability as optimization remarks. +static cl::opt + EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden, + cl::desc("When this option is on, the annotated " + "branch probability will be emitted as " + " optimization remarks: -Rpass-analysis=" + "pgo-instr-use")); + // Command line option to turn on CFG dot dump after profile annotation. 
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts extern cl::opt PGOViewCounts; @@ -192,6 +200,39 @@ extern cl::opt ViewBlockFreqFuncName; namespace { +// Return a string describing the branch condition that can be +// used in static branch probability heuristics: +std::string getBranchCondString(Instruction *TI) { + BranchInst *BI = dyn_cast(TI); + if (!BI || !BI->isConditional()) + return std::string(); + + Value *Cond = BI->getCondition(); + ICmpInst *CI = dyn_cast(Cond); + if (!CI) + return std::string(); + + std::string result; + raw_string_ostream OS(result); + OS << CmpInst::getPredicateName(CI->getPredicate()) << "_"; + CI->getOperand(0)->getType()->print(OS, true); + + Value *RHS = CI->getOperand(1); + ConstantInt *CV = dyn_cast(RHS); + if (CV) { + if (CV->isZero()) + OS << "_Zero"; + else if (CV->isOne()) + OS << "_One"; + else if (CV->isAllOnesValue()) + OS << "_MinusOne"; + else + OS << "_Const"; + } + OS.flush(); + return result; +} + /// The select instruction visitor plays three roles specified /// by the mode. In \c VM_counting mode, it simply counts the number of /// select instructions. In \c VM_instrument mode, it inserts code to count @@ -1424,6 +1465,29 @@ void setProfMetadata(Module *M, Instruction *TI, ArrayRef EdgeCounts, for (const auto &W : Weights) { dbgs() << W << " "; } dbgs() << "\n";); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); + if (EmitBranchProbability) { + std::string BrCondStr = getBranchCondString(TI); + if (BrCondStr.empty()) + return; + + unsigned WSum = + std::accumulate(Weights.begin(), Weights.end(), 0, + [](unsigned w1, unsigned w2) { return w1 + w2; }); + uint64_t TotalCount = + std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), 0, + [](uint64_t c1, uint64_t c2) { return c1 + c2; }); + BranchProbability BP(Weights[0], WSum); + std::string BranchProbStr; + raw_string_ostream OS(BranchProbStr); + OS << BP; + OS << " (total count : " << TotalCount << ")"; + OS.flush(); + Function *F = TI->getParent()->getParent(); + emitOptimizationRemarkAnalysis( + F->getContext(), "pgo-use-annot", *F, TI->getDebugLoc(), + Twine(BrCondStr) + + " is true with probability : " + Twine(BranchProbStr)); + } } template <> struct GraphTraits { diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 300085eccb0c..325b64cd8b43 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -7,24 +7,7 @@ // //===----------------------------------------------------------------------===// // -// Coverage instrumentation that works with AddressSanitizer -// and potentially with other Sanitizers. -// -// We create a Guard variable with the same linkage -// as the function and inject this code into the entry block (SCK_Function) -// or all blocks (SCK_BB): -// if (Guard < 0) { -// __sanitizer_cov(&Guard); -// } -// The accesses to Guard are atomic. The rest of the logic is -// in __sanitizer_cov (it's fine to call it more than once). -// -// With SCK_Edge we also split critical edges this effectively -// instrumenting all edges. -// -// This coverage implementation provides very limited data: -// it only tells if a given function (block) was ever executed. No counters. -// But for many use cases this is what we need and the added slowdown small. +// Coverage instrumentation done on LLVM IR level, works with Sanitizers. 
// //===----------------------------------------------------------------------===// @@ -56,9 +39,6 @@ using namespace llvm; #define DEBUG_TYPE "sancov" -static const char *const SanCovModuleInitName = "__sanitizer_cov_module_init"; -static const char *const SanCovName = "__sanitizer_cov"; -static const char *const SanCovWithCheckName = "__sanitizer_cov_with_check"; static const char *const SanCovTracePCIndirName = "__sanitizer_cov_trace_pc_indir"; static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc"; @@ -84,12 +64,6 @@ static cl::opt ClCoverageLevel( "3: all blocks and critical edges"), cl::Hidden, cl::init(0)); -static cl::opt ClCoverageBlockThreshold( - "sanitizer-coverage-block-threshold", - cl::desc("Use a callback with a guard check inside it if there are" - " more than this number of blocks."), - cl::Hidden, cl::init(0)); - static cl::opt ClExperimentalTracePC("sanitizer-coverage-trace-pc", cl::desc("Experimental pc tracing"), cl::Hidden, cl::init(false)); @@ -151,6 +125,8 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { Options.TraceGep |= ClGEPTracing; Options.TracePC |= ClExperimentalTracePC; Options.TracePCGuard |= ClTracePCGuard; + if (!Options.TracePCGuard && !Options.TracePC) + Options.TracePCGuard = true; // TracePCGuard is default. Options.NoPrune |= !ClPruneBlocks; return Options; } @@ -184,18 +160,10 @@ class SanitizerCoverageModule : public ModulePass { ArrayRef SwitchTraceTargets); bool InjectCoverage(Function &F, ArrayRef AllBlocks); void CreateFunctionGuardArray(size_t NumGuards, Function &F); - void SetNoSanitizeMetadata(Instruction *I); - void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, - bool UseCalls); - unsigned NumberOfInstrumentedBlocks() { - return SanCovFunction->getNumUses() + - SanCovWithCheckFunction->getNumUses(); - } + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx); StringRef getSanCovTracePCGuardSection() const; StringRef getSanCovTracePCGuardSectionStart() const; StringRef getSanCovTracePCGuardSectionEnd() const; - Function *SanCovFunction; - Function *SanCovWithCheckFunction; Function *SanCovTracePCIndir; Function *SanCovTracePC, *SanCovTracePCGuard; Function *SanCovTraceCmpFunction[4]; @@ -209,7 +177,6 @@ class SanitizerCoverageModule : public ModulePass { LLVMContext *C; const DataLayout *DL; - GlobalVariable *GuardArray; GlobalVariable *FunctionGuardArray; // for trace-pc-guard. 
bool HasSancovGuardsSection; @@ -230,16 +197,11 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { IntptrPtrTy = PointerType::getUnqual(IntptrTy); Type *VoidTy = Type::getVoidTy(*C); IRBuilder<> IRB(*C); - Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty()); Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); Int64Ty = IRB.getInt64Ty(); Int32Ty = IRB.getInt32Ty(); - SanCovFunction = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy)); - SanCovWithCheckFunction = checkSanitizerInterfaceFunction( - M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy)); SanCovTracePCIndir = checkSanitizerInterfaceFunction( M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy)); SanCovTraceCmpFunction[0] = @@ -278,41 +240,10 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction( SanCovTracePCGuardName, VoidTy, Int32PtrTy)); - // At this point we create a dummy array of guards because we don't - // know how many elements we will need. - Type *Int32Ty = IRB.getInt32Ty(); - - if (!Options.TracePCGuard) - GuardArray = - new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, - nullptr, "__sancov_gen_cov_tmp"); - for (auto &F : M) runOnFunction(F); - auto N = NumberOfInstrumentedBlocks(); - - GlobalVariable *RealGuardArray = nullptr; - if (!Options.TracePCGuard) { - // Now we know how many elements we need. Create an array of guards - // with one extra element at the beginning for the size. - Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1); - RealGuardArray = new GlobalVariable( - M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage, - Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov"); - - // Replace the dummy array with the real one. 
- GuardArray->replaceAllUsesWith( - IRB.CreatePointerCast(RealGuardArray, Int32PtrTy)); - GuardArray->eraseFromParent(); - } - // Create variable for module (compilation unit) name - Constant *ModNameStrConst = - ConstantDataArray::getString(M.getContext(), M.getName(), true); - GlobalVariable *ModuleName = new GlobalVariable( - M, ModNameStrConst->getType(), true, GlobalValue::PrivateLinkage, - ModNameStrConst, "__sancov_gen_modname"); if (Options.TracePCGuard) { if (HasSancovGuardsSection) { Function *CtorFunc; @@ -339,18 +270,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); } } - } else if (!Options.TracePC) { - Function *CtorFunc; - std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( - M, SanCovModuleCtorName, SanCovModuleInitName, - {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy}, - {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), - ConstantInt::get(IntptrTy, N), Constant::getNullValue(Int8PtrTy), - IRB.CreatePointerCast(ModuleName, Int8PtrTy)}); - - appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); } - return true; } @@ -494,13 +414,12 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, return false; case SanitizerCoverageOptions::SCK_Function: CreateFunctionGuardArray(1, F); - InjectCoverageAtBlock(F, F.getEntryBlock(), 0, false); + InjectCoverageAtBlock(F, F.getEntryBlock(), 0); return true; default: { - bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size(); CreateFunctionGuardArray(AllBlocks.size(), F); for (size_t i = 0, N = AllBlocks.size(); i < N; i++) - InjectCoverageAtBlock(F, *AllBlocks[i], i, UseCalls); + InjectCoverageAtBlock(F, *AllBlocks[i], i); return true; } } @@ -517,8 +436,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( Function &F, ArrayRef IndirCalls) { if (IndirCalls.empty()) return; - if (!Options.TracePC && !Options.TracePCGuard) - return; + assert(Options.TracePC || Options.TracePCGuard); for (auto I : IndirCalls) { IRBuilder<> IRB(I); CallSite CS(I); @@ -625,13 +543,8 @@ void SanitizerCoverageModule::InjectTraceForCmp( } } -void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { - I->setMetadata(I->getModule()->getMDKindID("nosanitize"), - MDNode::get(*C, None)); -} - void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, - size_t Idx, bool UseCalls) { + size_t Idx) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); DebugLoc EntryLoc; @@ -651,47 +564,14 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.TracePC) { IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC. IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. - } else if (Options.TracePCGuard) { + } else { + assert(Options.TracePCGuard); auto GuardPtr = IRB.CreateIntToPtr( IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy), ConstantInt::get(IntptrTy, Idx * 4)), Int32PtrTy); - if (!UseCalls) { - auto GuardLoad = IRB.CreateLoad(GuardPtr); - GuardLoad->setAtomic(AtomicOrdering::Monotonic); - GuardLoad->setAlignment(8); - SetNoSanitizeMetadata(GuardLoad); // Don't instrument with e.g. asan. 
- auto Cmp = IRB.CreateICmpNE( - GuardLoad, Constant::getNullValue(GuardLoad->getType())); - auto Ins = SplitBlockAndInsertIfThen( - Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - } IRB.CreateCall(SanCovTracePCGuard, GuardPtr); IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. - } else { - Value *GuardP = IRB.CreateAdd( - IRB.CreatePointerCast(GuardArray, IntptrTy), - ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); - GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); - if (UseCalls) { - IRB.CreateCall(SanCovWithCheckFunction, GuardP); - } else { - LoadInst *Load = IRB.CreateLoad(GuardP); - Load->setAtomic(AtomicOrdering::Monotonic); - Load->setAlignment(4); - SetNoSanitizeMetadata(Load); - Value *Cmp = - IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); - Instruction *Ins = SplitBlockAndInsertIfThen( - Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); - IRB.SetInsertPoint(Ins); - IRB.SetCurrentDebugLocation(EntryLoc); - // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. - IRB.CreateCall(SanCovFunction, GuardP); - IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. - } } } diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index ee493a8ec7e1..7b625b9b136e 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -305,7 +305,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { /// Infer nonnull attributes for the arguments at the specified callsite. static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { - SmallVector Indices; + SmallVector ArgNos; unsigned ArgNo = 0; for (Value *V : CS.args()) { @@ -318,18 +318,19 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), CS.getInstruction()) == LazyValueInfo::False) - Indices.push_back(ArgNo + AttributeList::FirstArgIndex); + ArgNos.push_back(ArgNo); ArgNo++; } assert(ArgNo == CS.arg_size() && "sanity check"); - if (Indices.empty()) + if (ArgNos.empty()) return false; AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); - AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + AS = AS.addParamAttribute(Ctx, ArgNos, + Attribute::get(Ctx, Attribute::NonNull)); CS.setAttributes(AS); return true; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0d6e0538261d..0490d93f6455 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,10 +80,9 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; - bool commutative; SmallVector varargs; - Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} + Expression(uint32_t o = ~2U) : opcode(o) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -247,7 +246,6 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); - e.commutative = true; } if (CmpInst *C = dyn_cast(I)) { @@ -258,7 +256,6 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); 
} e.opcode = (C->getOpcode() << 8) | Predicate; - e.commutative = true; } else if (InsertValueInst *E = dyn_cast(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -284,7 +281,6 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; - e.commutative = true; return e; } @@ -352,25 +348,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); - if (PHINode *PN = dyn_cast(V)) - NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t e = assignExpNewValueNum(exp).first; + uint32_t &e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - auto ValNum = assignExpNewValueNum(exp); - if (ValNum.second) { - valueNumbering[C] = ValNum.first; - return ValNum.first; + uint32_t &e = expressionNumbering[exp]; + if (!e) { + e = nextValueNumber++; + valueNumbering[C] = e; + return e; } if (!MD) { - uint32_t e = assignExpNewValueNum(exp).first; + e = nextValueNumber++; valueNumbering[C] = e; return e; } @@ -526,29 +522,23 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast(I)); break; - case Instruction::PHI: - valueNumbering[V] = nextValueNumber; - NumberingPhi[nextValueNumber] = cast(V); - return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t e = assignExpNewValueNum(exp).first; + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { +uint32_t GVN::ValueTable::lookup(Value *V) const { DenseMap::const_iterator VI = valueNumbering.find(V); - if (Verify) { - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; - } - return (VI != valueNumbering.end()) ? VI->second : 0; + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; } /// Returns the value number of the given comparison, @@ -559,29 +549,21 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - return assignExpNewValueNum(exp).first; + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + return e; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); - NumberingPhi.clear(); - PhiTranslateTable.clear(); - BlockRPONumber.clear(); nextValueNumber = 1; - Expressions.clear(); - ExprIdx.clear(); - nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { - uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); - // If V is PHINode, V <--> value number is an one-to-one mapping. 
- if (isa(V)) - NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -1469,104 +1451,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } -/// Return a pair the first field showing the value number of \p Exp and the -/// second field showing whether it is a value number newly created. -std::pair -GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { - uint32_t &e = expressionNumbering[Exp]; - bool CreateNewValNum = !e; - if (CreateNewValNum) { - Expressions.push_back(Exp); - if (ExprIdx.size() < nextValueNumber + 1) - ExprIdx.resize(nextValueNumber * 2); - e = nextValueNumber; - ExprIdx[nextValueNumber++] = nextExprNumber++; - } - return {e, CreateNewValNum}; -} - -void GVN::ValueTable::assignBlockRPONumber(Function &F) { - uint32_t NextBlockNumber = 1; - ReversePostOrderTraversal RPOT(&F); - for (BasicBlock *BB : RPOT) - BlockRPONumber[BB] = NextBlockNumber++; -} - -/// Return whether all the values related with the same \p num are -/// defined in \p BB. -bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, - GVN &Gvn) { - LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; - while (Vals && Vals->BB == BB) - Vals = Vals->Next; - return !Vals; -} - -/// Wrap phiTranslateImpl to provide caching functionality. -uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, - const BasicBlock *PhiBlock, uint32_t Num, - GVN &Gvn) { - auto FindRes = PhiTranslateTable.find({Num, Pred}); - if (FindRes != PhiTranslateTable.end()) - return FindRes->second; - uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); - PhiTranslateTable.insert({{Num, Pred}, NewNum}); - return NewNum; -} - -/// Translate value number \p Num using phis, so that it has the values of -/// the phis in BB. -uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, - const BasicBlock *PhiBlock, - uint32_t Num, GVN &Gvn) { - if (PHINode *PN = NumberingPhi[Num]) { - if (BlockRPONumber[Pred] >= BlockRPONumber[PhiBlock]) - return Num; - for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) - return TransVal; - } - return Num; - } - - // If there is any value related with Num is defined in a BB other than - // PhiBlock, it cannot depend on a phi in PhiBlock without going through - // a backedge. We can do an early exit in that case to save compile time. - if (!areAllValsInBB(Num, PhiBlock, Gvn)) - return Num; - - if (ExprIdx[Num] == 0 || Num >= ExprIdx.size()) - return Num; - Expression Exp = Expressions[ExprIdx[Num]]; - - for (unsigned i = 0; i < Exp.varargs.size(); i++) { - // For InsertValue and ExtractValue, some varargs are index numbers - // instead of value numbers. Those index numbers should not be - // translated. 
- if ((i > 1 && Exp.opcode == Instruction::InsertValue) || - (i > 0 && Exp.opcode == Instruction::ExtractValue)) - continue; - Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); - } - - if (Exp.commutative) { - assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); - if (Exp.varargs[0] > Exp.varargs[1]) { - std::swap(Exp.varargs[0], Exp.varargs[1]); - uint32_t Opcode = Exp.opcode >> 8; - if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) - Exp.opcode = (Opcode << 8) | - CmpInst::getSwappedPredicate( - static_cast(Exp.opcode & 255)); - } - } - - if (uint32_t NewNum = expressionNumbering[Exp]) - return NewNum; - return Num; -} - // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1972,7 +1856,6 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); - VN.assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -2062,9 +1945,7 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - uint32_t TValNo = - VN.phiTranslate(Pred, Instr->getParent(), VN.lookup(Op), *this); - if (Value *V = findLeader(Pred, TValNo)) { + if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { success = false; @@ -2081,12 +1962,10 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - - unsigned Num = VN.lookupOrAdd(Instr); - VN.add(Instr, Num); + VN.add(Instr, ValNo); // Update the availability map to include the new instruction. - addToLeaderTable(Num, Instr, Pred); + addToLeaderTable(ValNo, Instr, Pred); return true; } @@ -2135,8 +2014,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } - uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); - Value *predV = findLeader(P, TValNo); + Value *predV = findLeader(P, ValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast(nullptr), P)); PREPred = P; diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index a143b9a3c645..930696b036c0 100644 --- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -98,11 +98,20 @@ template static bool handleBrSelExpect(BrSelInst &BSI) { CallInst *CI; ICmpInst *CmpI = dyn_cast(BSI.getCondition()); + CmpInst::Predicate Predicate; + uint64_t ValueComparedTo = 0; if (!CmpI) { CI = dyn_cast(BSI.getCondition()); + Predicate = CmpInst::ICMP_NE; + ValueComparedTo = 0; } else { - if (CmpI->getPredicate() != CmpInst::ICMP_NE) + Predicate = CmpI->getPredicate(); + if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ) return false; + ConstantInt *CmpConstOperand = dyn_cast(CmpI->getOperand(1)); + if (!CmpConstOperand) + return false; + ValueComparedTo = CmpConstOperand->getZExtValue(); CI = dyn_cast(CmpI->getOperand(0)); } @@ -121,9 +130,8 @@ template static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; - // If expect value is equal to 1 it means that we are more likely to take - // branch 0, in other case more likely is branch 1. 
- if (ExpectedValue->isOne()) + if ((ExpectedValue->getZExtValue() == ValueComparedTo) == + (Predicate == CmpInst::ICMP_EQ)) Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); else Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 5e9f40019ce8..27809f5b6f66 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -613,7 +613,7 @@ class NewGVN { return CClass; } void initializeCongruenceClasses(Function &F); - const Expression *makePossiblePhiOfOps(Instruction *, bool, + const Expression *makePossiblePhiOfOps(Instruction *, SmallPtrSetImpl &); void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue); @@ -1937,7 +1937,8 @@ void NewGVN::touchAndErase(Map &M, const KeyType &Key) { } void NewGVN::addAdditionalUsers(Value *To, Value *User) const { - AdditionalUsers[To].insert(User); + if (isa(To)) + AdditionalUsers[To].insert(User); } void NewGVN::markUsersTouched(Value *V) { @@ -2423,7 +2424,7 @@ static bool okayForPHIOfOps(const Instruction *I) { // When we see an instruction that is an op of phis, generate the equivalent phi // of ops form. const Expression * -NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, +NewGVN::makePossiblePhiOfOps(Instruction *I, SmallPtrSetImpl &Visited) { if (!okayForPHIOfOps(I)) return nullptr; @@ -2438,24 +2439,6 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, return nullptr; unsigned IDFSNum = InstrToDFSNum(I); - // Pretty much all of the instructions we can convert to phi of ops over a - // backedge that are adds, are really induction variables, and those are - // pretty much pointless to convert. This is very coarse-grained for a - // test, so if we do find some value, we can change it later. - // But otherwise, what can happen is we convert the induction variable from - // - // i = phi (0, tmp) - // tmp = i + 1 - // - // to - // i = phi (0, tmpphi) - // tmpphi = phi(1, tmpphi+1) - // - // Which we don't want to happen. We could just avoid this for all non-cycle - // free phis, and we made go that route. - if (HasBackedge && I->getOpcode() == Instruction::Add) - return nullptr; - SmallPtrSet ProcessedPHIs; // TODO: We don't do phi translation on memory accesses because it's // complicated. For a load, we'd need to be able to simulate a new memoryuse, @@ -2470,6 +2453,16 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge, // Convert op of phis to phi of ops for (auto &Op : I->operands()) { + // TODO: We can't handle expressions that must be recursively translated + // IE + // a = phi (b, c) + // f = use a + // g = f + phi of something + // To properly make a phi of ops for g, we'd have to properly translate and + // use the instruction for f. We should add this by splitting out the + // instruction creation we do below. 
+ if (isa(Op) && PHINodeUses.count(cast(Op))) + return nullptr; if (!isa(Op)) continue; auto *OpPHI = cast(Op); @@ -2782,8 +2775,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) { // Make a phi of ops if necessary if (Symbolized && !isa(Symbolized) && !isa(Symbolized) && PHINodeUses.count(I)) { - // FIXME: Backedge argument - auto *PHIE = makePossiblePhiOfOps(I, false, Visited); + auto *PHIE = makePossiblePhiOfOps(I, Visited); if (PHIE) Symbolized = PHIE; } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index ed72099ec3ed..24d28a6c2831 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" @@ -141,16 +142,77 @@ static bool definedInCaller(const SetVector &Blocks, Value *V) { return false; } -void CodeExtractor::findInputsOutputs(ValueSet &Inputs, - ValueSet &Outputs) const { +void CodeExtractor::findAllocas(ValueSet &SinkCands) const { + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + for (Instruction &II : BB) { + auto *AI = dyn_cast(&II); + if (!AI) + continue; + + // Returns true if matching life time markers are found within + // the outlined region. + auto GetLifeTimeMarkers = [&](Instruction *Addr) { + Instruction *LifeStart = nullptr, *LifeEnd = nullptr; + for (User *U : Addr->users()) { + if (!definedInRegion(Blocks, U)) + return false; + + IntrinsicInst *IntrInst = dyn_cast(U); + if (IntrInst) { + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) + LifeStart = IntrInst; + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + LifeEnd = IntrInst; + } + } + return LifeStart && LifeEnd; + }; + + if (GetLifeTimeMarkers(AI)) { + SinkCands.insert(AI); + continue; + } + + // Follow the bitcast: + Instruction *MarkerAddr = nullptr; + for (User *U : AI->users()) { + if (U->stripPointerCasts() == AI) { + Instruction *Bitcast = cast(U); + if (GetLifeTimeMarkers(Bitcast)) { + MarkerAddr = Bitcast; + continue; + } + } + if (!definedInRegion(Blocks, U)) { + MarkerAddr = nullptr; + break; + } + } + if (MarkerAddr) { + if (!definedInRegion(Blocks, MarkerAddr)) + SinkCands.insert(MarkerAddr); + SinkCands.insert(AI); + } + } + } +} + +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, + const ValueSet &SinkCands) const { + for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) - if (definedInCaller(Blocks, *OI)) - Inputs.insert(*OI); + ++OI) { + Value *V = *OI; + if (!SinkCands.count(V) && definedInCaller(Blocks, V)) + Inputs.insert(V); + } for (User *U : II.users()) if (!definedInRegion(Blocks, U)) { @@ -718,7 +780,7 @@ Function *CodeExtractor::extractCodeRegion() { if (!isEligible()) return nullptr; - ValueSet inputs, outputs; + ValueSet inputs, outputs, SinkingCands; // Assumption: this is a single-entry code region, and the header is the first // block in the region. 
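The CodeExtractor change above sinks an alloca into the outlined region only when all of its lifetime markers already live inside that region. A minimal standalone sketch of that user scan, assuming nothing beyond the public llvm::IntrinsicInst API (the helper name is invented), looks like this:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Sketch: true if Addr (typically an AllocaInst) has both a lifetime.start
    // and a lifetime.end intrinsic among its direct users, the same shape as
    // the GetLifeTimeMarkers lambda in the hunk above (minus the region check).
    static bool hasLifetimeMarkers(Instruction *Addr) {
      Instruction *LifeStart = nullptr, *LifeEnd = nullptr;
      for (User *U : Addr->users()) {
        if (auto *II = dyn_cast<IntrinsicInst>(U)) {
          if (II->getIntrinsicID() == Intrinsic::lifetime_start)
            LifeStart = II;
          else if (II->getIntrinsicID() == Intrinsic::lifetime_end)
            LifeEnd = II;
        }
      }
      return LifeStart && LifeEnd; // both markers must be present
    }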
@@ -757,8 +819,15 @@ Function *CodeExtractor::extractCodeRegion() { "newFuncRoot"); newFuncRoot->getInstList().push_back(BranchInst::Create(header)); + findAllocas(SinkingCands); + // Find inputs to, outputs from the code region. - findInputsOutputs(inputs, outputs); + findInputsOutputs(inputs, outputs, SinkingCands); + + // Now sink all instructions which only have non-phi uses inside the region + for (auto *II : SinkingCands) + cast(II)->moveBefore(*newFuncRoot, + newFuncRoot->getFirstInsertionPt()); // Calculate the exit blocks for the extracted region and the total exit // weights for each of those blocks. diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index 8877aeafecde..9e71cba4f1b7 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -541,7 +541,40 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallPtrSetImpl &OpsToRename) { +void PredicateInfo::renameUses(SmallPtrSetImpl &OpSet) { + // Sort OpsToRename since we are going to iterate it. + SmallVector OpsToRename(OpSet.begin(), OpSet.end()); + std::sort(OpsToRename.begin(), OpsToRename.end(), [&](const Value *A, + const Value *B) { + auto *ArgA = dyn_cast_or_null(A); + auto *ArgB = dyn_cast_or_null(B); + + // If A and B are args, order them based on their arg no. + if (ArgA && !ArgB) + return true; + if (ArgB && !ArgA) + return false; + if (ArgA && ArgB) + return ArgA->getArgNo() < ArgB->getArgNo(); + + // Else, A and B are instructions. + // If they belong to different BBs, order them by the dominance of BBs. + auto *AInst = cast(A); + auto *BInst = cast(B); + if (AInst->getParent() != BInst->getParent()) + return DT.dominates(AInst->getParent(), BInst->getParent()); + + // Else, A and B belong to the same BB. + // Order A and B by their dominance. + auto *BB = AInst->getParent(); + auto LookupResult = OBBMap.find(BB); + if (LookupResult != OBBMap.end()) + return LookupResult->second->dominates(AInst, BInst); + + auto Result = OBBMap.insert({BB, make_unique(BB)}); + return Result.first->second->dominates(AInst, BInst); + }); + ValueDFS_Compare Compare(OBBMap); // Compute liveness, and rename in O(uses) per Op. for (auto *Op : OpsToRename) { diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 49effda5d833..cc6c47e8f978 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -85,20 +85,6 @@ static bool isCallingConvCCompatible(CallInst *CI) { return false; } -/// Return true if it only matters that the value is equal or not-equal to zero. -static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (User *U : V->users()) { - if (ICmpInst *IC = dyn_cast(U)) - if (IC->isEquality()) - if (Constant *C = dyn_cast(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - /// Return true if it is only used in equality comparisons with With.
static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 2b83b8426d14..8b9a64c220cc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7170,10 +7170,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - // Note: Even if all instructions are scalarized, return true if any memory - // accesses appear in the loop to get benefits from address folding etc. bool TypeNotScalarized = - VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; return VectorizationCostTy(C, TypeNotScalarized); } @@ -7312,7 +7310,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. @@ -7445,9 +7443,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - SmallVector Operands(I->operand_values()); - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + SmallVector Operands(I->operand_values()); + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7470,7 +7469,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - VectorTy = ToVectorTy(getMemInstValueType(I), VF); + unsigned Width = VF; + if (Width > 1) { + InstWidening Decision = getWideningDecision(I, Width); + assert(Decision != CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == CM_Scalarize) + Width = 1; + } + VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: @@ -7495,7 +7502,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } Type *SrcScalarTy = I->getOperand(0)->getType(); - Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + Type *SrcVecTy = + VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; if (canTruncateToMinimalBitwidth(I, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, @@ -7515,7 +7523,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; + return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); } case Instruction::Call: { bool NeedToScalarize; diff --git a/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll new file mode 100644 index 000000000000..c1d25c1e3c21 --- /dev/null +++ b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll @@ -0,0 +1,18 @@ +; Ensures that our struct ops are sane. + +; RUN: opt < %s -disable-basicaa -cfl-anders-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -aa-pipeline=cfl-anders-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; Since we ignore non-pointer values, we effectively ignore extractvalue +; instructions. This means that %c "doesn't exist" in test_structure's graph, +; so we currently get MayAlias. +; XFAIL: * + +; CHECK-LABEL: Function: test_structure +; CHECK: NoAlias: i64** %c, { i64**, i64** }* %a +define void @test_structure() { + %a = alloca {i64**, i64**}, align 8 + %b = load {i64**, i64**}, {i64**, i64**}* %a + %c = extractvalue {i64**, i64**} %b, 0 + ret void +} diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll index 8cc60ad63362..566f3a077e7b 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph.ll @@ -11,20 +11,23 @@ ; RUN: llvm-lto -thinlto-index-stats %p/Inputs/thinlto-function-summary-callgraph-combined.1.bc | FileCheck %s --check-prefix=OLD-COMBINED ; CHECK: +; CHECK-NEXT: ; CHECK: ; COMBINED-NEXT: @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -72,22 +74,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesea: -; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKCORTEX: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; 
CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]] @@ -173,22 +175,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesda: -; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKCORTEX: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]] diff --git a/test/CodeGen/AArch64/pr33172.ll b/test/CodeGen/AArch64/pr33172.ll new file mode 100644 index 000000000000..1e1da78b28ff --- /dev/null +++ b/test/CodeGen/AArch64/pr33172.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK-LABEL: pr33172 +; CHECK: ldp +; CHECK: stp + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios10.3.0" + +@main.b = external global [200 x float], align 8 +@main.x = external global [200 x float], align 8 + +; Function Attrs: nounwind ssp +define void @pr33172() local_unnamed_addr { +entry: + %wide.load8281058.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 12) to i64*), align 8 + %wide.load8291059.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 14) to i64*), align 8 + store i64 %wide.load8281058.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 12) to i64*), align 8 + store i64 %wide.load8291059.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 14) to i64*), align 8 + %wide.load8281058.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 16) to i64*), align 8 + %wide.load8291059.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 18) to i64*), align 8 + store i64 %wide.load8281058.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x 
float]* @main.x, i64 0, i64 16) to i64*), align 8 + store i64 %wide.load8291059.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 18) to i64*), align 8 + tail call void @llvm.memset.p0i8.i64(i8* bitcast ([200 x float]* @main.b to i8*), i8 0, i64 undef, i32 8, i1 false) #2 + unreachable +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 + +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll index a3a78d326a62..02642142ae2c 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_swizzle: -; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 +; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11") ; CHECK: s_waitcnt lgkmcnt define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir index 064db49924e1..720642ad1ddb 100644 --- a/test/CodeGen/AMDGPU/merge-m0.mir +++ b/test/CodeGen/AMDGPU/merge-m0.mir @@ -50,7 +50,6 @@ name: test alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index cd50e01032c3..cd0d410368c7 100644 --- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -86,7 +86,6 @@ name: sdwa_imm_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false @@ -248,7 +247,6 @@ body: | name: sdwa_sgpr_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir new file mode 100644 index 000000000000..44dbd38f2d30 --- /dev/null +++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -0,0 +1,33 @@ +# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s + +--- | + define float @waitcnt-permute(i32 %x, i32 %y) { + entry: + %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) + %1 = bitcast i32 %0 to float + %2 = fadd float 1.000000e+00, %1 + ret float %2 + } + + declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) + +... +--- +# CHECK-LABEL: name: waitcnt-permute{{$}} +# CHECK: DS_BPERMUTE_B32 +# CHECK-NEXT: S_WAITCNT 127 + +name: waitcnt-permute +liveins: + - { reg: '%vgpr0' } + - { reg: '%vgpr1' } + - { reg: '%sgpr30_sgpr31' } +body: | + bb.0: + liveins: %vgpr0, %vgpr1, %sgpr30_sgpr31 + + %vgpr0 = DS_BPERMUTE_B32 killed %vgpr0, killed %vgpr1, 0, implicit %exec + %vgpr0 = V_ADD_F32_e32 1065353216, killed %vgpr0, implicit %exec + S_SETPC_B64_return killed %sgpr30_sgpr31, implicit killed %vgpr0 + +... 
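The llvm.amdgcn.ds.swizzle.ll hunk above switches the expected output from the raw immediate offset:100 to its decoded form swizzle(BITMASK_PERM,"00p11"). As a sanity check on that string, here is a minimal C++ sketch of the bitmask-perm decoding, assuming the usual field layout (and_mask in bits 4:0, or_mask in bits 9:5, xor_mask in bits 14:10); it is an illustration, not the in-tree AMDGPU printer code.

#include <cstdint>
#include <iostream>
#include <string>

// Decode a DS swizzle immediate in BITMASK_PERM mode (bit 15 clear).
// For each of the 5 lane-select bits, MSB first:
//   and bit 0 -> the or bit is forced in ('0' or '1')
//   and bit 1 -> the lane bit passes through ('p') or is inverted ('i')
std::string decodeBitmaskPerm(uint16_t Imm) {
  unsigned And = Imm & 0x1F;
  unsigned Or  = (Imm >> 5) & 0x1F;
  unsigned Xor = (Imm >> 10) & 0x1F;
  std::string S = "swizzle(BITMASK_PERM,\"";
  for (int Bit = 4; Bit >= 0; --Bit) {
    unsigned Mask = 1u << Bit;
    if (And & Mask)
      S += (Xor & Mask) ? 'i' : 'p';
    else
      S += (Or & Mask) ? '1' : '0';
  }
  return S + "\")";
}

// 100 = 0b00000'00011'00100: and=4, or=3, xor=0 -> "00p11", as the test expects.
int main() { std::cout << decodeBitmaskPerm(100) << '\n'; }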
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll index f8ad2bbbbe0e..a3be72112c76 100644 --- a/test/CodeGen/ARM/cmpxchg-O0.ll +++ b/test/CodeGen/ARM/cmpxchg-O0.ll @@ -10,10 +10,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK: dmb ish ; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexb [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexb [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -29,10 +30,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind ; CHECK: dmb ish ; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexh [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexh [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -48,10 +50,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrex [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strex [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: diff --git a/test/CodeGen/ARM/v6-jumptable-clobber.mir b/test/CodeGen/ARM/v6-jumptable-clobber.mir index 0e9bc42565f3..6577ef848671 100644 --- a/test/CodeGen/ARM/v6-jumptable-clobber.mir +++ b/test/CodeGen/ARM/v6-jumptable-clobber.mir @@ -190,7 +190,6 @@ name: foo alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false @@ -289,7 +288,6 @@ body: | name: bar alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AVR/rot.ll b/test/CodeGen/AVR/rot.ll index e43daf3e6aa8..a7b77d97ba69 100644 --- a/test/CodeGen/AVR/rot.ll +++ b/test/CodeGen/AVR/rot.ll @@ -6,7 +6,7 @@ define i8 @rol8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB0_2 ; CHECK-NEXT: LBB0_1: @@ -32,7 +32,7 @@ define i8 @rol8(i8 %val, i8 %amt) { define i8 @ror8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB1_2 ; CHECK-NEXT: LBB1_1: diff --git a/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir new file mode 100644 index 000000000000..2233e3289f11 --- /dev/null +++ b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir @@ -0,0 +1,17 @@ +# RUN: llc -march=hexagon -start-after if-converter %s -o - | FileCheck %s +# CHECK: p0 = r0 +# CHECK-NEXT: jumpr r31 + +# Make sure that the packetizer does not attempt to newify the J2_jumpr +# only because of the def-use of p0. + +--- +name: fred +tracksRegLiveness: true +body: | + bb.0: + liveins: %d0 + %p0 = C2_tfrrp %r0 + J2_jumpr %r31, implicit-def %pc, implicit %p0 +... 
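The pmpy-long-loop.ll test added in the next hunk explains, in its comment, why the Hexagon loop-idiom simplification must give up gracefully when its iteration cap is exceeded rather than crash "under suspicion of an infinite loop". A minimal sketch of that bounded-fixpoint pattern, with a hypothetical step callback and an assumed budget (not the actual HexagonLoopIdiomRecognition code):

#include <functional>

// Step() returns true while it still changes something. Returning false
// from this helper signals that the budget ran out; the caller then
// abandons the simplification instead of asserting non-termination.
bool simplifyToFixpoint(const std::function<bool()> &Step,
                        unsigned Budget = 100 /* assumed cap */) {
  while (Budget--)
    if (!Step())
      return true;  // fixpoint reached within budget
  return false;     // budget exhausted: gracefully abort
}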
+ diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll new file mode 100644 index 000000000000..b25010f2a90f --- /dev/null +++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll @@ -0,0 +1,62 @@ +; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s +; +; The number of nested selects caused the simplification loop to take +; more than the maximum number of iterations. This caused the compiler +; to crash under suspicion of an infinite loop. This (still reduced) +; testcase shows a legitimate case where this limit was exceeded. +; Instead of crashing, gracefully abort the simplification. +; +; Check for sane output. +; CHECK: define void @fred + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() unnamed_addr #0 { +b0: + %v1 = select i1 false, i32 undef, i32 2 + br label %b2 + +b2: ; preds = %b2, %b0 + %v3 = sext i16 undef to i32 + %v4 = add nsw i32 %v1, %v3 + %v5 = select i1 undef, i32 undef, i32 %v4 + %v6 = icmp slt i32 %v5, undef + %v7 = select i1 %v6, i32 %v5, i32 undef + %v8 = icmp slt i32 %v7, 0 + %v9 = select i1 %v8, i32 %v7, i32 0 + %v10 = sub i32 undef, undef + %v11 = add i32 %v10, %v9 + %v12 = sext i16 undef to i32 + %v13 = sext i16 undef to i32 + %v14 = add nsw i32 %v1, %v13 + %v15 = select i1 undef, i32 undef, i32 %v14 + %v16 = icmp slt i32 %v15, undef + %v17 = select i1 %v16, i32 %v15, i32 undef + %v18 = select i1 undef, i32 %v17, i32 %v12 + %v19 = add i32 undef, %v18 + %v20 = sext i16 undef to i32 + %v21 = sext i16 0 to i32 + %v22 = add nsw i32 %v1, %v21 + %v23 = sext i16 undef to i32 + %v24 = add nsw i32 %v1, %v23 + %v25 = select i1 undef, i32 undef, i32 %v24 + %v26 = icmp slt i32 %v25, %v22 + %v27 = select i1 %v26, i32 %v25, i32 %v22 + %v28 = icmp slt i32 %v27, %v20 + %v29 = select i1 %v28, i32 %v27, i32 %v20 + %v30 = add i32 undef, %v29 + %v31 = add i32 %v11, undef + %v32 = add i32 %v31, undef + %v33 = add i32 %v32, %v19 + %v34 = add i32 %v33, %v30 + %v35 = add nsw i32 %v34, 32768 + %v36 = icmp ult i32 %v35, 65536 + %v37 = select i1 %v36, i32 %v34, i32 undef + br i1 undef, label %b2, label %b38 + +b38: ; preds = %b2 + unreachable +} + +attributes #0 = { "target-cpu"="hexagonv60" } diff --git a/test/CodeGen/Hexagon/mul64-sext.ll b/test/CodeGen/Hexagon/mul64-sext.ll new file mode 100644 index 000000000000..8bbe6649a1fb --- /dev/null +++ b/test/CodeGen/Hexagon/mul64-sext.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +target triple = "hexagon-unknown--elf" + +; CHECK-LABEL: mul_1 +; CHECK: r1:0 = mpy(r2,r0) +define i64 @mul_1(i64 %a0, i64 %a1) #0 { +b2: + %v3 = shl i64 %a0, 32 + %v4 = ashr exact i64 %v3, 32 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 %v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_2 +; CHECK: r0 = memb(r0+#0) +; CHECK: r1:0 = mpy(r2,r0) +; CHECK: jumpr r31 +define i64 @mul_2(i8* %a0, i64 %a1) #0 { +b2: + %v3 = load i8, i8* %a0 + %v4 = sext i8 %v3 to i64 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 %v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_acc_1 +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = 
add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_acc_2 +; CHECK: r2 = memw(r2+#0) +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_2(i64 %a0, i32* %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = load i32, i32* %a1 + %v7 = sext i32 %v6 to i64 + %v8 = mul nsw i64 %v7, %v5 + %v9 = add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_1 +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_2 +; CHECK: r0 = memw(r0+#0) +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_2(i32* %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = load i32, i32* %a0 + %v5 = sext i32 %v4 to i64 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MIR/Generic/multiRunPass.mir b/test/CodeGen/MIR/Generic/multiRunPass.mir index bca007de80b7..bd1c0d0b458e 100644 --- a/test/CodeGen/MIR/Generic/multiRunPass.mir +++ b/test/CodeGen/MIR/Generic/multiRunPass.mir @@ -7,8 +7,8 @@ # This test ensures that the command line accepts # several run passes on the same command line and # actually creates the proper pipeline for it. -# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt -# PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos +# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt +# PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos # Make sure there are no other passes happening after what we asked. 
# CHECK-NEXT: --- | diff --git a/test/CodeGen/Mips/compactbranches/empty-block.mir b/test/CodeGen/Mips/compactbranches/empty-block.mir index 7831e51e3157..7fb1afae9121 100644 --- a/test/CodeGen/Mips/compactbranches/empty-block.mir +++ b/test/CodeGen/Mips/compactbranches/empty-block.mir @@ -39,7 +39,6 @@ name: l5 alignment: 2 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/PowerPC/expand-isel.ll b/test/CodeGen/PowerPC/expand-isel.ll index 553cc3c372e5..c8707bda8e84 100644 --- a/test/CodeGen/PowerPC/expand-isel.ll +++ b/test/CodeGen/PowerPC/expand-isel.ll @@ -212,13 +212,14 @@ cleanup: ret i32 %retval.0 ; CHECK-LABEL: @testComplexISEL -; CHECK: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r12, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: blr +; CHECK-DAG: [[LI:r[0-9]+]], 1 +; CHECK-DAG: cmplwi [[LD:r[0-9]+]], 0 +; CHECK: beq cr0, [[EQ:.LBB[0-9_]+]] +; CHECK: blr +; CHECK: [[EQ]] +; CHECK: xor [[XOR:r[0-9]+]] +; CHECK: cntlzd [[CZ:r[0-9]+]], [[XOR]] +; CHECK: rldicl [[SH:r[0-9]+]], [[CZ]], 58, 63 } !1 = !{!2, !2, i64 0} diff --git a/test/CodeGen/PowerPC/logic-ops-on-compares.ll b/test/CodeGen/PowerPC/logic-ops-on-compares.ll new file mode 100644 index 000000000000..df021c20ea86 --- /dev/null +++ b/test/CodeGen/PowerPC/logic-ops-on-compares.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +; Function Attrs: nounwind +define signext i32 @logic_ne_32(i32 signext %a, i32 signext %b, i32 signext %c) { +; CHECK-LABEL: logic_ne_32: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzw r5, r5 +; CHECK-NEXT: cntlzw r6, r7 +; CHECK-NEXT: srwi r6, r6, 5 +; CHECK-NEXT: srwi r5, r5, 5 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i32 %a, %b + %tobool1 = icmp eq i32 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i32 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call signext i32 @foo(i32 signext %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call signext i32 @bar(i32 signext %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i32 [ %call4, %if.end ], [ %call, %if.then ] + ret i32 %retval.0 +} + +define void @neg_truncate_i32(i32 *%ptr) { +; CHECK-LABEL: neg_truncate_i32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lwz r3, 0(r3) +; CHECK-NEXT: rldicl. 
r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB1_2: # %if.end29 +entry: + %0 = load i32, i32* %ptr, align 4 + %rem17127 = and i32 %0, 1 + %cmp18 = icmp eq i32 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +; Function Attrs: nounwind +define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: logic_ne_64: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzd r6, r7 +; CHECK-NEXT: cntlzd r5, r5 +; CHECK-NEXT: rldicl r6, r6, 58, 63 +; CHECK-NEXT: rldicl r5, r5, 58, 63 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i64 %a, %b + %tobool1 = icmp eq i64 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i64 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i64 @foo64(i64 %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call i64 @bar64(i64 %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i64 [ %call4, %if.end ], [ %call, %if.then ] + ret i64 %retval.0 +} + +define void @neg_truncate_i64(i64 *%ptr) { +; CHECK-LABEL: neg_truncate_i64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: ld r3, 0(r3) +; CHECK-NEXT: rldicl. r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB3_2: # %if.end29 +entry: + %0 = load i64, i64* %ptr, align 4 + %rem17127 = and i64 %0, 1 + %cmp18 = icmp eq i64 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +declare signext i32 @foo(i32 signext) +declare signext i32 @bar(i32 signext) +declare i64 @foo64(i64) +declare i64 @bar64(i64) diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll new file mode 100644 index 000000000000..3095429758f6 --- /dev/null +++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,121 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, 
i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 
+ ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} diff --git a/test/CodeGen/PowerPC/memcmp.ll b/test/CodeGen/PowerPC/memcmp.ll new file mode 100644 index 000000000000..bae713cb2072 --- /dev/null +++ b/test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ldbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. 
[[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 diff --git a/test/CodeGen/PowerPC/memcmpIR.ll b/test/CodeGen/PowerPC/memcmpIR.ll new file mode 100644 index 000000000000..f052cc258df8 --- /dev/null +++ b/test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,194 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { +entry: + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], 
label %res_block, label %endblock + + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) + ret i32 %call +} + +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) + ret i32 %call +} + +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext 
i16 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) + ret i32 %call +} + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) + ret i32 %call +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) { + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) + ret i32 %call +} diff --git a/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll new file mode 100644 index 000000000000..7ca5332865ca --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll @@ -0,0 +1,49 @@ +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 
-enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -march=ppc64 -mcpu=a2 -enable-ppc-prefetching=true | FileCheck %s -check-prefix=CHECK-DCBT + +; Function Attrs: nounwind +define signext i32 @check_cache_line() local_unnamed_addr { +entry: + %call = tail call i32* bitcast (i32* (...)* @magici to i32* ()*)() + %call115 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %cmp16 = icmp sgt i32 %call115, 0 + br i1 %cmp16, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %res.017 = phi i32 [ %add5, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %call, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %res.017 + %1 = add nuw nsw i64 %indvars.iv, 16 + %arrayidx4 = getelementptr inbounds i32, i32* %call, i64 %1 + %2 = load i32, i32* %arrayidx4, align 4 + %add5 = add nsw i32 %add, %2 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %call1 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %3 = sext i32 %call1 to i64 + %cmp = icmp slt i64 %indvars.iv.next, %3 + br i1 %cmp, label %for.body, label %for.cond.cleanup +; CHECK-LABEL: check_cache_line +; CHECK: dcbt +; CHECK-NOT: dcbt +; CHECK: blr +; CHECK-DCBT-LABEL: check_cache_line +; CHECK-DCBT: dcbt +; CHECK-DCBT: dcbt +; CHECK-DCBT: blr +} + +declare i32* @magici(...) local_unnamed_addr + +declare signext i32 @iter(...) 
local_unnamed_addr + diff --git a/test/CodeGen/PowerPC/pristine-and-livein.mir b/test/CodeGen/PowerPC/pristine-and-livein.mir deleted file mode 100644 index 6d93bb68c102..000000000000 --- a/test/CodeGen/PowerPC/pristine-and-livein.mir +++ /dev/null @@ -1,330 +0,0 @@ -# RUN: llc -run-pass=post-RA-sched %s -o - | FileCheck %s - -# CHECK: callee-saved-register: '[[REG:%x[0-9]+]]' -# CHECK: callee-saved-register: '{{%x[0-9]+}}' -# CHECK-NOT: [[REG]] = LI8 0 -# CHECK: STD killed [[REG]], ---- | - ; ModuleID = '' - source_filename = "bugpoint-output-4d91ae2.bc" - target datalayout = "e-m:e-i64:64-n32:64" - target triple = "powerpc64le--linux-gnu" - - ; Function Attrs: norecurse nounwind readonly - define i64 @adler32_z(i64 %adler, i8* readonly %buf, i64 %len) local_unnamed_addr #0 { - entry: - %shr = lshr i64 %adler, 16 - %and = and i64 %shr, 65535 - %and1 = and i64 %adler, 65535 - br i1 undef, label %if.then, label %if.end15 - - if.then: ; preds = %entry - %add5 = add nsw i64 %and1, %and - %sub9 = add nsw i64 %add5, 281474976645135 - %shl = shl i64 %add5, 16 - %or = or i64 %shl, %and1 - br label %cleanup - - if.end15: ; preds = %entry - br i1 undef, label %while.cond.preheader, label %while.cond30.preheader - - while.cond30.preheader: ; preds = %if.end15 - br i1 undef, label %while.body33.preheader, label %while.body109.preheader - - while.body33.preheader: ; preds = %while.cond30.preheader - br label %while.body33 - - while.cond.preheader: ; preds = %if.end15 - %sub25 = add i64 %and1, -65521 - %rem = urem i64 %and, 65521 - %shl27 = shl nuw nsw i64 %rem, 16 - %or28 = or i64 %shl27, %and1 - br label %cleanup - - while.body33: ; preds = %do.end, %while.body33.preheader - %indvar = phi i64 [ %indvar.next, %do.end ], [ 0, %while.body33.preheader ] - %sum2.2385 = phi i64 [ %rem102, %do.end ], [ %and, %while.body33.preheader ] - %len.addr.1384 = phi i64 [ %sub34, %do.end ], [ %len, %while.body33.preheader ] - %buf.addr.1383 = phi i8* [ %scevgep390, %do.end ], [ %buf, %while.body33.preheader ] - %adler.addr.3382 = phi i64 [ %rem101, %do.end ], [ %and1, %while.body33.preheader ] - %0 = mul i64 %indvar, 5552 - %1 = add i64 %0, -13 - %scevgep2 = getelementptr i8, i8* %buf, i64 %1 - %sub34 = add i64 %len.addr.1384, -5552 - call void @llvm.ppc.mtctr.i64(i64 347) - br label %do.body - - do.body: ; preds = %do.body, %while.body33 - %adler.addr.4 = phi i64 [ %adler.addr.3382, %while.body33 ], [ %add49, %do.body ] - %sum2.3 = phi i64 [ %sum2.2385, %while.body33 ], [ %add98, %do.body ] - %tmp15.phi = phi i8* [ %scevgep2, %while.body33 ], [ %tmp15.inc, %do.body ] - %tmp15.inc = getelementptr i8, i8* %tmp15.phi, i64 16 - %add38 = add i64 %adler.addr.4, %sum2.3 - %add42 = add i64 %add38, %adler.addr.4 - %add46 = add i64 %add42, %adler.addr.4 - %tmp15 = load i8, i8* %tmp15.inc, align 1, !tbaa !1 - %conv48 = zext i8 %tmp15 to i64 - %add49 = add i64 %adler.addr.4, %conv48 - %add50 = add i64 %add46, %add49 - %add54 = add i64 %add50, %add49 - %add58 = add i64 %add54, %add49 - %add62 = add i64 %add58, %add49 - %add66 = add i64 %add62, %add49 - %add70 = add i64 %add66, %add49 - %add74 = add i64 %add70, %add49 - %add78 = add i64 %add74, %add49 - %add82 = add i64 %add78, %add49 - %add86 = add i64 %add82, %add49 - %add90 = add i64 %add86, %add49 - %add94 = add i64 %add90, %add49 - %add98 = add i64 %add94, %add49 - %2 = call i1 @llvm.ppc.is.decremented.ctr.nonzero() - br i1 %2, label %do.body, label %do.end - - do.end: ; preds = %do.body - %scevgep390 = getelementptr i8, i8* %buf.addr.1383, i64 5552 - %rem101 = urem i64 
%add49, 65521 - %rem102 = urem i64 %add98, 65521 - %cmp31 = icmp ugt i64 %sub34, 5551 - %indvar.next = add i64 %indvar, 1 - br i1 %cmp31, label %while.body33, label %while.end103 - - while.end103: ; preds = %do.end - br i1 undef, label %if.end188, label %while.body109.preheader - - while.body109.preheader: ; preds = %while.end103, %while.cond30.preheader - %buf.addr.1.lcssa394400 = phi i8* [ %buf, %while.cond30.preheader ], [ %scevgep390, %while.end103 ] - %arrayidx151 = getelementptr inbounds i8, i8* %buf.addr.1.lcssa394400, i64 10 - %tmp45 = load i8, i8* %arrayidx151, align 1, !tbaa !1 - %conv152 = zext i8 %tmp45 to i64 - br label %while.body109 - - while.body109: ; preds = %while.body109, %while.body109.preheader - %adler.addr.5373 = phi i64 [ %add153, %while.body109 ], [ undef, %while.body109.preheader ] - %add153 = add i64 %adler.addr.5373, %conv152 - br label %while.body109 - - if.end188: ; preds = %while.end103 - %shl189 = shl nuw nsw i64 %rem102, 16 - %or190 = or i64 %shl189, %rem101 - br label %cleanup - - cleanup: ; preds = %if.end188, %while.cond.preheader, %if.then - %retval.0 = phi i64 [ %or, %if.then ], [ %or28, %while.cond.preheader ], [ %or190, %if.end188 ] - ret i64 %retval.0 - } - - ; Function Attrs: nounwind - declare void @llvm.ppc.mtctr.i64(i64) #1 - - ; Function Attrs: nounwind - declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #1 - - ; Function Attrs: nounwind - declare void @llvm.stackprotector(i8*, i8**) #1 - - attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { nounwind } - - !llvm.ident = !{!0} - - !0 = !{!"clang version 5.0.0 "} - !1 = !{!2, !2, i64 0} - !2 = !{!"omnipotent char", !3, i64 0} - !3 = !{!"Simple C/C++ TBAA"} - -... 
---- -name: adler32_z -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '%x3' } - - { reg: '%x4' } - - { reg: '%x5' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -fixedStack: - - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' } - - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' } - - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false } -body: | - bb.0.entry: - successors: %bb.1.if.then(0x40000000), %bb.3.if.end15(0x40000000) - liveins: %x3, %x4, %x5, %x29, %x30 - - %x6 = RLWINM8 %x3, 16, 16, 31 - %x3 = RLDICL killed %x3, 0, 48 - BC undef %cr5lt, %bb.3.if.end15 - - bb.1.if.then: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = ADD8 %x3, killed %x6 - - bb.2.if.then: - liveins: %lr8, %rm, %x3, %x4 - - %x4 = RLDICR killed %x4, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - - bb.3.if.end15: - successors: %bb.6.while.cond.preheader(0x40000000), %bb.4.while.cond30.preheader(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BC undef %cr5lt, %bb.6.while.cond.preheader - - bb.4.while.cond30.preheader: - successors: %bb.7.while.body33.preheader(0x40000000), %bb.5(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BCn undef %cr5lt, %bb.7.while.body33.preheader - - bb.5: - successors: %bb.12.while.body109.preheader(0x80000000) - liveins: %x4, %x29, %x30 - - %x7 = OR8 %x4, killed %x4 - B %bb.12.while.body109.preheader - - bb.6.while.cond.preheader: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = LIS8 15 - %x4 = ORI8 killed %x4, 225 - %x4 = RLDICR killed %x4, 32, 31 - %x4 = ORIS8 killed %x4, 3375 - %x4 = ORI8 killed %x4, 50637 - %x4 = MULHDU %x6, killed %x4 - %x5 = SUBF8 %x4, %x6 - %x5 = RLDICL killed %x5, 63, 1 - %x4 = ADD8 killed %x5, killed %x4 - %x5 = LI8 0 - %x4 = RLDICL killed %x4, 49, 15 - %x5 = ORI8 killed %x5, 65521 - %x4 = MULLD killed %x4, killed %x5 - %x4 = SUBF8 killed %x4, killed %x6 - B %bb.2.if.then - - bb.7.while.body33.preheader: - successors: %bb.8.while.body33(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1) - STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16) - %x7 = LIS8 15 - %x7 = ORI8 killed %x7, 225 - %x7 = RLDICR killed %x7, 32, 31 - %x8 = LI8 0 - %x7 = ORIS8 killed %x7, 3375 - %x9 = LI8 347 - %x10 = ORI8 killed %x7, 50637 - %x11 = ORI8 %x8, 65521 - %x7 = OR8 %x4, %x4 - - bb.8.while.body33: - successors: %bb.9.do.body(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULLI8 %x8, 5552 - %x12 = ADD8 %x4, killed %x12 - %x12 = ADDI8 killed %x12, -13 - %x5 = ADDI8 killed %x5, -5552 - MTCTR8loop %x9, implicit-def dead %ctr8 - - bb.9.do.body: - successors: %bb.9.do.body(0x7c000000), %bb.10.do.end(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12 - - %x0, %x12 = LBZU8 16, killed %x12 :: (load 1 from %ir.tmp15.inc, !tbaa !1) - %x6 = ADD8 %x3, killed %x6 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x3 = ADD8 killed %x3, killed 
%x0 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - BDNZ8 %bb.9.do.body, implicit-def %ctr8, implicit %ctr8 - - bb.10.do.end: - successors: %bb.8.while.body33(0x7c000000), %bb.11.while.end103(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULHDU %x3, %x10 - %x0 = MULHDU %x6, %x10 - %x30 = SUBF8 %x12, %x3 - %x29 = SUBF8 %x0, %x6 - %x30 = RLDICL killed %x30, 63, 1 - %x29 = RLDICL killed %x29, 63, 1 - %x12 = ADD8 killed %x30, killed %x12 - %x0 = ADD8 killed %x29, killed %x0 - %cr0 = CMPLDI %x5, 5551 - %x12 = RLDICL killed %x12, 49, 15 - %x0 = RLDICL killed %x0, 49, 15 - %x12 = MULLD killed %x12, %x11 - %x0 = MULLD killed %x0, %x11 - %x7 = ADDI8 killed %x7, 5552 - %x3 = SUBF8 killed %x12, killed %x3 - %x6 = SUBF8 killed %x0, killed %x6 - %x8 = ADDI8 killed %x8, 1 - BCC 44, killed %cr0, %bb.8.while.body33 - - bb.11.while.end103: - successors: %bb.14.if.end188(0x40000000), %bb.12.while.body109.preheader(0x40000000) - liveins: %x3, %x6, %x7 - - %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16) - %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1) - BC undef %cr5lt, %bb.14.if.end188 - - bb.12.while.body109.preheader: - successors: %bb.13.while.body109(0x80000000) - liveins: %x7, %x29, %x30 - - %x3 = LBZ8 10, killed %x7 :: (load 1 from %ir.arrayidx151, !tbaa !1) - %x4 = IMPLICIT_DEF - - bb.13.while.body109: - successors: %bb.13.while.body109(0x80000000) - liveins: %x3, %x4, %x29, %x30 - - %x4 = ADD8 killed %x4, %x3 - B %bb.13.while.body109 - - bb.14.if.end188: - liveins: %x3, %x6, %x29, %x30 - - %x4 = RLDICR killed %x6, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - -... 
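The testCompares*.ll files added below all pin the same branch-free lowering of icmp eq i64: xor the operands, cntlzd the result, then rldicl r3, r3, 58, 63 (a right shift by 6). That works because cntlzd returns 64, the only value with bit 6 set, exactly when the xor is zero; the sext variants use addic/subfe, where adding -1 carries iff the xor is nonzero. A C++20 sketch of both idioms (function names are illustrative only):

#include <bit>
#include <cstdint>

// zext(i1 (a == b)): countl_zero(0) is 64, and 64 is the only possible
// result with bit 6 set, so shifting right by 6 extracts the equality bit.
uint64_t eq_zext(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>(std::countl_zero(a ^ b)) >> 6;
}

// sext(i1 (a == b)): addic x, x, -1 sets the carry iff x != 0, and
// subfe x, x, x then computes carry - 1: 0 when unequal, ~0 when equal.
uint64_t eq_sext(uint64_t a, uint64_t b) {
  uint64_t x = a ^ b;
  uint64_t carry = (x != 0);
  return carry - 1;
}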
diff --git a/test/CodeGen/PowerPC/testComparesieqsll.ll b/test/CodeGen/PowerPC/testComparesieqsll.ll new file mode 100644 index 000000000000..57c7365eff03 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; 
CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequll.ll b/test/CodeGen/PowerPC/testComparesiequll.ll new file mode 100644 index 000000000000..c28929071845 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_z(i64 %a) { +; CHECK-LABEL: test_iequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext_z(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext_store: +; CHECK: # BB#0: # %entry +; 
CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsll.ll b/test/CodeGen/PowerPC/testCompareslleqsll.ll new file mode 100644 index 000000000000..4797ddfbfe97 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_store(i64 %a, i64 %b) { +; 
CHECK-LABEL: test_lleqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequll.ll b/test/CodeGen/PowerPC/testComparesllequll.ll new file mode 100644 index 000000000000..4dc7be69d2c8 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_z(i64 %a) { +; CHECK-LABEL: test_llequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext_z(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/vec_xxpermdi.ll b/test/CodeGen/PowerPC/vec_xxpermdi.ll new file mode 100644 index 000000000000..9be2a1864a04 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_xxpermdi.ll @@ -0,0 +1,307 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-BE + +; Possible LE ShuffleVector masks (Case 1): +; ShuffleVector((vector double)a, (vector double)b, 3, 1) +; ShuffleVector((vector double)a, (vector double)b, 2, 1) +; ShuffleVector((vector double)a, (vector double)b, 3, 0) +; ShuffleVector((vector double)a, (vector double)b, 2, 0) +; which targets at: +; xxpermdi a, b, 0 +; xxpermdi a, b, 1 +; xxpermdi a, b, 2 +; xxpermdi a, b, 3 +; Possible LE Swap ShuffleVector masks (Case 2): +; ShuffleVector((vector double)a, (vector double)b, 1, 3) +; ShuffleVector((vector double)a, (vector double)b, 0, 3) +; ShuffleVector((vector double)a, (vector double)b, 1, 2) +; ShuffleVector((vector double)a, 
(vector double)b, 0, 2)
+; which targets at:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible LE ShuffleVector masks when a == b, b is undef (Case 3):
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; which targets at:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
+
+; Possible BE ShuffleVector masks (Case 4):
+; ShuffleVector((vector double)a, (vector double)b, 0, 2)
+; ShuffleVector((vector double)a, (vector double)b, 0, 3)
+; ShuffleVector((vector double)a, (vector double)b, 1, 2)
+; ShuffleVector((vector double)a, (vector double)b, 1, 3)
+; which targets at:
+; xxpermdi a, b, 0
+; xxpermdi a, b, 1
+; xxpermdi a, b, 2
+; xxpermdi a, b, 3
+; Possible BE Swap ShuffleVector masks (Case 5):
+; ShuffleVector((vector double)a, (vector double)b, 2, 0)
+; ShuffleVector((vector double)a, (vector double)b, 3, 0)
+; ShuffleVector((vector double)a, (vector double)b, 2, 1)
+; ShuffleVector((vector double)a, (vector double)b, 3, 1)
+; which targets at:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible BE ShuffleVector masks when a == b, b is undef (Case 6):
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; which targets at:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 34, 35, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
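+; Worked example for the Case 2 table (a sketch, assuming the usual xxpermdi
+; semantics: the high bit of the 2-bit immediate selects which doubleword of
+; the first source lands in DW0 of the result, the low bit which doubleword
+; of the second source lands in DW1). In
+; @test_le_swap_vec_xxpermdi_v2f64_v2f64_1 above, the mask <i32 0, i32 3>
+; asks for element 0 of %VA and element 1 of %VB; in little-endian mode
+; element 1 of a v2f64 lives in DW0 of the VSX register, so the result is
+; { DW0 = VB.DW0, DW1 = VA.DW1 }, i.e. exactly xxpermdi vB, vA, 1, matching
+; the CHECK line.
+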
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 35, 34, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_0
+; CHECK-LE: xxspltd 34, 34, 0
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_2
+; CHECK-LE: xxswapd 34, 34
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_3
+; CHECK-LE: xxspltd 34, 34, 1
+; CHECK-LE: blr
+}
+
+; Start testing BE
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 34, 35, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 35, 34, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 35, 34, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_0
+; CHECK-BE: xxspltd 34, 34, 0
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_2
+; CHECK-BE: xxswapd 34, 34
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_3
+; CHECK-BE: xxspltd 34, 34, 1
+; CHECK-BE: blr
+}
+
+; More test cases to test different types of vector inputs
+define <16 x i8> @test_be_vec_xxpermdi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) {
+  entry:
+    %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+    ret <16 x i8> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v16i8_v16i8
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <8 x i16> @test_le_swap_vec_xxpermdi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) {
+  entry:
+    %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+    ret <8 x i16> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v8i16_v8i16
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @test_le_swap_vec_xxpermdi_v4i32_v4i32(<4 x i32> %VA, <4 x i32> %VB) {
+  entry:
+    %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB,<4 x i32> <i32 0, i32 1, i32 6, i32 7>
+    ret <4 x i32> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v4i32_v4i32
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
diff --git a/test/CodeGen/Thumb2/tbb-removeadd.mir b/test/CodeGen/Thumb2/tbb-removeadd.mir
index 89ed98720539..106066791343 100644
--- a/test/CodeGen/Thumb2/tbb-removeadd.mir
+++ b/test/CodeGen/Thumb2/tbb-removeadd.mir
@@ -39,7 +39,6 @@ name: Func
 alignment: 1
 exposesReturnsTwice: false
-noVRegs: true
 legalized: false
 regBankSelected: false
 selected: false
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 4ec703921e29..24aa5b98d0bb 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
 ; CHECK: mulss
 ; CHECK: mulss
-; CHECK: mulss
+; CHECK: addss
 ; CHECK: mulss
 ; CHECK: addss
-; CHECK: addss
+; CHECK: mulss
 ; CHECK: addss
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll 
b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll index bc394f6e156f..6c60aed67a7b 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll @@ -5,7 +5,6 @@ define void @test_void_return() { ; CHECK-LABEL: name: test_void_return ; CHECK: alignment: 4 ; CHECK-NEXT: exposesReturnsTwice: false -; CHECK-NEXT: noVRegs: false ; CHECK-NEXT: legalized: false ; CHECK-NEXT: regBankSelected: false ; CHECK-NEXT: selected: false diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll index b9f7fc68cf68..ad82b8cfb775 100644 --- a/test/CodeGen/X86/add-of-carry.ll +++ b/test/CodeGen/X86/add-of-carry.ll @@ -9,9 +9,11 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp { ; CHECK-LABEL: test1: ; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: adcl %ecx, %eax ; CHECK-NEXT: retl %add4 = add i32 %x, %sum %cmp = icmp ult i32 %add4, %x diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll index 3f4ee362e230..3c84af4aa9ec 100644 --- a/test/CodeGen/X86/addcarry.ll +++ b/test/CodeGen/X86/addcarry.ll @@ -86,21 +86,14 @@ entry: define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) { ; CHECK-LABEL: pr31719: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: addq 8(%rsi), %rcx -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: addq 16(%rsi), %r8 -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 24(%rsi), %r9 ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: adcq $0, %rcx -; CHECK-NEXT: adcq %r8, %r10 -; CHECK-NEXT: adcq %r9, %rax +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: adcq 24(%rsi), %r9 ; CHECK-NEXT: movq %rdx, (%rdi) ; CHECK-NEXT: movq %rcx, 8(%rdi) -; CHECK-NEXT: movq %r10, 16(%rdi) -; CHECK-NEXT: movq %rax, 24(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq entry: @@ -190,9 +183,9 @@ entry: define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) { ; CHECK-LABEL: shiftadd: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: leaq (%rdx,%rcx), %rax ; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: adcq $0, %rax +; CHECK-NEXT: adcq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: retq entry: %0 = zext i64 %a to i128 diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 2aaf14001758..aa28ef5175ed 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 
= xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE2-NEXT: paddd %xmm6, %xmm9 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa 
{{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = 
xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, %xmm14 +; 
SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = 
xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 
# 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm0, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm11 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: 
packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10
@@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
 ; SSE2-LABEL: avg_v16i16:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
 ; SSE2-NEXT: movdqa (%rsi), %xmm0
 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
 ; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
 ; SSE2-NEXT: paddd %xmm4, %xmm3
 ; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; SSE2-LABEL: avg_v32i16:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
 ; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
 ; SSE2-NEXT: movdqa %xmm8, %xmm13
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
 ; SSE2-NEXT: movdqa %xmm2, %xmm5
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
 ; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
 ; SSE2-NEXT: paddd %xmm0, %xmm6
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: paddd %xmm0, %xmm5
 ; SSE2-NEXT: paddd %xmm0, %xmm2
 ; SSE2-NEXT: paddd %xmm0, %xmm4
 ; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
 ; SSE2-NEXT: psrld $1, %xmm7
 ; SSE2-NEXT: pslld $16, %xmm7
 ; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
 ; SSE2-NEXT: pslld $16, %xmm6
 ; SSE2-NEXT: psrad $16, %xmm6
 ; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
 ; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
 ; SSE2-NEXT: pslld $16, %xmm5
 ; SSE2-NEXT: psrad $16, %xmm5
 ; SSE2-NEXT: pslld $16, %xmm2
 ; SSE2-NEXT: psrad $16, %xmm2
 ; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
 ; SSE2-NEXT: pslld $16, %xmm4
 ; SSE2-NEXT: psrad $16, %xmm4
 ; SSE2-NEXT: pslld $16, %xmm3
@@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; SSE2-NEXT: movdqu %xmm3, (%rax)
 ; SSE2-NEXT: movdqu %xmm2, (%rax)
 ; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; AVX2-LABEL: avg_v32i16:
@@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -884,9 +867,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
@@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 ; SSE2-LABEL: avg_v32i8_2:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
 ; SSE2-NEXT: movdqa (%rsi), %xmm0
 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
 ; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm6, %xmm9
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm7, %xmm3
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm10, %xmm7
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: paddd %xmm9, %xmm13
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: paddd %xmm14, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm3
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm9
 ; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm13
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm7
 ; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
 ; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm4, %xmm13
+; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm7
 ; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
 ; SSE2-NEXT: movdqu %xmm1, (%rax)
 ; SSE2-NEXT: movdqu %xmm0, (%rax)
 ; SSE2-NEXT: retq
@@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
 ; SSE2-LABEL: avg_v16i16_2:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
 ; SSE2-NEXT: movdqa (%rsi), %xmm0
 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
 ; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
 ; SSE2-NEXT: paddd %xmm4, %xmm3
 ; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; SSE2-LABEL: avg_v32i16_2:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
 ; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
 ; SSE2-NEXT: movdqa %xmm8, %xmm13
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
 ; SSE2-NEXT: movdqa %xmm1, %xmm6
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
 ; SSE2-NEXT: movdqa %xmm2, %xmm5
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
 ; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
 ; SSE2-NEXT: paddd %xmm0, %xmm6
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: paddd %xmm0, %xmm5
 ; SSE2-NEXT: paddd %xmm0, %xmm2
 ; SSE2-NEXT: paddd %xmm0, %xmm4
 ; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
 ; SSE2-NEXT: psrld $1, %xmm7
 ; SSE2-NEXT: pslld $16, %xmm7
 ; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
 ; SSE2-NEXT: pslld $16, %xmm6
 ; SSE2-NEXT: psrad $16, %xmm6
 ; SSE2-NEXT: pslld $16, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm1
 ; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
 ; SSE2-NEXT: pslld $16, %xmm5
 ; SSE2-NEXT: psrad $16, %xmm5
 ; SSE2-NEXT: pslld $16, %xmm2
 ; SSE2-NEXT: psrad $16, %xmm2
 ; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
 ; SSE2-NEXT: pslld $16, %xmm4
 ; SSE2-NEXT: psrad $16, %xmm4
 ; SSE2-NEXT: pslld $16, %xmm3
@@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; SSE2-NEXT: movdqu %xmm3, (%rax)
 ; SSE2-NEXT: movdqu %xmm2, (%rax)
 ; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
 ; SSE2-NEXT: retq
 ;
 ; AVX2-LABEL: avg_v32i16_2:
@@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 341dd867e4ff..647b7a8f4dfc 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -113,9 +113,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; CHECK-NOT: mov
 ; CHECK: insertps $48
 ; CHECK: insertps $48
-; CHECK: insertps $48
-; CHECK: insertps $48
 ; CHECK: vaddps
+; CHECK: insertps $48
+; CHECK: insertps $48
 ; CHECK: vaddps
 ; CHECK: vaddps
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index 63b0281a7339..e29cf09718ad 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3
 ; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: korw %k3, %k2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
+; CHECK-NEXT: korw %k2, %k1, %k1
 ; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, %eax
 ; CHECK-NEXT: # kill: %AX %AX %EAX
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 4890afec2164..c03623a2f035 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b
 ; CHECK-NEXT: kxorw %k0, %k0, %k1
 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: movw $1, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
-; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
 ; CHECK-NEXT: movw $220, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
 %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 32da0a70218e..431223611fae 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float>
 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
@@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double
 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32>
 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
@@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x
 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
@@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16
 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
 ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
@@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6
 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
 %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x
 ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
@@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8
 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1,
 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
 ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3)
@@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
 %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
@@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
 %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <
 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
@@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
 %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 563cad04b8c2..b04c1ab38e55 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -479,11 +479,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
 ; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
 ; CHECK-NEXT: retq
 %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
@@ -498,11 +498,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2si %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
 ; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
 ; CHECK-NEXT: retq
 %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
@@ -517,11 +517,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2usi %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
 ; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
 ; CHECK-NEXT: retq
 %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
@@ -536,11 +536,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si64:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %rcx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
 ; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
 ; CHECK-NEXT: retq
 %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
@@ -555,11 +555,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -574,11 +574,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -593,11 +593,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -612,11 +612,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -685,8 +685,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
 %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -4398,8 +4399,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4418,8 +4419,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4520,9 +4521,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
 ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4543,9 +4544,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovapd %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4612,9 +4613,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovaps %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c3..96aefdb10584 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT: Lcfi0:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT: Lcfi1:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT: Lcfi2:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT: Lcfi3:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %ymm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-LABEL: test_64i1:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
 ; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
 %cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc2..faa055dfbbf3 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 51f9a382ccbf..ca01033bf78b 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 7df07b0413ed..571f345d4616 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
 ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd..f8f47c87100a 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107..96254f7c95b0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index cf79819734a2..636358fb91cb 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -39,8 +39,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 06ee237593e7..d54208c00987 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -404,8 +404,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -424,8 +424,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf51..595b3e0ebb86 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c2031..1bfdfd0e634d 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
 ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
 ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e..9659dc6d455a 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 3ca686cef3bf..b2fe6eba88ab 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
@@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
@@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 4d906a4fd29a..c2d8df6476b3 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x
 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
 ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x
 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
 ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x
 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
 ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
@@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double
 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
 ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
 ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
 ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
 ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
 ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
 ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
 ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
 ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
 ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
 %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
 %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
 ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
 ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
 %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
 %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 1f324d679564..684b0468cf51 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
 ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
 ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
 ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
 ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
 ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
 ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
 ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
 ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
 ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
 ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
 ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
 ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
 ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
 ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
 ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
 ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
 ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
 ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
 ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
 %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index a681c3b0aa42..092b139fca2f 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -6,68 +6,35 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
 
 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
-; SSE2-SSSE3-LABEL: v8i16:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL %AL %EAX
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw
%xmm1, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i16: ; AVX12: ## BB#0: ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrw $7, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $6, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $5, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $4, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: @@ -90,22 +57,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i32: @@ -113,19 +66,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: @@ -149,22 +91,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 ; 
SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movd %xmm3, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4f32: @@ -172,19 +100,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) ; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1 ; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: @@ -208,56 +125,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; 
SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-SSSE3-NEXT: andb $1, %cl -; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: ## kill: %AX %AX %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v16i8: @@ -265,55 +134,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrb $15, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $14, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $13, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $12, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $11, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $10, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $9, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $8, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $7, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $6, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $5, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $4, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $3, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $2, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $1, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrb $0, %xmm0, %eax -; AVX12-NEXT: andb $1, %al -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AX %AX %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: @@ -383,14 +205,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: @@ -405,26 
+221,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: @@ -439,26 +250,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i8: @@ -537,14 +343,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: @@ -559,26 +359,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: @@ -593,26 +388,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i16: @@ -683,14 +473,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; 
SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -703,24 +487,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: @@ -733,24 +512,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: @@ -801,14 +575,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2i64: @@ -816,13 +584,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrq $1, %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovq %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: @@ -846,14 +609,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movq %xmm3, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2f64: @@ -861,13 +618,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> ; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1 ; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrq $1, %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovq %xmm0, %rax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: @@ -892,29 +644,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; SSE2-SSSE3-NEXT: psrad $24, %xmm3 ; SSE2-SSSE3-NEXT: pslld $24, %xmm2 ; SSE2-SSSE3-NEXT: psrad $24, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pslld $24, %xmm1 ; SSE2-SSSE3-NEXT: psrad $24, %xmm1 ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl 
$1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: @@ -923,26 +661,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: @@ -975,29 +702,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; SSE2-SSSE3-NEXT: psrad $16, %xmm3 ; SSE2-SSSE3-NEXT: pslld $16, %xmm2 ; SSE2-SSSE3-NEXT: psrad $16, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pslld $16, %xmm1 ; SSE2-SSSE3-NEXT: psrad $16, %xmm1 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: @@ -1006,26 +719,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $16, %xmm2, %xmm2 ; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrd $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; 
AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrd $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: @@ -1052,45 +754,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { -; SSE2-SSSE3-LABEL: v8i8: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm3 -; SSE2-SSSE3-NEXT: psraw $8, %xmm3 -; SSE2-SSSE3-NEXT: psllw $8, %xmm2 -; SSE2-SSSE3-NEXT: psraw $8, %xmm2 -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm2, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i8: +; SSE2: ## BB#0: +; SSE2-NEXT: psllw $8, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: psllw $8, %xmm2 +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i8: +; SSSE3: ## BB#0: +; SSSE3-NEXT: psllw $8, %xmm3 +; SSSE3-NEXT: psraw $8, %xmm3 +; SSSE3-NEXT: psllw $8, %xmm2 +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i8: ; AVX12: ## BB#0: @@ -1098,38 +797,16 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, 
%xmm2 ; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpextrw $7, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $6, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $5, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $4, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $3, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $2, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vpextrw $1, %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: vmovd %xmm0, %eax -; AVX12-NEXT: andl $1, %eax -; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index 06b1a76f6bae..a6d6ca155302 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -1,8 +1,83 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512 define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { +; SSE2-SSSE3-LABEL: v4i64: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; 
SSE2-SSSE3-NEXT: por %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v4i64: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v4i64: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 @@ -12,19 +87,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -45,30 +109,36 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { } define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { -; AVX2-LABEL: v4f64: -; AVX2: ## BB#0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl 
$1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-SSSE3-LABEL: v4f64: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: cmpltpd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: cmpltpd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE2-SSSE3-NEXT: pslld $31, %xmm6 +; SSE2-SSSE3-NEXT: psrad $31, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: v4f64: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f64: ; AVX512: ## BB#0: @@ -87,6 +157,78 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> } define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { +; SSE2-LABEL: v16i16: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtw %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: packuswb %xmm5, %xmm4 +; SSE2-NEXT: psllw $7, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AX %AX %EAX +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i16: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm7, %xmm5 +; SSSE3-NEXT: pshufb %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: psllw $7, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; 
SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSSE3-NEXT: ## kill: %AX %AX %EAX +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AX %AX %EAX +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v16i16: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 @@ -96,55 +238,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -164,6 +259,79 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE2-LABEL: v8i32: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psllw $15, %xmm0 +; SSSE3-NEXT: psraw $15, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufb %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: psllw $15, %xmm4 +; SSSE3-NEXT: psraw $15, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v8i32: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 @@ -173,31 +341,9 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb 
%al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -217,42 +363,74 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { } define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; AVX2-LABEL: v8f32: -; AVX2: ## BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: v8f32: +; SSE2: ## BB#0: +; SSE2-NEXT: cmpltps %xmm1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm0, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: cmpltps %xmm5, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm4, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8f32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: cmpltps %xmm1, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: cmpltps %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: psllw $15, %xmm2 +; SSSE3-NEXT: psraw $15, %xmm2 +; SSSE3-NEXT: cmpltps %xmm5, %xmm7 +; SSSE3-NEXT: pshufb %xmm1, %xmm7 +; SSSE3-NEXT: cmpltps %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, 
%xmm6 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSSE3-NEXT: psllw $15, %xmm6 +; SSSE3-NEXT: psraw $15, %xmm6 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm6, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8f32: ; AVX512: ## BB#0: @@ -270,121 +448,250 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) } define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { +; SSE2-SSSE3-LABEL: v32i8: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb 
%al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: shll $16, %ecx +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: ## BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: 
vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; 
AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index d1508f99fc71..9bf7b41a4f26 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -1,69 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s 
--check-prefixes=CHECK,SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: v8i16: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i16: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i16: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: ; AVX512: ## BB#0: @@ -80,41 
+46,16 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-SSSE3-LABEL: v4i32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i32: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i32: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: ; AVX512: ## BB#0: @@ -132,42 +73,16 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; SSE2-SSSE3-LABEL: v4f32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4f32: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vextractps $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $0, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: ; AVX512: 
## BB#0: @@ -185,111 +100,16 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-SSSE3-LABEL: v16i8: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-SSSE3-NEXT: andb $1, %cl -; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AX %AX %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v16i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $11, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: andb 
$1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; AVX1-NEXT: retq +; AVX12-LABEL: v16i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AX %AX %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: ; AVX512: ## BB#0: @@ -330,14 +150,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: @@ -353,15 +167,27 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i8: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i8: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1 @@ -406,14 +232,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; 
SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: @@ -429,15 +249,27 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i16: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 @@ -478,14 +310,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -499,15 +325,25 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL %AL %EAX ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i32: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i32: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 @@ -538,27 +374,16 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, 
%rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i64: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2i64: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: ; AVX512: ## BB#0: @@ -576,27 +401,16 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; SSE2-SSSE3-LABEL: v2f64: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2f64: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2f64: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: ; AVX512: ## BB#0: @@ -618,45 +432,20 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, 
%eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: ; AVX512: ## BB#0: @@ -682,45 +471,20 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL %AL %EAX ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: ; AVX512: ## BB#0: @@ -739,73 +503,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { -; SSE2-SSSE3-LABEL: v8i8: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; 
SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i8: +; SSE2: ## BB#0: +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL %AL %EAX +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i8: +; SSSE3: ## BB#0: +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL %AL %EAX +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL %AL %EAX +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: ; AVX512: ## BB#0: diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index 51c6ad7c7f9e..b2c619c48d4d 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -8,55 +8,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AX %AX %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -76,33 +29,8 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -122,33 +50,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { ; AVX2-LABEL: v8f32: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, 
%eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -167,117 +70,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2-LABEL: v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, 
%al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -296,21 +90,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX2-LABEL: v4i64: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -331,21 +112,8 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX2-LABEL: v4f64: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: ## kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll index a9c74df9d0d9..1340b7662a7a 100644 --- a/test/CodeGen/X86/bswap_tree2.ll +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl 
%eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/test/CodeGen/X86/eh-unknown.ll b/test/CodeGen/X86/eh-unknown.ll new file mode 100644 index 000000000000..7c495bdadc67 --- /dev/null +++ b/test/CodeGen/X86/eh-unknown.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s + +; An unknown personality forces us to emit an Itanium LSDA. Make sure that the +; Itanium call site table actually tells the personality to keep unwinding, +; i.e. we have an entry and it says "has no landing pad". + +declare void @throwit() +declare void @__unknown_ehpersonality(...) + +define void @use_unknown_ehpersonality() + personality void (...)* @__unknown_ehpersonality { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_unknown_ehpersonality: +; CHECK: .Lfunc_begin0: +; CHECK: .seh_handler __unknown_ehpersonality, @unwind, @except +; CHECK: callq throwit +; CHECK: .Lfunc_end0: +; CHECK: .seh_handlerdata +; CHECK: .Lexception0: +; CHECK: .byte 255 # @LPStart Encoding = omit +; CHECK: .byte 0 # @TType Encoding = absptr +; CHECK: .asciz "\217\200" # @TType base offset +; CHECK: .byte 3 # Call site Encoding = udata4 +; CHECK: .byte 13 # Call site table length +; CHECK: .long .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << +; CHECK: .long .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0 +; CHECK: .long 0 # has no landing pad +; CHECK: .byte 0 # On action: cleanup diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll index bd8888966cf2..338a95f6a80c 100644 --- a/test/CodeGen/X86/fmsubadd-combine.ll +++ b/test/CodeGen/X86/fmsubadd-combine.ll @@ -117,9 +117,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2 ; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3 ; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd512: @@ -137,9 +137,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2 ; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3 ; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA4-NEXT: retq entry: %AB = fmul <8 x double> %A, %B @@ -157,9 +157,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2 ; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3 ; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1 +; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_ps512: @@ -178,9 +178,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2 ; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3 ; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1 +; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA4-NEXT: retq entry: %AB = fmul <16 x float> %A, %B diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index d68236e9d250..eb06eb75a4d7 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 98082ec611d4..6c6bc8bdc1d1 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/gnu-seh-nolpads.ll b/test/CodeGen/X86/gnu-seh-nolpads.ll new file mode 100644 index 000000000000..311f4d522b1d --- /dev/null +++ b/test/CodeGen/X86/gnu-seh-nolpads.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=x86_64-windows-gnu < %s | FileCheck %s + +declare void @throwit() +declare void @__gxx_personality_seh0(...) +declare void @__gcc_personality_seh0(...) 
+ +define void @use_gxx_seh() + personality void (...)* @__gxx_personality_seh0 { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_gxx_seh: +; CHECK: .seh_proc use_gxx_seh +; CHECK-NOT: .seh_handler __gxx_personality_seh0 +; CHECK: callq throwit +; CHECK: .seh_handlerdata +; CHECK: .seh_endproc + +define void @use_gcc_seh() + personality void (...)* @__gcc_personality_seh0 { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_gcc_seh: +; CHECK: .seh_proc use_gcc_seh +; CHECK-NOT: .seh_handler __gcc_personality_seh0 +; CHECK: callq throwit +; CHECK: .seh_handlerdata +; CHECK: .seh_endproc + diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir index d0ba057fa009..b05c4467d309 100644 --- a/test/CodeGen/X86/implicit-null-checks.mir +++ b/test/CodeGen/X86/implicit-null-checks.mir @@ -379,7 +379,7 @@ liveins: - { reg: '%esi' } # CHECK: bb.0.entry: # CHECK: %eax = MOV32ri 2200000 -# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) +# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %eax, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x) # CHECK-NEXT: JMP_1 %bb.1.not_null body: | @@ -544,7 +544,7 @@ liveins: - { reg: '%rsi' } # CHECK: bb.0.entry: # CHECK: %rbx = MOV64rr %rdx -# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) +# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %rbx, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x) body: | bb.0.entry: @@ -656,7 +656,7 @@ body: | name: use_alternate_load_op # CHECK-LABEL: name: use_alternate_load_op # CHECK: bb.0.entry: -# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ +# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -689,7 +689,7 @@ body: | name: imp_null_check_gep_load_with_use_dep # CHECK-LABEL: name: imp_null_check_gep_load_with_use_dep # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -721,7 +721,7 @@ name: imp_null_check_load_with_base_sep # CHECK-LABEL: name: imp_null_check_load_with_base_sep # CHECK: bb.0.entry: # CHECK: %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags -# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags +# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %esi, %rdi, 1, _, 0, _, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -752,7 +752,7 @@ body: | name: inc_store # CHECK-LABEL: name: inc_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %rsi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -782,7 +782,7 @@ body: | name: inc_store_plus_offset # CHECK-LABEL: inc_store_plus_offset # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %rsi +# CHECK: _ = 
FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -813,7 +813,7 @@ name: inc_store_with_dep # CHECK-LABEL: inc_store_with_dep # CHECK: bb.0.entry: # CHECK: %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags -# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -972,7 +972,7 @@ body: | name: inc_store_with_reused_base # CHECK-LABEL: inc_store_with_reused_base # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1174,7 +1174,7 @@ body: | name: inc_store_with_load_and_store # CHECK-LABEL: inc_store_with_load_and_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags +# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %esi, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1205,7 +1205,7 @@ body: | name: inc_store_and_load_no_alias # CHECK-LABEL: inc_store_and_load_no_alias # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 000000000000..a9cf086dbd90 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range.
+ +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f..af86df510016 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; 
SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index e62a1d04dad6..94bbe75702cb 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll index 6d2465ddd3a8..e3e2737cf3e6 100644 --- a/test/CodeGen/X86/mul-constant-i16.ll +++ b/test/CodeGen/X86/mul-constant-i16.ll @@ -188,16 +188,13 @@ define i16 @test_mul_by_11(i16 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: imull $11, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_11: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,2), %eax +; X64-NEXT: imull $11, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 11 @@ -228,16 +225,13 @@ define i16 @test_mul_by_13(i16 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $13, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_13: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: imull $13, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 13 @@ -247,19 +241,14 @@ define i16 @test_mul_by_13(i16 %x) { define i16 @test_mul_by_14(i16 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $14, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_14: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $14, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 14 @@ -349,19 +338,14 @@ define i16 @test_mul_by_19(i16 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; 
X86-NEXT: shll $2, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $19, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_19: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: shll $2, %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 19 ret i16 %mul @@ -391,16 +375,13 @@ define i16 @test_mul_by_21(i16 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $21, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_21: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: imull $21, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 21 @@ -410,19 +391,14 @@ define i16 @test_mul_by_21(i16 %x) { define i16 @test_mul_by_22(i16 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $22, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_22: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $22, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 22 @@ -433,19 +409,14 @@ define i16 @test_mul_by_23(i16 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $23, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_23: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: shll $3, %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 23 ret i16 %mul @@ -495,19 +466,14 @@ define i16 @test_mul_by_26(i16 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $26, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_26: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 26 ret i16 %mul @@ -536,19 +502,14 @@ define i16 @test_mul_by_27(i16 %x) { define i16 @test_mul_by_28(i16 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $28, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_28: ; X64: # BB#0: -; 
X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $28, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 28 @@ -558,21 +519,14 @@ define i16 @test_mul_by_28(i16 %x) { define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $29, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_29: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $29, %edi, %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 @@ -583,22 +537,14 @@ define i16 @test_mul_by_30(i16 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: imull $30, %eax, %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_30: ; X64: # BB#0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $5, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: subl %ecx, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 30 ret i16 %mul @@ -641,30 +587,3 @@ define i16 @test_mul_by_32(i16 %x) { %mul = mul nsw i16 %x, 32 ret i16 %mul } - -; (x*9+42)*(x*5+2) -define i16 @test_mul_spec(i16 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: # kill: %AX %AX %EAX -; X86-NEXT: retl -; -; X64-LABEL: test_mul_spec: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI %EDI %RDI -; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx -; X64-NEXT: leal 2(%rdi,%rdi,4), %eax -; X64-NEXT: imull %ecx, %eax -; X64-NEXT: # kill: %AX %AX %EAX -; X64-NEXT: retq - %mul = mul nsw i16 %x, 9 - %add = add nsw i16 %mul, 42 - %mul2 = mul nsw i16 %x, 5 - %add2 = add nsw i16 %mul2, 2 - %mul3 = mul nsw i16 %add, %add2 - ret i16 %mul3 -} diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll index b1e9a929b7f2..76e46e1f1b09 100644 --- a/test/CodeGen/X86/mul-constant-i32.ll +++ b/test/CodeGen/X86/mul-constant-i32.ll @@ -1,12 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG -; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck 
%s --check-prefix=JAG-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 define i32 @test_mul_by_1(i32 %x) { ; X86-LABEL: test_mul_by_1: @@ -14,40 +8,10 @@ define i32 @test_mul_by_1(i32 %x) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_1: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_1: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_1: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_1: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_1: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_1: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_1: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_1: +; X64: # BB#0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 1 ret i32 %mul } @@ -59,47 +23,11 @@ define i32 @test_mul_by_2(i32 %x) { ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_2: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_2: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_2: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: addl %eax, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_2: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_2: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 2 ret i32 %mul } @@ -110,46 +38,11 @@ define i32 @test_mul_by_3(i32 %x) { ; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; 
X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 3 ret i32 %mul } @@ -161,47 +54,11 @@ define i32 @test_mul_by_4(i32 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 4 ret i32 %mul } @@ -212,46 +69,11 @@ define i32 @test_mul_by_5(i32 %x) { ; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; 
X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 5 ret i32 %mul } @@ -264,46 +86,12 @@ define i32 @test_mul_by_6(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 6 ret i32 %mul } @@ -316,46 +104,12 @@ define i32 @test_mul_by_7(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: 
test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 7 ret i32 %mul } @@ -367,47 +121,11 @@ define i32 @test_mul_by_8(i32 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 8 ret i32 %mul } @@ -418,46 +136,11 @@ define i32 @test_mul_by_9(i32 %x) { ; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; 
JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 9 ret i32 %mul } @@ -470,46 +153,12 @@ define i32 @test_mul_by_10(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 10 ret i32 %mul } @@ -517,49 +166,13 @@ define i32 @test_mul_by_10(i32 %x) { define i32 @test_mul_by_11(i32 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: 
[3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imull $11, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 11 ret i32 %mul } @@ -572,46 +185,12 @@ define i32 @test_mul_by_12(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 12 ret i32 %mul } @@ -619,49 +198,13 @@ define i32 @test_mul_by_12(i32 %x) { define i32 @test_mul_by_13(i32 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; 
X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imull $13, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 13 ret i32 %mul } @@ -669,52 +212,13 @@ define i32 @test_mul_by_13(i32 %x) { define i32 @test_mul_by_14(i32 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imull $14, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 14 ret i32 %mul } @@ -727,46 +231,12 @@ define i32 @test_mul_by_15(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal 
(%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 15 ret i32 %mul } @@ -778,47 +248,11 @@ define i32 @test_mul_by_16(i32 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shll $4, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 16 ret i32 %mul } @@ -832,49 +266,13 @@ define i32 @test_mul_by_17(i32 %x) { ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; 
SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $4, %eax +; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 17 ret i32 %mul } @@ -887,46 +285,12 @@ define i32 @test_mul_by_18(i32 %x) { ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 18 ret i32 %mul } @@ -934,54 +298,13 @@ define i32 @test_mul_by_18(i32 %x) { define i32 @test_mul_by_19(i32 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: shll $2, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq 
# sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 19 ret i32 %mul } @@ -994,46 +317,12 @@ define i32 @test_mul_by_20(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 20 ret i32 %mul } @@ -1041,49 +330,13 @@ define i32 @test_mul_by_20(i32 %x) { define i32 @test_mul_by_21(i32 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $21, %edi, %eax # 
sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imull $21, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 21 ret i32 %mul } @@ -1091,52 +344,13 @@ define i32 @test_mul_by_21(i32 %x) { define i32 @test_mul_by_22(i32 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imull $22, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 22 ret i32 %mul } @@ -1144,54 +358,13 @@ define i32 @test_mul_by_22(i32 %x) { define i32 @test_mul_by_23(i32 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: 
retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 23 ret i32 %mul } @@ -1204,46 +377,12 @@ define i32 @test_mul_by_24(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_24: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_24: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_24: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_24: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_24: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_24: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_24: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: shll $3, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 24 ret i32 %mul } @@ -1256,46 +395,12 @@ define i32 @test_mul_by_25(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_25: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_25: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_25: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_25: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_25: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_25: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # 
sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_25: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_25: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 25 ret i32 %mul } @@ -1303,54 +408,13 @@ define i32 @test_mul_by_25(i32 %x) { define i32 @test_mul_by_26(i32 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_26: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_26: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_26: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_26: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_26: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_26: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_26: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_26: +; X64: # BB#0: +; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 26 ret i32 %mul } @@ -1363,46 +427,12 @@ define i32 @test_mul_by_27(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_27: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_27: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_27: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_27: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_27: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # 
sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_27: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_27: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_27: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI %EDI %RDI +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 27 ret i32 %mul } @@ -1410,52 +440,13 @@ define i32 @test_mul_by_27(i32 %x) { define i32 @test_mul_by_28(i32 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_28: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_28: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_28: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_28: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_28: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_28: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_28: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_28: +; X64: # BB#0: +; X64-NEXT: imull $28, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 28 ret i32 %mul } @@ -1463,55 +454,13 @@ define i32 @test_mul_by_28(i32 %x) { define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_29: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_29: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; 
X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_29: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_29: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_29: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_29: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_29: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_29: +; X64: # BB#0: +; X64-NEXT: imull $29, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 29 ret i32 %mul } @@ -1519,58 +468,13 @@ define i32 @test_mul_by_29(i32 %x) { define i32 @test_mul_by_30(i32 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_30: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %eax, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %ecx, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_30: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: movl %edi, %ecx # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %ecx # sched: [1:0.50] -; X64-JAG-NEXT: subl %ecx, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_30: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_30: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_30: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_30: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_30: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_30: +; X64: # BB#0: +; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 30 ret i32 %mul } @@ -1584,46 +488,12 @@ define i32 @test_mul_by_31(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_31: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_31: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, 
%eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_31: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_31: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_31: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_31: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_31: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_31: +; X64: # BB#0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $5, %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 31 ret i32 %mul } @@ -1635,124 +505,11 @@ define i32 @test_mul_by_32(i32 %x) { ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_32: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_32: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_32: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $5, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_32: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_32: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_32: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_32: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_32: +; X64: # BB#0: +; X64-NEXT: shll $5, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 32 ret i32 %mul } - -; (x*9+42)*(x*5+2) -define i32 @test_mul_spec(i32 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: retl -; -; X64-HSW-LABEL: test_mul_spec: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI %EDI %RDI -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25] -; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] 
-; -; X64-JAG-LABEL: test_mul_spec: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI %EDI %RDI -; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_spec: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NOOPT-NEXT: imull %ecx, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_spec: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25] -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_spec: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_spec: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI %EDI %RDI -; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_spec: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI %EDI %RDI -; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] - %mul = mul nsw i32 %x, 9 - %add = add nsw i32 %mul, 42 - %mul2 = mul nsw i32 %x, 5 - %add2 = add nsw i32 %mul2, 2 - %mul3 = mul nsw i32 %add, %add2 - ret i32 %mul3 -} diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll index 22eb0bdc6c3f..8579179a8231 100644 --- a/test/CodeGen/X86/mul-constant-i64.ll +++ b/test/CodeGen/X86/mul-constant-i64.ll @@ -1,55 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG -; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 -define i64 @test_mul_by_1(i64 %x) nounwind { +define i64 @test_mul_by_1(i64 %x) { ; 
X86-LABEL: test_mul_by_1: ; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_1: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_1: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_1: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_1: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_1: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_1: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_1: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_1: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 1 ret i64 %mul } @@ -63,43 +26,10 @@ define i64 @test_mul_by_2(i64 %x) { ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_2: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_2: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_2: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $1, %eax, %edx -; X86-NOOPT-NEXT: addl %eax, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_2: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_2: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 2 ret i64 %mul } @@ -113,43 +43,10 @@ define i64 @test_mul_by_3(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $3, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax 
# sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 3 ret i64 %mul } @@ -163,43 +60,10 @@ define i64 @test_mul_by_4(i64 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $2, %eax, %edx -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 4 ret i64 %mul } @@ -213,43 +77,10 @@ define i64 @test_mul_by_5(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $5, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 5 ret i64 %mul } 
@@ -264,46 +95,11 @@ define i64 @test_mul_by_6(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $6, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 6 ret i64 %mul } @@ -319,46 +115,11 @@ define i64 @test_mul_by_7(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $7, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 7 ret i64 %mul } @@ -372,43 +133,10 @@ define i64 @test_mul_by_8(i64 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: 
leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $3, %eax, %edx -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 8 ret i64 %mul } @@ -422,43 +150,10 @@ define i64 @test_mul_by_9(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $9, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 9 ret i64 %mul } @@ -473,46 +168,11 @@ define i64 @test_mul_by_10(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $10, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; 
JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 10 ret i64 %mul } @@ -520,53 +180,16 @@ define i64 @test_mul_by_10(i64 %x) { define i64 @test_mul_by_11(i64 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %ecx ; X86-NEXT: movl $11, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $11, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imulq $11, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 11 ret i64 %mul } @@ -581,46 +204,11 @@ define i64 @test_mul_by_12(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $12, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # 
sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 12 ret i64 %mul } @@ -628,53 +216,16 @@ define i64 @test_mul_by_12(i64 %x) { define i64 @test_mul_by_13(i64 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $13, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $13, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imulq $13, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 13 ret i64 %mul } @@ -682,56 +233,16 @@ define i64 @test_mul_by_13(i64 %x) { define i64 @test_mul_by_14(i64 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $14, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $14, %eax -; X86-NOOPT-NEXT: 
mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imulq $14, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 14 ret i64 %mul } @@ -747,46 +258,11 @@ define i64 @test_mul_by_15(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $15, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 15 ret i64 %mul } @@ -800,49 +276,11 @@ define i64 @test_mul_by_16(i64 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $4, %eax, %edx -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; 
JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shlq $4, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 16 ret i64 %mul } @@ -859,49 +297,12 @@ define i64 @test_mul_by_17(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $17, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00] -; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 17 ret i64 %mul } @@ -916,46 +317,11 @@ define i64 @test_mul_by_18(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $18, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # 
BB#0: -; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 18 ret i64 %mul } @@ -963,58 +329,16 @@ define i64 @test_mul_by_18(i64 %x) { define i64 @test_mul_by_19(i64 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: shll $2, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $19, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $19, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imulq $19, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 19 ret i64 %mul } @@ -1029,46 +353,11 @@ define i64 @test_mul_by_20(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $20, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: 
-; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 20 ret i64 %mul } @@ -1076,53 +365,16 @@ define i64 @test_mul_by_20(i64 %x) { define i64 @test_mul_by_21(i64 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $21, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $21, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imulq $21, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 21 ret i64 %mul } @@ -1130,56 +382,16 @@ define i64 @test_mul_by_21(i64 %x) { define i64 @test_mul_by_22(i64 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: 
[1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $22, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imulq $22, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 22 ret i64 %mul } @@ -1187,58 +399,16 @@ define i64 @test_mul_by_22(i64 %x) { define i64 @test_mul_by_23(i64 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: shll $3, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $23, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $23, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imulq $23, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 23 ret i64 %mul } @@ -1253,46 +423,11 @@ define i64 @test_mul_by_24(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,8), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: 
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_24:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $24, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_24:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_24:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_24:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_24:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_24:
+; X64: # BB#0:
+; X64-NEXT: shlq $3, %rdi
+; X64-NEXT: leaq (%rdi,%rdi,2), %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 24
 ret i64 %mul
 }
@@ -1308,46 +443,11 @@ define i64 @test_mul_by_25(i64 %x) {
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_25:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_25:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_25:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $25, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_25:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_25:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_25:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_25:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_25:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,4), %rax
+; X64-NEXT: leaq (%rax,%rax,4), %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 25
 ret i64 %mul
 }
@@ -1355,58 +455,16 @@ define i64 @test_mul_by_25(i64 %x) {

 define i64 @test_mul_by_26(i64 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: leal (%ecx,%ecx,8), %eax
-; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: subl %eax, %ecx
 ; X86-NEXT: movl $26, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_26:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_26:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_26:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $26, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_26:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_26:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_26:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_26:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_26:
+; X64: # BB#0:
+; X64-NEXT: imulq $26, %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 26
 ret i64 %mul
 }
@@ -1422,46 +480,11 @@ define i64 @test_mul_by_27(i64 %x) {
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_27:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_27:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_27:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $27, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_27:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_27:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_27:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_27:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_27:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rdi,%rdi,8), %rax
+; X64-NEXT: leaq (%rax,%rax,2), %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 27
 ret i64 %mul
 }
@@ -1469,56 +492,16 @@ define i64 @test_mul_by_27(i64 %x) {

 define i64 @test_mul_by_28(i64 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: addl %eax, %ecx
 ; X86-NEXT: movl $28, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_28:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_28:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_28:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $28, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_28:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_28:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_28:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_28:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_28:
+; X64: # BB#0:
+; X64-NEXT: imulq $28, %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 28
 ret i64 %mul
 }
@@ -1526,59 +509,16 @@ define i64 @test_mul_by_28(i64 %x) {

 define i64 @test_mul_by_29(i64 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal (%eax,%eax,8), %ecx
-; X86-NEXT: leal (%ecx,%ecx,2), %ecx
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: addl %eax, %ecx
 ; X86-NEXT: movl $29, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_29:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_29:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_29:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $29, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_29:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_29:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_29:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_29:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_29:
+; X64: # BB#0:
+; X64-NEXT: imulq $29, %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 29
 ret i64 %mul
 }
@@ -1586,60 +526,16 @@ define i64 @test_mul_by_29(i64 %x) {

 define i64 @test_mul_by_30(i64 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86: # BB#0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll $5, %ecx
 ; X86-NEXT: movl $30, %eax
 ; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_30:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: movq %rdi, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: subq %rax, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: subq %rcx, %rdi # sched: [1:0.25]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_30:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: movq %rdi, %rcx # sched: [1:0.17]
-; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rax, %rcx # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rcx, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_30:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $30, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_30:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_30:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_30:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_30:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_30:
+; X64: # BB#0:
+; X64-NEXT: imulq $30, %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 30
 ret i64 %mul
 }
@@ -1656,49 +552,12 @@ define i64 @test_mul_by_31(i64 %x) {
 ; X86-NEXT: addl %ecx, %edx
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_31:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_31:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_31:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl $31, %eax
-; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_31:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_31:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_31:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00]
-; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_31:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_31:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shlq $5, %rax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 31
 ret i64 %mul
 }
@@ -1712,168 +571,11 @@ define i64 @test_mul_by_32(i64 %x) {
 ; X86-NEXT: shll $5, %eax
 ; X86-NEXT: retl
 ;
-; X64-HSW-LABEL: test_mul_by_32:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_by_32:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_by_32:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NOOPT-NEXT: shldl $5, %eax, %edx
-; X86-NOOPT-NEXT: shll $5, %eax
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_by_32:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_by_32:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_by_32:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00]
-; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_by_32:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00]
-; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
+; X64-LABEL: test_mul_by_32:
+; X64: # BB#0:
+; X64-NEXT: shlq $5, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
 %mul = mul nsw i64 %x, 32
 ret i64 %mul
 }
-
-; (x*9+42)*(x*5+2)
-define i64 @test_mul_spec(i64 %x) nounwind {
-; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl $9, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: leal (%edi,%edi,8), %ebx
-; X86-NEXT: addl $42, %esi
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movl $5, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: leal (%edi,%edi,4), %edi
-; X86-NEXT: addl $2, %ecx
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: imull %esi, %edi
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: retl
-;
-; X64-HSW-LABEL: test_mul_spec:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25]
-; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
-;
-; X64-JAG-LABEL: test_mul_spec:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-JAG-NEXT: retq # sched: [4:1.00]
-;
-; X86-NOOPT-LABEL: test_mul_spec:
-; X86-NOOPT: # BB#0:
-; X86-NOOPT-NEXT: pushl %ebx
-; X86-NOOPT-NEXT: pushl %edi
-; X86-NOOPT-NEXT: pushl %esi
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NOOPT-NEXT: movl $9, %edx
-; X86-NOOPT-NEXT: movl %ecx, %eax
-; X86-NOOPT-NEXT: mull %edx
-; X86-NOOPT-NEXT: movl %eax, %esi
-; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx
-; X86-NOOPT-NEXT: addl $42, %esi
-; X86-NOOPT-NEXT: adcl %edx, %ebx
-; X86-NOOPT-NEXT: movl $5, %edx
-; X86-NOOPT-NEXT: movl %ecx, %eax
-; X86-NOOPT-NEXT: mull %edx
-; X86-NOOPT-NEXT: movl %eax, %ecx
-; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi
-; X86-NOOPT-NEXT: addl $2, %ecx
-; X86-NOOPT-NEXT: adcl %edx, %edi
-; X86-NOOPT-NEXT: movl %esi, %eax
-; X86-NOOPT-NEXT: mull %ecx
-; X86-NOOPT-NEXT: imull %esi, %edi
-; X86-NOOPT-NEXT: addl %edi, %edx
-; X86-NOOPT-NEXT: imull %ebx, %ecx
-; X86-NOOPT-NEXT: addl %ecx, %edx
-; X86-NOOPT-NEXT: popl %esi
-; X86-NOOPT-NEXT: popl %edi
-; X86-NOOPT-NEXT: popl %ebx
-; X86-NOOPT-NEXT: retl
-;
-; HSW-NOOPT-LABEL: test_mul_spec:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25]
-; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
-;
-; JAG-NOOPT-LABEL: test_mul_spec:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
-; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
-; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
-;
-; X64-SLM-LABEL: test_mul_spec:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
-; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
-; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-SLM-NEXT: retq # sched: [4:1.00]
-;
-; SLM-NOOPT-LABEL: test_mul_spec:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
-; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
-; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
- %mul = mul nsw i64 %x, 9
- %add = add nsw i64 %mul, 42
- %mul2 = mul nsw i64 %x, 5
- %add2 = add nsw i64 %mul2, 2
- %mul3 = mul nsw i64 %add, %add2
- ret i64 %mul3
-}
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index d26cf02dd942..0bda41a30c69 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ; SSE2-LABEL: interleave_24i8_in:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT: pxor %xmm2, %xmm2
 ; SSE2-NEXT: movdqa %xmm1, %xmm3
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ; SSE42: # BB#0:
 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
-; SSE42-NEXT: movdqa %xmm2, %xmm3
+; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
-; SSE42-NEXT: por %xmm1, %xmm3
+; SSE42-NEXT: por %xmm2, %xmm3
 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm0, %xmm2
-; SSE42-NEXT: movq %xmm2, 16(%rdi)
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: movq %xmm1, 16(%rdi)
 ; SSE42-NEXT: movdqu %xmm3, (%rdi)
 ; SSE42-NEXT: retq
 ;
@@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
 ; AVX: # BB#0:
 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm1, (%rdi)
+; AVX-NEXT: vmovdqu %xmm2, (%rdi)
 ; AVX-NEXT: retq
 %s1 = load <8 x i8>, <8 x i8>* %q1, align 4
 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 88cb7a6d5825..50a661fcca11 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE2-NEXT: movdqa %xmm1, %xmm4
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
 ; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm2, %xmm4
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: pmuludq %xmm2, %xmm4
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
 ; SSE41-NEXT: retq
 ;
@@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; SSE2-NEXT: movdqa %xmm1, %xmm5
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm7
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE2-NEXT: pmuludq %xmm7, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm5
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pmuludq %xmm7, %xmm5
-; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pmuludq %xmm8, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
 ; SSE2-NEXT: movaps %xmm4, %xmm0
 ; SSE2-NEXT: movaps %xmm5, %xmm1
@@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; SSE41-LABEL: mul_v8i64_zero_upper:
 ; SSE41: # BB#0: # %entry
 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm4, %xmm1
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: pmuludq %xmm5, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pmuludq %xmm6, %xmm2
 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
 ; SSE41-NEXT: pmuludq %xmm7, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: pmuludq %xmm5, %xmm0
-; SSE41-NEXT: pmuludq %xmm8, %xmm4
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
 ; SSE41-NEXT: retq
 ;
@@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: retq
 ;
@@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; SSE41-LABEL: mul_v8i64_sext:
 ; SSE41: # BB#0:
 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm7
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT: pmuldq %xmm4, %xmm3
 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
+; SSE41-NEXT: pmuldq %xmm5, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm4
 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
-; SSE41-NEXT: pmuldq %xmm5, %xmm0
-; SSE41-NEXT: pmuldq %xmm7, %xmm4
-; SSE41-NEXT: pmuldq %xmm6, %xmm2
-; SSE41-NEXT: pmuldq %xmm8, %xmm3
+; SSE41-NEXT: pmuldq %xmm7, %xmm0
 ; SSE41-NEXT: movdqa %xmm4, %xmm1
 ; SSE41-NEXT: retq
 ;
@@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
 ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa %ymm2, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: mul_v8i64_sext:
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 571dd6774906..c54909cf93c1 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -1,81 +1,17 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X86-O0
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X64-O0
+; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
+; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
+; REQUIRES: asserts

 @c = external constant i8, align 1

 define void @foo() {
-; X86-LABEL: foo:
-; X86: # BB#0: # %entry
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: .Lcfi0:
-; X86-NEXT: .cfi_def_cfa_offset 12
-; X86-NEXT: movzbl c, %eax
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %cl
-; X86-NEXT: testb %al, %al
-; X86-NEXT: setne {{[0-9]+}}(%esp)
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: setle %dl
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: retl
-;
-; X86-O0-LABEL: foo:
-; X86-O0: # BB#0: # %entry
-; X86-O0-NEXT: subl $12, %esp
-; X86-O0-NEXT: .Lcfi0:
-; X86-O0-NEXT: .cfi_def_cfa_offset 16
-; X86-O0-NEXT: movb c, %al
-; X86-O0-NEXT: testb %al, %al
-; X86-O0-NEXT: setne {{[0-9]+}}(%esp)
-; X86-O0-NEXT: movzbl c, %ecx
-; X86-O0-NEXT: testl %ecx, %ecx
-; X86-O0-NEXT: setne %al
-; X86-O0-NEXT: movzbl %al, %edx
-; X86-O0-NEXT: subl %ecx, %edx
-; X86-O0-NEXT: setle %al
-; X86-O0-NEXT: andb $1, %al
-; X86-O0-NEXT: movzbl %al, %ecx
-; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-O0-NEXT: addl $12, %esp
-; X86-O0-NEXT: retl
-;
-; X64-LABEL: foo:
-; X64: # BB#0: # %entry
-; X64-NEXT: movzbl {{.*}}(%rip), %eax
-; X64-NEXT: testb %al, %al
-; X64-NEXT: setne -{{[0-9]+}}(%rsp)
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %cl
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %eax, %ecx
-; X64-NEXT: setle %dl
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: retq
-;
-; X64-O0-LABEL: foo:
-; X64-O0: # BB#0: # %entry
-; X64-O0-NEXT: movb {{.*}}(%rip), %al
-; X64-O0-NEXT: testb %al, %al
-; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx
-; X64-O0-NEXT: testl %ecx, %ecx
-; X64-O0-NEXT: setne %al
-; X64-O0-NEXT: movzbl %al, %edx
-; X64-O0-NEXT: subl %ecx, %edx
-; X64-O0-NEXT: setle %al
-; X64-O0-NEXT: andb $1, %al
-; X64-O0-NEXT: movzbl %al, %ecx
-; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-O0-NEXT: retq
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-DAG: setne
+; CHECK-DAG: setle
+; CHECK: ret
 entry:
 %a = alloca i8, align 1
 %b = alloca i32, align 4
@@ -100,3 +36,125 @@ entry:
 store i32 %conv8, i32* %b, align 4
 ret void
 }
+
+@var_5 = external global i32, align 4
+@var_57 = external global i64, align 8
+@_ZN8struct_210member_2_0E = external global i64, align 8
+
+define void @f1() {
+; CHECK-LABEL: f1:
+; CHECK: # BB#0: # %entry
+; CHECK: sete
+; X64: addq $7093, {{.*}}
+; 686: addl $7093, {{.*}}
+; CHECK: ret
+entry:
+ %a = alloca i8, align 1
+ %0 = load i32, i32* @var_5, align 4
+ %conv = sext i32 %0 to i64
+ %add = add nsw i64 %conv, 8381627093
+ %tobool = icmp ne i64 %add, 0
+ %frombool = zext i1 %tobool to i8
+ store i8 %frombool, i8* %a, align 1
+ %1 = load i32, i32* @var_5, align 4
+ %neg = xor i32 %1, -1
+ %tobool1 = icmp ne i32 %neg, 0
+ %lnot = xor i1 %tobool1, true
+ %conv2 = zext i1 %lnot to i64
+ %2 = load i32, i32* @var_5, align 4
+ %conv3 = sext i32 %2 to i64
+ %add4 = add nsw i64 %conv3, 7093
+ %cmp = icmp sgt i64 %conv2, %add4
+ %conv5 = zext i1 %cmp to i64
+ store i64 %conv5, i64* @var_57, align 8
+ %3 = load i32, i32* @var_5, align 4
+ %neg6 = xor i32 %3, -1
+ %tobool7 = icmp ne i32 %neg6, 0
+ %lnot8 = xor i1 %tobool7, true
+ %conv9 = zext i1 %lnot8 to i64
+ store i64 %conv9, i64* @_ZN8struct_210member_2_0E, align 8
+ ret void
+}
+
+
+@var_7 = external global i8, align 1
+
+define void @f2() {
+; CHECK-LABEL: f2:
+; CHECK: # BB#0: # %entry
+; X64: movzbl {{.*}}(%rip), %[[R:[a-z]*]]
+; 686: movzbl {{.*}}, %[[R:[a-z]*]]
+; CHECK: test{{[qlwb]}} %[[R]], %[[R]]
+; CHECK: sete {{.*}}
+; CHECK: ret
+entry:
+ %a = alloca i16, align 2
+ %0 = load i8, i8* @var_7, align 1
+ %conv = zext i8 %0 to i32
+ %1 = load i8, i8* @var_7, align 1
+ %tobool = icmp ne i8 %1, 0
+ %lnot = xor i1 %tobool, true
+ %conv1 = zext i1 %lnot to i32
+ %xor = xor i32 %conv, %conv1
+ %conv2 = trunc i32 %xor to i16
+ store i16 %conv2, i16* %a, align 2
+ %2 = load i8, i8* @var_7, align 1
+ %conv3 = zext i8 %2 to i16
+ %tobool4 = icmp ne i16 %conv3, 0
+ %lnot5 = xor i1 %tobool4, true
+ %conv6 = zext i1 %lnot5 to i32
+ %3 = load i8, i8* @var_7, align 1
+ %conv7 = zext i8 %3 to i32
+ %cmp = icmp eq i32 %conv6, %conv7
+ %conv8 = zext i1 %cmp to i32
+ %conv9 = trunc i32 %conv8 to i16
+ store i16 %conv9, i16* undef, align 2
+ ret void
+}
+
+
+@var_13 = external global i32, align 4
+@var_16 = external global i32, align 4
+@var_46 = external global i32, align 4
+
+define void @f3() #0 {
+; CHECK-LABEL: f3:
+; X64-DAG: movl var_13(%rip), {{.*}}
+; X64-DAG: movl var_16(%rip), {{.*}}
+; X64-DAG: movl {{.*}},{{.*}}var_46{{.*}}
+; X64: retq
+; 686-DAG: movl var_13, {{.*}}
+; 686-DAG: movl var_16, {{.*}}
+; 686-DAG: movl {{.*}},{{.*}}var_46{{.*}}
+; 686: retl
+entry:
+ %a = alloca i64, align 8
+ %0 = load i32, i32* @var_13, align 4
+ %neg = xor i32 %0, -1
+ %conv = zext i32 %neg to i64
+ %1 = load i32, i32* @var_13, align 4
+ %tobool = icmp ne i32 %1, 0
+ %lnot = xor i1 %tobool, true
+ %conv1 = zext i1 %lnot to i64
+ %2 = load i32, i32* @var_13, align 4
+ %neg2 = xor i32 %2, -1
+ %3 = load i32, i32* @var_16, align 4
+ %xor = xor i32 %neg2, %3
+ %conv3 = zext i32 %xor to i64
+ %and = and i64 %conv1, %conv3
+ %or = or i64 %conv, %and
+ store i64 %or, i64* %a, align 8
+ %4 = load i32, i32* @var_13, align 4
+ %neg4 = xor i32 %4, -1
+ %conv5 = zext i32 %neg4 to i64
+ %5 = load i32, i32* @var_13, align 4
+ %tobool6 = icmp ne i32 %5, 0
+ %lnot7 = xor i1 %tobool6, true
+ %conv8 = zext i1 %lnot7 to i64
+ %and9 = and i64 %conv8, 0
+ %or10 = or i64 %conv5, %and9
+ %conv11 = trunc i64 %or10 to i32
+ store i32 %conv11, i32* @var_46, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr32610.ll b/test/CodeGen/X86/pr32610.ll
new file mode 100644
index 000000000000..1116cf6f1b29
--- /dev/null
+++ b/test/CodeGen/X86/pr32610.ll
@@ -0,0 +1,40 @@
+; RUN: llc -o - %s | FileCheck %s
+
+; CHECK-LABEL: @pr32610
+; CHECK: movl L_b$non_lazy_ptr, [[BASEREG:%[a-z]+]]
+; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
+; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.13.0"
+
+@c = external local_unnamed_addr global i32, align 4
+@b = external local_unnamed_addr global [1 x i32], align 4
+@d = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: norecurse nounwind optsize ssp
+define void @pr32610() local_unnamed_addr #0 {
+entry:
+ %0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i32 0, i32 undef), align 4, !tbaa !1
+ %cmp = icmp eq i32 undef, %0
+ %conv = zext i1 %cmp to i32
+ %tobool1.i = icmp ne i32 undef, 0
+ %or.cond.i = and i1 %cmp, %tobool1.i
+ %cond.i = select i1 %or.cond.i, i32 %conv, i32 undef
+ store i32 %cond.i, i32* @c, align 4, !tbaa !1
+ %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4
+ %tobool = icmp ne i32 %1, 0
+ %2 = select i1 %tobool, i32 %1, i32 undef
+ store i32 %2, i32* @d, align 4, !tbaa !1
+ ret void
+}
+
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301507) (llvm/trunk 301505)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 5d5150ad62d6..4be3a4c2391b 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
 ; 32-NEXT: movl %ebx, %esi
 ; 32-NEXT: xorl %ebx, %ebx
 ; 32-NEXT: .LBB0_4:
-; 32-NEXT: orl %esi, %eax
 ; 32-NEXT: orl %ebx, %edx
+; 32-NEXT: orl %esi, %eax
 ; 32-NEXT: popl %esi
 ; 32-NEXT: popl %edi
 ; 32-NEXT: popl %ebx
@@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
 ; 32-NEXT: movl %ebx, %esi
 ; 32-NEXT: xorl %ebx, %ebx
 ; 32-NEXT: .LBB1_4:
-; 32-NEXT: orl %ebx, %eax
 ; 32-NEXT: orl %esi, %edx
+; 32-NEXT: orl %ebx, %eax
 ; 32-NEXT: popl %esi
 ; 32-NEXT: popl %edi
 ; 32-NEXT: popl %ebx
@@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
 ; 32-LABEL: rotr1_64_mem:
 ; 32: # BB#0:
 ; 32-NEXT: pushl %esi
-; 32-NEXT: movl 8(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; 32-NEXT: movl (%eax), %ecx
 ; 32-NEXT: movl 4(%eax), %edx
 ; 32-NEXT: movl %edx, %esi
@@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
 ; 32-NEXT: movl %ecx, 4(%eax)
 ; 32-NEXT: movl %esi, (%eax)
 ; 32-NEXT: popl %esi
-
+; 32-NEXT: retl
+;
 ; 64-LABEL: rotr1_64_mem:
 ; 64: # BB#0:
 ; 64-NEXT: rorq (%rdi)
 ; 64-NEXT: retq
+
 %A = load i64, i64 *%Aptr
 %B = shl i64 %A, 63
 %C = lshr i64 %A, 1
@@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
 define void @rotr1_32_mem(i32* %Aptr) nounwind {
 ; 32-LABEL: rotr1_32_mem:
 ; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; 32-NEXT: rorl (%eax)
 ; 32-NEXT: retl
 ;
@@ -590,7 +592,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind {
 ; 32-LABEL: rotr1_16_mem:
 ; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; 32-NEXT: rorw (%eax)
 ; 32-NEXT: retl
 ;
@@ -609,7 +611,7 @@ define void @rotr1_8_mem(i8* %Aptr) nounwind {
 ; 32-LABEL: rotr1_8_mem:
 ; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; 32-NEXT: rorb (%eax)
 ; 32-NEXT: retl
 ;
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index b8a8b8afd14f..6a565a5c76f0 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -149,127 +149,131 @@ middle.block:

 define i32 @sad_32i8() nounwind {
 ; SSE2-LABEL: sad_32i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB1_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm10, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
 ; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm10, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm8
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: paddd %xmm7, %xmm13
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm14
 ; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm14
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm0
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm3
-; SSE2-NEXT: paddd %xmm14, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm14, %xmm13
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: paddd %xmm4, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
@@ -398,288 +402,284 @@ middle.block:

 define i32 @sad_avx64i8() nounwind {
 ; SSE2-LABEL: sad_avx64i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: subq $184, %rsp
-; SSE2-NEXT: pxor %xmm15, %xmm15
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: subq $200, %rsp
 ; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm10, %xmm10
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB2_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
-; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm11
-; SSE2-NEXT: movdqa a+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm8
-; SSE2-NEXT: movdqa %xmm13, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm13, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm11
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps a+1040(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+;
SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm12, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; 
SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, 
%xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 +; 
SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; 
SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 -; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpabsd %ymm12, %ymm8 -; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: 
vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 ; AVX2-NEXT: vpabsd %ymm10, %ymm8 ; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 +; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 +; AVX2-NEXT: vpabsd %ymm12, %ymm8 +; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; 
SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: 
psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8..1afef86a5f11 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ATOM-NEXT: jmp LBB7_6 -; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 -; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: jmp LBB7_6 +; ATOM-NEXT: LBB7_4: +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 +; ATOM-NEXT: movd %edx, %xmm1 +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 1b8f8e7ae559..2628f824ea40 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -45,64 +45,21 @@ define void @pr26232(i64 %a, <16 x i1> %b) { ; AVX-LABEL: pr26232: ; AVX: # BB#0: # %for_loop599.preheader ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_1: # %for_loop599 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000 ; AVX-NEXT: setl %al -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $14, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $13, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $12, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $11, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $9, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $7, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: 
movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $5, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $4, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $3, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $2, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $1, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vpextrb $0, %xmm2, %eax
-; AVX-NEXT: andb $1, %al
-; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: cmpw $0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovd %eax, %xmm3
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3
+; AVX-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
+; AVX-NEXT: vpmovmskb %xmm3, %eax
+; AVX-NEXT: testw %ax, %ax
 ; AVX-NEXT: jne .LBB1_1
 ; AVX-NEXT: # BB#2: # %for_exit600
 ; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 2996edaec3e0..332bf2887fb0 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
 ; SSE2-LABEL: ne_i256:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
 ; SSE2-NEXT: setne %al
 ; SSE2-NEXT: retq
 ;
@@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
 ; SSE2-LABEL: eq_i256:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
 ; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll
index c869dff9e642..6701c247e6fc 100644
--- a/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi
 ; CHECK-NEXT: movzbl (%edx,%ecx), %edx
 ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx
+; CHECK-NEXT: imull %edi, %ebx
 ; CHECK-NEXT: movzbl (%eax,%ecx), %eax
 ; CHECK-NEXT: imull %edx, %eax
-; CHECK-NEXT: imull %edi, %ebx
 ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4)
 ; CHECK-NEXT: movl %eax, (%esi,%ecx,4)
 ; CHECK-NEXT: popl %esi
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 503b9416c8d3..4a0dc9c1eb17 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X32-NEXT: addss %xmm1, %xmm0
 ; X32-NEXT: addss %xmm2, %xmm3
+; X32-NEXT: addss %xmm1, %xmm0
 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X32-NEXT: retl
 ;
@@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
 ; X64: ## BB#0: ## %entry
 ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-NEXT: addss %xmm1, %xmm0
 ; X64-NEXT: addss %xmm2, %xmm3
+; X64-NEXT: addss %xmm1, %xmm0
 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
 ; X64-NEXT: retq
 entry:
@@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: addps %xmm1, %xmm0
 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
 ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT: addps %xmm1, %xmm0
 ; X32-NEXT: addps %xmm2, %xmm3
 ; X32-NEXT: addps %xmm3, %xmm0
 ; X32-NEXT: retl
 ;
@@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: addps %xmm1, %xmm0
 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
 ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X64-NEXT: addps %xmm1, %xmm0
 ; X64-NEXT: addps %xmm2, %xmm3
 ; X64-NEXT: addps %xmm3, %xmm0
 ; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index 226c0adbaf3c..2fb821555dba 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2
 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2
 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index a05a981daa1f..f0a5fe1dbfff 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
 ; SSE2-NEXT: psrad $31, %xmm1
 ; SSE2-NEXT: pand %xmm1, %xmm3
 ; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
 ; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pandn %xmm4, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: blend_logic_v8i32:
@@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
 ; SSSE3-NEXT: psrad $31, %xmm1
 ; SSSE3-NEXT: pand %xmm1, %xmm3
 ; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
 ; SSSE3-NEXT: pand %xmm0, %xmm2
 ; SSSE3-NEXT: pandn %xmm4, %xmm0
 ; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: blend_logic_v8i32:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index f4d0503f4a79..4181a374c61c 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; AVX-NEXT: retq
 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
 ; AVX-NEXT: retq
 %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -120,9 +120,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: retq
 %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll
index fddc7906e08f..939fa0404223 100644
--- a/test/CodeGen/X86/xchg-nofold.ll
+++ b/test/CodeGen/X86/xchg-nofold.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s

 %"struct.std::atomic" = type { %"struct.std::atomic_bool" }
@@ -6,6 +7,28 @@
 ; CHECK-LABEL: _Z3fooRSt6atomicIbEb
 define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind {
+; CHECK-LABEL: _Z3fooRSt6atomicIbEb:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: shrq $3, %rax
+; CHECK-NEXT: movb 2147450880(%rax), %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je .LBB0_3
+; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: andl $7, %ecx
+; CHECK-NEXT: cmpb %al, %cl
+; CHECK-NEXT: jge .LBB0_2
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xchgb %al, (%rdi)
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq __asan_report_store1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
 entry:
 %frombool.i.i = zext i1 %b to i8
 %_M_i.i.i = getelementptr inbounds %"struct.std::atomic", %"struct.std::atomic"* %a, i64 0, i32 0, i32 0, i32 0
@@ -30,7 +53,6 @@ entry:
 ;