From e831c3dab0fd00c84d09be70b9d1b27285cba04d Mon Sep 17 00:00:00 2001 From: dim Date: Mon, 29 May 2017 16:25:25 +0000 Subject: [PATCH] Vendor import of llvm trunk r304149: https://llvm.org/svn/llvm-project/llvm/trunk@304149 --- CMakeLists.txt | 7 + docs/Benchmarking.rst | 87 + docs/GettingStartedVS.rst | 4 + docs/LangRef.rst | 471 ++- docs/Vectorizers.rst | 4 +- docs/index.rst | 1 + .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h | 6 +- .../BuildingAJIT/Chapter2/KaleidoscopeJIT.h | 10 +- .../BuildingAJIT/Chapter3/KaleidoscopeJIT.h | 9 +- .../BuildingAJIT/Chapter4/KaleidoscopeJIT.h | 13 +- .../BuildingAJIT/Chapter5/KaleidoscopeJIT.h | 14 +- .../BuildingAJIT/Chapter5/Server/server.cpp | 28 +- .../Kaleidoscope/include/KaleidoscopeJIT.h | 9 +- include/llvm/ADT/Triple.h | 1 + include/llvm/Analysis/InstructionSimplify.h | 291 +- include/llvm/Analysis/LoopPass.h | 5 +- include/llvm/Analysis/ScalarEvolution.h | 6 + include/llvm/Analysis/TargetTransformInfo.h | 7 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + include/llvm/Analysis/ValueTracking.h | 3 +- include/llvm/CodeGen/AsmPrinter.h | 6 +- include/llvm/CodeGen/AtomicExpandUtils.h | 26 +- include/llvm/CodeGen/DIE.h | 86 +- include/llvm/CodeGen/FaultMaps.h | 35 +- include/llvm/CodeGen/GlobalISel/Localizer.h | 78 + include/llvm/CodeGen/ISDOpcodes.h | 8 + include/llvm/CodeGen/LiveInterval.h | 86 +- include/llvm/CodeGen/LiveIntervalAnalysis.h | 43 +- include/llvm/CodeGen/LiveIntervalUnion.h | 14 +- include/llvm/CodeGen/LivePhysRegs.h | 120 +- include/llvm/CodeGen/LiveRangeEdit.h | 80 +- include/llvm/CodeGen/LiveStackAnalysis.h | 23 +- include/llvm/CodeGen/MachineBasicBlock.h | 121 +- .../llvm/CodeGen/MachineBlockFrequencyInfo.h | 17 +- .../llvm/CodeGen/MachineDominanceFrontier.h | 25 +- include/llvm/CodeGen/MachineDominators.h | 18 +- include/llvm/CodeGen/MachineInstr.h | 37 +- include/llvm/CodeGen/MachineRegisterInfo.h | 5 + include/llvm/CodeGen/MachineValueType.h | 2 +- include/llvm/CodeGen/ScheduleDAG.h | 6 +- include/llvm/CodeGen/ScheduleDAGInstrs.h | 5 +- include/llvm/CodeGen/SelectionDAG.h | 5 + include/llvm/CodeGen/SelectionDAGNodes.h | 26 + include/llvm/DebugInfo/CodeView/CVRecord.h | 8 + .../llvm/DebugInfo/CodeView/CVTypeVisitor.h | 1 + .../DebugInfo/CodeView/TypeDeserializer.h | 11 + .../DebugInfo/CodeView/TypeIndexDiscovery.h | 33 + include/llvm/DebugInfo/CodeView/TypeRecord.h | 11 +- .../llvm/DebugInfo/CodeView/TypeSerializer.h | 37 +- .../DebugInfo/CodeView/TypeStreamMerger.h | 69 +- .../DebugInfo/CodeView/TypeTableBuilder.h | 31 +- .../DebugInfo/CodeView/TypeTableCollection.h | 4 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 18 +- .../DebugInfo/DWARF/DWARFDebugRangeList.h | 3 + include/llvm/DebugInfo/DWARF/DWARFDie.h | 3 +- include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 2 + include/llvm/DebugInfo/DWARF/DWARFRelocMap.h | 3 + include/llvm/DebugInfo/DWARF/DWARFUnit.h | 12 +- .../llvm/DebugInfo/MSF/MappedBlockStream.h | 16 +- .../DebugInfo/PDB/Native/DbiStreamBuilder.h | 1 + .../PDB/Native/PDBTypeServerHandler.h | 5 +- include/llvm/DebugInfo/PDB/Native/TpiStream.h | 7 + include/llvm/IR/Attributes.h | 46 +- include/llvm/IR/BasicBlock.h | 45 + include/llvm/IR/IntrinsicInst.h | 13 + include/llvm/IR/Intrinsics.td | 58 +- include/llvm/IR/IntrinsicsAMDGPU.td | 10 + include/llvm/IR/Metadata.h | 1 + include/llvm/IR/Module.h | 5 +- include/llvm/InitializePasses.h | 2 + include/llvm/LTO/Config.h | 2 +- include/llvm/Object/Binary.h | 4 +- include/llvm/Object/COFF.h | 1 + include/llvm/Object/ELFObjectFile.h | 12 + include/llvm/Object/MachO.h | 1 
+ include/llvm/Object/ObjectFile.h | 6 + include/llvm/Object/RelocVisitor.h | 513 +-- include/llvm/Object/Wasm.h | 1 + include/llvm/Option/OptTable.h | 8 + include/llvm/ProfileData/InstrProf.h | 6 +- include/llvm/TableGen/Record.h | 48 +- include/llvm/Target/TargetLowering.h | 6 +- include/llvm/Transforms/Scalar.h | 7 + include/llvm/Transforms/Scalar/GVN.h | 34 +- include/llvm/Transforms/Utils/Local.h | 12 + lib/Analysis/ConstantFolding.cpp | 7 +- lib/Analysis/InstructionSimplify.cpp | 152 +- lib/Analysis/Lint.cpp | 8 +- lib/Analysis/LoopPass.cpp | 23 +- lib/Analysis/ScalarEvolution.cpp | 74 +- lib/Analysis/ScalarEvolutionExpander.cpp | 20 +- lib/Analysis/TargetTransformInfo.cpp | 4 + lib/Analysis/ValueTracking.cpp | 6 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 34 +- lib/Bitcode/Writer/ValueEnumerator.cpp | 7 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 10 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 8 +- lib/CodeGen/AsmPrinter/DIEHash.cpp | 120 +- lib/CodeGen/AsmPrinter/DIEHash.h | 55 +- lib/CodeGen/AsmPrinter/DIEHashAttributes.def | 55 + lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 9 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 8 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 4 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 125 +- lib/CodeGen/AsmPrinter/DwarfDebug.h | 9 +- lib/CodeGen/AtomicExpandPass.cpp | 2 +- lib/CodeGen/BasicTargetTransformInfo.cpp | 2 - lib/CodeGen/BranchCoalescing.cpp | 6 +- lib/CodeGen/BranchFolding.cpp | 11 +- lib/CodeGen/BranchFolding.h | 1 + lib/CodeGen/BranchRelaxation.cpp | 6 +- lib/CodeGen/CodeGenPrepare.cpp | 4 +- lib/CodeGen/DeadMachineInstructionElim.cpp | 4 +- lib/CodeGen/DetectDeadLanes.cpp | 3 +- lib/CodeGen/DwarfEHPrepare.cpp | 4 +- lib/CodeGen/EarlyIfConversion.cpp | 8 +- lib/CodeGen/ExpandISelPseudos.cpp | 2 +- lib/CodeGen/ExpandPostRAPseudos.cpp | 2 +- lib/CodeGen/FuncletLayout.cpp | 2 +- lib/CodeGen/GlobalISel/CMakeLists.txt | 5 +- lib/CodeGen/GlobalISel/GlobalISel.cpp | 1 + lib/CodeGen/GlobalISel/Localizer.cpp | 125 + lib/CodeGen/GlobalMerge.cpp | 5 +- lib/CodeGen/IfConversion.cpp | 33 +- lib/CodeGen/ImplicitNullChecks.cpp | 4 +- lib/CodeGen/InterleavedAccessPass.cpp | 6 +- lib/CodeGen/LexicalScopes.cpp | 5 +- lib/CodeGen/LiveDebugValues.cpp | 4 +- lib/CodeGen/LiveDebugVariables.cpp | 6 +- lib/CodeGen/LiveIntervalAnalysis.cpp | 49 +- lib/CodeGen/LivePhysRegs.cpp | 89 +- lib/CodeGen/LiveStackAnalysis.cpp | 4 +- lib/CodeGen/LocalStackSlotAllocation.cpp | 4 +- lib/CodeGen/LowerEmuTLS.cpp | 2 +- lib/CodeGen/MachineBlockFrequencyInfo.cpp | 6 +- lib/CodeGen/MachineBlockPlacement.cpp | 4 +- lib/CodeGen/MachineCSE.cpp | 12 +- lib/CodeGen/MachineCombiner.cpp | 4 +- lib/CodeGen/MachineCopyPropagation.cpp | 4 +- lib/CodeGen/MachineLICM.cpp | 10 +- lib/CodeGen/MachineOutliner.cpp | 2 +- lib/CodeGen/MachinePipeliner.cpp | 4 +- lib/CodeGen/MachineScheduler.cpp | 14 +- lib/CodeGen/MachineSink.cpp | 8 +- lib/CodeGen/MachineTraceMetrics.cpp | 8 +- lib/CodeGen/MachineVerifier.cpp | 5 - lib/CodeGen/OptimizePHIs.cpp | 4 +- lib/CodeGen/PHIElimination.cpp | 4 +- lib/CodeGen/PostRASchedulerList.cpp | 4 +- lib/CodeGen/ProcessImplicitDefs.cpp | 6 +- lib/CodeGen/PrologEpilogInserter.cpp | 9 +- lib/CodeGen/RenameIndependentSubregs.cpp | 4 +- lib/CodeGen/SafeStack.cpp | 6 +- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 8 +- lib/CodeGen/ScheduleDAGInstrs.cpp | 206 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 67 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 79 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 90 +- 
.../SelectionDAG/SelectionDAGBuilder.h | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 55 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 34 +- lib/CodeGen/ShadowStackGCLowering.cpp | 6 +- lib/CodeGen/ShrinkWrap.cpp | 5 +- lib/CodeGen/SjLjEHPrepare.cpp | 2 +- lib/CodeGen/SlotIndexes.cpp | 2 +- lib/CodeGen/SpillPlacement.cpp | 6 +- lib/CodeGen/StackColoring.cpp | 10 +- lib/CodeGen/StackProtector.cpp | 4 +- lib/CodeGen/StackSlotColoring.cpp | 6 +- lib/CodeGen/TailDuplication.cpp | 3 +- lib/CodeGen/TailDuplicator.cpp | 4 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 6 +- lib/CodeGen/WinEHPrepare.cpp | 2 +- lib/DebugInfo/CodeView/CMakeLists.txt | 1 + lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 20 +- lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp | 371 ++ lib/DebugInfo/CodeView/TypeSerializer.cpp | 234 +- lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 436 +-- .../CodeView/TypeTableCollection.cpp | 3 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 119 +- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 8 +- lib/DebugInfo/DWARF/DWARFDie.cpp | 13 +- lib/DebugInfo/DWARF/DWARFFormValue.cpp | 4 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 39 +- lib/DebugInfo/MSF/MappedBlockStream.cpp | 32 +- lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 25 +- .../PDB/Native/PDBTypeServerHandler.cpp | 19 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 4 + lib/Demangle/ItaniumDemangle.cpp | 58 +- .../RuntimeDyld/RuntimeDyldELF.cpp | 37 +- lib/Fuzzer/FuzzerUtilPosix.cpp | 17 +- lib/Fuzzer/test/fuzzer-segv.test | 2 + lib/IR/AttributeImpl.h | 47 +- lib/IR/Attributes.cpp | 394 +- lib/IR/BasicBlock.cpp | 16 +- lib/IR/DebugLoc.cpp | 2 +- lib/IR/Instructions.cpp | 6 + lib/IR/IntrinsicInst.cpp | 26 +- lib/IR/Module.cpp | 4 +- lib/IR/Verifier.cpp | 40 +- lib/LTO/LTO.cpp | 44 +- lib/LTO/LTOBackend.cpp | 21 +- lib/Linker/IRMover.cpp | 17 +- lib/MC/WasmObjectWriter.cpp | 32 +- lib/Object/COFFObjectFile.cpp | 4 + lib/Object/MachOObjectFile.cpp | 4 + lib/Object/WasmObjectFile.cpp | 4 + lib/Option/OptTable.cpp | 14 + lib/Passes/PassBuilder.cpp | 25 +- lib/ProfileData/InstrProf.cpp | 16 +- lib/Support/APInt.cpp | 4 +- lib/Support/BinaryStreamReader.cpp | 27 +- lib/Support/ConvertUTF.cpp | 31 + lib/Support/DebugCounter.cpp | 2 + lib/Support/DynamicLibrary.cpp | 11 +- lib/Support/GraphWriter.cpp | 1 + lib/Support/Host.cpp | 1 + lib/Support/Path.cpp | 1 + lib/Support/Triple.cpp | 17 +- lib/Support/YAMLParser.cpp | 4 + lib/TableGen/Record.cpp | 39 +- lib/Target/AArch64/AArch64AsmPrinter.cpp | 5 +- .../AArch64/AArch64ExpandPseudoInsts.cpp | 119 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 11 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 31 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 127 +- lib/Target/AArch64/AArch64InstrInfo.h | 2 +- lib/Target/AArch64/AArch64InstrInfo.td | 2 +- lib/Target/AArch64/AArch64MacroFusion.cpp | 13 + lib/Target/AArch64/AArch64SchedFalkor.td | 84 +- .../AArch64/AArch64SchedFalkorDetails.td | 1063 +++-- .../AArch64/AArch64SchedFalkorWriteRes.td | 403 -- lib/Target/AArch64/AArch64Subtarget.cpp | 1 - lib/Target/AArch64/AArch64TargetMachine.cpp | 12 +- lib/Target/AMDGPU/AMDGPU.td | 20 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 45 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 + lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 + lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 213 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 83 +- .../Disassembler/AMDGPUDisassembler.cpp | 93 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 5 + lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2 + 
lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 2 + lib/Target/AMDGPU/GCNRegPressure.cpp | 6 +- .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 12 + .../AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 46 + lib/Target/AMDGPU/R600ISelLowering.cpp | 8 + lib/Target/AMDGPU/R600ISelLowering.h | 2 + lib/Target/AMDGPU/R600RegisterInfo.td | 2 +- lib/Target/AMDGPU/SIDefines.h | 19 +- lib/Target/AMDGPU/SIISelLowering.cpp | 52 +- lib/Target/AMDGPU/SIISelLowering.h | 2 + lib/Target/AMDGPU/SIInstrInfo.cpp | 4 + lib/Target/AMDGPU/SIInstrInfo.td | 180 +- lib/Target/AMDGPU/SOPInstructions.td | 4 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 11 + lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 + lib/Target/AMDGPU/VOP1Instructions.td | 33 +- lib/Target/AMDGPU/VOP2Instructions.td | 70 +- lib/Target/AMDGPU/VOP3Instructions.td | 9 +- lib/Target/AMDGPU/VOPCInstructions.td | 37 + lib/Target/AMDGPU/VOPInstructions.td | 116 +- lib/Target/ARM/ARMCallLowering.cpp | 111 +- lib/Target/ARM/ARMCallLowering.h | 5 +- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 4 +- lib/Target/ARM/ARMISelLowering.cpp | 8 +- lib/Target/ARM/ARMISelLowering.h | 2 +- lib/Target/ARM/ARMInstrNEON.td | 496 +-- lib/Target/ARM/ARMSchedule.td | 11 + lib/Target/ARM/ARMScheduleA9.td | 9 + lib/Target/ARM/ARMScheduleR52.td | 103 +- lib/Target/ARM/ARMScheduleSwift.td | 10 + lib/Target/ARM/ARMTargetMachine.cpp | 63 +- lib/Target/ARM/ARMTargetMachine.h | 62 +- lib/Target/ARM/ARMTargetObjectFile.cpp | 4 +- lib/Target/ARM/Thumb1FrameLowering.cpp | 10 +- lib/Target/AVR/AVRInstrInfo.td | 1 - lib/Target/BPF/BPFISelLowering.cpp | 9 +- lib/Target/BPF/BPFISelLowering.h | 4 + lib/Target/Hexagon/HexagonFrameLowering.cpp | 2 +- lib/Target/Hexagon/HexagonInstrInfo.cpp | 4 +- lib/Target/Hexagon/HexagonPseudo.td | 39 +- lib/Target/Hexagon/HexagonRegisterInfo.cpp | 5 - lib/Target/Hexagon/HexagonRegisterInfo.h | 1 - lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 4 +- lib/Target/LLVMBuild.txt | 1 + lib/Target/MSP430/MSP430.td | 14 + lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 4 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 27 +- lib/Target/MSP430/MSP430InstrInfo.td | 5 + lib/Target/MSP430/MSP430RegisterInfo.cpp | 2 +- lib/Target/MSP430/MSP430Subtarget.cpp | 27 +- lib/Target/MSP430/MSP430Subtarget.h | 11 + lib/Target/Mips/MipsISelLowering.cpp | 34 +- lib/Target/Mips/MipsSubtarget.cpp | 7 +- lib/Target/Mips/MipsSubtarget.h | 7 +- lib/Target/Nios2/CMakeLists.txt | 18 + lib/Target/Nios2/LLVMBuild.txt | 61 + lib/Target/Nios2/MCTargetDesc/CMakeLists.txt | 2 + lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt | 25 + .../Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp | 25 + .../Nios2/MCTargetDesc/Nios2MCTargetDesc.h | 34 + lib/Target/Nios2/Nios2.h | 25 + lib/Target/Nios2/Nios2.td | 29 + lib/Target/Nios2/Nios2InstrFormats.td | 117 + lib/Target/Nios2/Nios2InstrInfo.td | 50 + lib/Target/Nios2/Nios2RegisterInfo.td | 60 + lib/Target/Nios2/Nios2TargetMachine.cpp | 46 + lib/Target/Nios2/Nios2TargetMachine.h | 30 + lib/Target/Nios2/TargetInfo/CMakeLists.txt | 1 + lib/Target/Nios2/TargetInfo/LLVMBuild.txt | 23 + .../Nios2/TargetInfo/Nios2TargetInfo.cpp | 24 + lib/Target/PowerPC/PPCExpandISEL.cpp | 2 +- lib/Target/PowerPC/PPCISelLowering.cpp | 92 +- lib/Target/PowerPC/PPCISelLowering.h | 6 +- lib/Target/PowerPC/PPCInstr64Bit.td | 4 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 2 + lib/Target/PowerPC/PPCInstrInfo.td | 2 +- lib/Target/PowerPC/PPCInstrVSX.td | 74 +- lib/Target/SystemZ/SystemZExpandPseudo.cpp | 2 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 30 +- .../SystemZ/SystemZTargetTransformInfo.h | 1 + 
lib/Target/X86/AsmParser/X86AsmParser.cpp | 5 +- lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.td | 3 + lib/Target/X86/X86FloatingPoint.cpp | 4 +- lib/Target/X86/X86ISelLowering.cpp | 8 + lib/Target/X86/X86InstrAVX512.td | 150 +- lib/Target/X86/X86InstrArithmetic.td | 24 +- lib/Target/X86/X86InstrFMA.td | 13 +- lib/Target/X86/X86InstrFormats.td | 10 + lib/Target/X86/X86InstrInfo.cpp | 3406 +---------------- lib/Target/X86/X86InstrInfo.td | 23 +- lib/Target/X86/X86InstrMMX.td | 5 +- lib/Target/X86/X86InstrSSE.td | 66 +- lib/Target/X86/X86InstrXOP.td | 8 +- lib/Target/X86/X86InstructionSelector.cpp | 20 + lib/Target/X86/X86LegalizerInfo.cpp | 17 + lib/Target/X86/X86LegalizerInfo.h | 1 + lib/Target/X86/X86Subtarget.cpp | 1 + lib/Target/X86/X86Subtarget.h | 4 + lib/Transforms/Coroutines/CoroCleanup.cpp | 1 + lib/Transforms/Coroutines/CoroEarly.cpp | 3 + lib/Transforms/Coroutines/CoroElide.cpp | 1 + lib/Transforms/Coroutines/CoroFrame.cpp | 33 +- lib/Transforms/Coroutines/CoroSplit.cpp | 86 +- lib/Transforms/IPO/PartialInlining.cpp | 10 +- lib/Transforms/IPO/PassManagerBuilder.cpp | 15 + .../InstCombine/InstCombineAddSub.cpp | 14 +- .../InstCombine/InstCombineAndOrXor.cpp | 12 +- .../InstCombine/InstCombineCalls.cpp | 8 +- .../InstCombine/InstCombineCasts.cpp | 16 +- .../InstCombine/InstCombineCompares.cpp | 48 +- .../InstCombine/InstCombineInternal.h | 6 + .../InstCombine/InstCombineMulDivRem.cpp | 8 +- .../InstCombine/InstCombineShifts.cpp | 3 +- .../InstCombineSimplifyDemanded.cpp | 92 +- .../InstCombine/InstructionCombining.cpp | 19 +- .../Instrumentation/PGOInstrumentation.cpp | 2 +- .../Instrumentation/SanitizerCoverage.cpp | 5 +- lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/ConstantHoisting.cpp | 6 +- lib/Transforms/Scalar/GVN.cpp | 164 +- lib/Transforms/Scalar/GVNSink.cpp | 872 +++++ lib/Transforms/Scalar/GuardWidening.cpp | 4 +- .../Scalar/InductiveRangeCheckElimination.cpp | 7 +- lib/Transforms/Scalar/JumpThreading.cpp | 42 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 33 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 7 +- lib/Transforms/Scalar/NewGVN.cpp | 70 +- lib/Transforms/Scalar/SCCP.cpp | 3 +- lib/Transforms/Scalar/SROA.cpp | 2 +- lib/Transforms/Scalar/Scalar.cpp | 1 + lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 76 +- lib/Transforms/Utils/CloneFunction.cpp | 2 +- lib/Transforms/Utils/FunctionComparator.cpp | 10 +- lib/Transforms/Utils/InlineFunction.cpp | 7 +- lib/Transforms/Utils/Local.cpp | 68 +- lib/Transforms/Utils/SimplifyCFG.cpp | 50 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 4 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 157 +- test/Analysis/CostModel/AArch64/falkor.ll | 26 - .../constant_functions_multi_dim.ll | 80 + test/Analysis/IVUsers/quadradic-exit-value.ll | 36 +- .../ScalarEvolution/different-loops-recs.ll | 64 +- .../AArch64/GlobalISel/arm64-fallback.ll | 2 +- .../GlobalISel/gisel-commandline-option.ll | 5 +- .../GlobalISel/localizer-in-O0-pipeline.mir | 96 + test/CodeGen/AArch64/GlobalISel/localizer.mir | 312 ++ test/CodeGen/AArch64/aarch64-stp-cluster.ll | 2 +- test/CodeGen/AArch64/arm64-csldst-mmo.ll | 2 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 7 + test/CodeGen/AArch64/arm64-ldp-cluster.ll | 4 +- .../AArch64/arm64-misched-basic-A53.ll | 4 +- .../AArch64/arm64-misched-basic-A57.ll | 2 +- .../AArch64/arm64-misched-forwarding-A53.ll | 2 +- .../AArch64/arm64-misched-memdep-bug.ll | 2 +- .../CodeGen/AArch64/arm64-misched-multimmo.ll | 2 +- test/CodeGen/AArch64/arm64-vabs.ll | 24 +- test/CodeGen/AArch64/arm64-vadd.ll | 
12 +- test/CodeGen/AArch64/arm64-vmul.ll | 24 +- test/CodeGen/AArch64/arm64-vshift.ll | 12 +- test/CodeGen/AArch64/arm64-vsub.ll | 24 +- test/CodeGen/AArch64/asm-print-comments.ll | 17 + test/CodeGen/AArch64/cmpxchg-O0.ll | 10 +- test/CodeGen/AArch64/fast-isel-cmpxchg.ll | 9 +- .../AArch64/live-interval-analysis.mir | 4 +- test/CodeGen/AArch64/misched-fusion-aes.ll | 145 +- test/CodeGen/AArch64/optimize-imm.ll | 19 + test/CodeGen/AArch64/scheduledag-constreg.mir | 2 +- .../CodeGen/AArch64/tailcall_misched_graph.ll | 2 +- .../AMDGPU/GlobalISel/legalize-constant.mir | 20 + test/CodeGen/AMDGPU/bfe-combine.ll | 41 + test/CodeGen/AMDGPU/extload-align.ll | 4 +- test/CodeGen/AMDGPU/kernel-args.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll | 15 + test/CodeGen/AMDGPU/load-constant-i16.ll | 2 +- test/CodeGen/AMDGPU/load-constant-i8.ll | 2 +- test/CodeGen/AMDGPU/load-global-i16.ll | 4 +- test/CodeGen/AMDGPU/load-global-i8.ll | 4 +- test/CodeGen/AMDGPU/load-local-i16.ll | 2 +- test/CodeGen/AMDGPU/load-local-i8.ll | 2 +- test/CodeGen/AMDGPU/min.ll | 2 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/r600.bitcast.ll | 2 +- test/CodeGen/AMDGPU/schedule-regpressure.mir | 2 +- test/CodeGen/AMDGPU/setcc.ll | 2 +- test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 40 + test/CodeGen/AMDGPU/shl.ll | 2 +- test/CodeGen/AMDGPU/sra.ll | 2 +- test/CodeGen/AMDGPU/store-global.ll | 4 +- test/CodeGen/AMDGPU/store-private.ll | 4 +- test/CodeGen/AMDGPU/sub.ll | 2 +- test/CodeGen/AMDGPU/unknown-processor.ll | 4 +- test/CodeGen/AMDGPU/vector-alloca.ll | 22 + .../ARM/GlobalISel/arm-irtranslator.ll | 354 ++ .../CodeGen/ARM/GlobalISel/arm-unsupported.ll | 12 +- test/CodeGen/ARM/arm-shrink-wrapping.ll | 7 +- test/CodeGen/ARM/constantpool-promote-dbg.ll | 2 +- test/CodeGen/ARM/constantpool-promote-ldrh.ll | 4 +- test/CodeGen/ARM/constantpool-promote.ll | 24 +- test/CodeGen/ARM/cortexr52-misched-basic.ll | 4 +- test/CodeGen/ARM/fastisel-thumb-litpool.ll | 1 + test/CodeGen/ARM/memcpy-inline.ll | 13 +- test/CodeGen/ARM/memset-inline.ll | 84 +- test/CodeGen/ARM/misched-copy-arm.ll | 2 +- test/CodeGen/ARM/misched-fp-basic.ll | 6 +- test/CodeGen/ARM/misched-int-basic-thumb2.mir | 6 +- test/CodeGen/ARM/misched-int-basic.mir | 6 +- test/CodeGen/ARM/single-issue-r52.mir | 4 +- test/CodeGen/ARM/vcombine.ll | 4 +- test/CodeGen/ARM/vext.ll | 8 +- test/CodeGen/Hexagon/post-ra-kill-update.mir | 2 +- .../Lanai/lanai-misched-trivial-disjoint.ll | 2 +- .../PR32721_ifcvt_triangle_unanalyzable.mir | 24 - test/CodeGen/MSP430/hwmult16.ll | 1 + test/CodeGen/MSP430/hwmult32.ll | 1 + test/CodeGen/MSP430/hwmultf5.ll | 1 + test/CodeGen/MSP430/vararg.ll | 2 +- test/CodeGen/Nios2/lit.local.cfg | 3 + test/CodeGen/Nios2/target_support.ll | 11 + test/CodeGen/PowerPC/atomics-constant.ll | 23 + test/CodeGen/PowerPC/build-vector-tests.ll | 216 +- test/CodeGen/PowerPC/livephysregs.mir | 52 + .../PowerPC/p8altivec-shuffles-pred.ll | 2 +- .../PowerPC/p9-xxinsertw-xxextractuw.ll | 72 +- test/CodeGen/PowerPC/ppc64-i128-abi.ll | 8 +- test/CodeGen/PowerPC/pr25157-peephole.ll | 2 +- test/CodeGen/PowerPC/pr27078.ll | 8 +- test/CodeGen/PowerPC/swaps-le-6.ll | 8 +- test/CodeGen/PowerPC/vec_sldwi.ll | 307 ++ test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll | 48 +- test/CodeGen/PowerPC/vsx-ldst.ll | 4 +- test/CodeGen/PowerPC/vsx-p9.ll | 72 +- test/CodeGen/PowerPC/vsx_insert_extract_le.ll | 4 +- test/CodeGen/PowerPC/vsx_shuffle_le.ll | 48 +- test/CodeGen/Thumb/machine-cse-physreg.mir | 35 + test/CodeGen/X86/2009-02-26-MachineLICMBug.ll | 2 +- 
test/CodeGen/X86/GlobalISel/memop-vec.ll | 113 +- .../X86/GlobalISel/regbankselect-AVX2.mir | 55 +- .../X86/GlobalISel/regbankselect-AVX512.mir | 87 +- .../X86/GlobalISel/select-leaf-constant.mir | 96 + .../X86/GlobalISel/select-memop-v256.mir | 188 + .../X86/GlobalISel/select-memop-v512.mir | 127 + test/CodeGen/X86/avx-vzeroupper.ll | 221 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 6 +- .../CodeGen/X86/avx512vpopcntdq-intrinsics.ll | 88 + test/CodeGen/X86/fast-isel-select-cmp.ll | 10 +- test/CodeGen/X86/fp-intrinsics.ll | 147 + test/CodeGen/X86/hoist-invariant-load.ll | 2 +- test/CodeGen/X86/misched-copy.ll | 2 +- test/CodeGen/X86/or-branch.ll | 66 +- test/CodeGen/X86/pr27681.mir | 2 +- test/CodeGen/X86/sandybridge-loads.ll | 42 +- .../CodeGen/X86/sse42-intrinsics-fast-isel.ll | 6 +- test/CodeGen/X86/stack-folding-fp-avx1.ll | 21 +- test/CodeGen/X86/twoaddr-coalesce-2.ll | 2 +- test/CodeGen/X86/vector-narrow-binop.ll | 63 +- test/CodeGen/X86/vector-popcnt-128.ll | 109 +- test/CodeGen/X86/vector-popcnt-256.ll | 42 + test/CodeGen/X86/vector-popcnt-512.ll | 53 + test/CodeGen/X86/vector-shuffle-512-v16.ll | 3 +- test/CodeGen/X86/vector-shuffle-avx512.ll | 34 +- test/CodeGen/X86/vector-sqrt.ll | 18 +- test/CodeGen/X86/vector-trunc-math.ll | 6 +- test/CodeGen/X86/vector-tzcnt-128.ll | 159 + test/CodeGen/X86/vector-tzcnt-256.ll | 379 +- test/CodeGen/X86/vector-tzcnt-512.ll | 153 + test/CodeGen/X86/wide-integer-cmp.ll | 1 - test/CodeGen/X86/widened-broadcast.ll | 73 +- test/CodeGen/X86/x86-interleaved-access.ll | 12 +- test/CodeGen/X86/x87.ll | 11 +- test/CodeGen/XCore/epilogue_prologue.ll | 24 +- test/DebugInfo/Generic/empty.ll | 9 +- test/DebugInfo/Generic/nodebug.ll | 13 +- test/DebugInfo/Generic/skeletoncu.ll | 4 +- test/DebugInfo/Inputs/split-dwarf-dwp.cpp | 12 + test/DebugInfo/Inputs/split-dwarf-dwp.o | Bin 0 -> 2744 bytes test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp | Bin 0 -> 1256 bytes test/DebugInfo/MIR/X86/empty-inline.mir | 122 + test/DebugInfo/PDB/Inputs/merge-ids-1.yaml | 36 + test/DebugInfo/PDB/Inputs/merge-ids-2.yaml | 31 + .../PDB/Inputs/merge-ids-and-types-1.yaml | 113 + .../PDB/Inputs/merge-ids-and-types-2.yaml | 143 + .../{merge1.yaml => merge-types-1.yaml} | 0 .../{merge2.yaml => merge-types-2.yaml} | 0 test/DebugInfo/PDB/Inputs/source-names-1.yaml | 8 + test/DebugInfo/PDB/Inputs/source-names-2.yaml | 8 + .../PDB/pdbdump-merge-ids-and-types.test | 65 + test/DebugInfo/PDB/pdbdump-mergeids.test | 31 + test/DebugInfo/PDB/pdbdump-mergetypes.test | 4 +- test/DebugInfo/PDB/pdbdump-objfilename.yaml | 14 + test/DebugInfo/PDB/pdbdump-source-names.test | 20 + test/DebugInfo/X86/array.ll | 150 +- test/DebugInfo/X86/dbg-value-frame-index.ll | 2 +- test/DebugInfo/X86/debug-loc-offset.ll | 8 +- test/DebugInfo/X86/debug-macro.ll | 72 +- test/DebugInfo/X86/empty.ll | 11 +- test/DebugInfo/X86/fission-hash.ll | 10 +- test/DebugInfo/X86/gnu-public-names-empty.ll | 9 +- test/DebugInfo/X86/gnu-public-names-gmlt.ll | 68 + .../X86/split-dwarf-multiple-cu-hash.ll | 42 + test/DebugInfo/X86/split-dwarf-omit-empty.ll | 54 + test/DebugInfo/dwo.ll | 4 +- test/DebugInfo/llvm-symbolizer.test | 8 + test/DebugInfo/omit-empty.ll | 12 + test/DebugInfo/skeletoncu.ll | 4 +- test/ExecutionEngine/MCJIT/lit.local.cfg | 3 +- test/ExecutionEngine/OrcMCJIT/lit.local.cfg | 3 +- test/ExecutionEngine/OrcMCJIT/pr32650.ll | 28 + test/Feature/fp-intrinsics.ll | 148 + .../SanitizerCoverage/chains.ll | 33 + .../SanitizerCoverage/postdominator_check.ll | 85 + test/LTO/Resolution/X86/linkonce.ll | 11 + 
test/LTO/Resolution/X86/type-checked-load.ll | 16 + test/Linker/Inputs/module-flags-pic-2-b.ll | 5 +- test/Linker/module-flags-pic-2-a.ll | 13 +- test/MC/AMDGPU/vop_sdwa.s | 441 ++- test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt | 477 +++ test/MC/Disassembler/X86/avx-512.txt | 23 + test/MC/WebAssembly/unnamed-data.ll | 53 + test/MC/X86/pr22004.s | 3 + test/MC/X86/x86-64-avx512vpopcntdq.s | 225 ++ test/Other/new-pm-defaults.ll | 1 + test/TableGen/GlobalISelEmitter.td | 26 + test/Transforms/Coroutines/coro-debug.ll | 142 + test/Transforms/Coroutines/coro-frame.ll | 11 +- .../Transforms/Coroutines/coro-materialize.ll | 52 + test/Transforms/EarlyCSE/const-speculation.ll | 39 + test/Transforms/GVN/PRE/phi-translate-2.ll | 105 + test/Transforms/GVN/PRE/pre-gep-load.ll | 2 +- test/Transforms/GVN/PRE/pre-load.ll | 6 +- test/Transforms/GVNSink/dither.ll | 42 + test/Transforms/GVNSink/indirect-call.ll | 70 + test/Transforms/GVNSink/sink-common-code.ll | 694 ++++ test/Transforms/GVNSink/struct.ll | 71 + .../GlobalDCE/externally_available.ll | 19 +- test/Transforms/Inline/prof-update-instr.ll | 57 + .../{prof-update.ll => prof-update-sample.ll} | 0 .../InstCombine/2008-07-10-ICmpBinOp.ll | 19 - .../InstCombine/2008-08-17-ICmpXorSignbit.ll | 87 - test/Transforms/InstCombine/alloca.ll | 7 +- .../InstCombine/bitcast-vec-canon.ll | 37 +- test/Transforms/InstCombine/bitcast.ll | 45 + test/Transforms/InstCombine/ctpop.ll | 27 +- .../InstCombine/icmp-xor-signbit.ll | 228 ++ test/Transforms/InstCombine/icmp.ll | 64 + test/Transforms/InstSimplify/call.ll | 253 +- test/Transforms/InstSimplify/or.ll | 41 +- test/Transforms/JumpThreading/assume.ll | 145 +- .../JumpThreading/fold-not-thread.ll | 4 +- test/Transforms/JumpThreading/guards.ll | 91 +- test/Transforms/LoopIdiom/pr33114.ll | 35 + .../X86/incorrect-offset-scaling.ll | 12 +- .../{ => X86}/lsr-expand-quadratic.ll | 26 +- .../LoopStrengthReduce/nonintegral.ll | 45 + .../LoopStrengthReduce/post-inc-icmpzero.ll | 4 +- .../AArch64/no_vector_instructions.ll | 26 + .../LoopVectorize/SystemZ/addressing.ll | 72 + .../X86/vectorization-remarks-missed.ll | 124 +- test/Transforms/NewGVN/pr32403.ll | 3 +- test/Transforms/NewGVN/pr32836.ll | 45 + .../SimpleLoopUnswitch/trivial-unswitch.ll | 61 + test/Verifier/fp-intrinsics.ll | 39 +- test/Verifier/module-flags-1.ll | 6 +- test/tools/gold/X86/relocation-model-pic.ll | 63 + test/tools/llvm-nm/X86/Inputs/example.lib | Bin 0 -> 2000 bytes test/tools/llvm-nm/X86/importlibrary.test | 7 + .../llvm-profdata/memop-size-prof.proftext | 2 +- tools/gold/gold-plugin.cpp | 6 +- tools/llvm-nm/llvm-nm.cpp | 17 +- .../PrettyClassLayoutGraphicalDumper.cpp | 3 +- tools/llvm-pdbdump/YAMLOutputStyle.cpp | 14 + tools/llvm-pdbdump/llvm-pdbdump.cpp | 37 +- tools/llvm-pdbdump/llvm-pdbdump.h | 1 + tools/llvm-profdata/llvm-profdata.cpp | 2 +- tools/llvm-readobj/COFFDumper.cpp | 4 +- unittests/Analysis/ScalarEvolutionTest.cpp | 88 +- unittests/DebugInfo/CodeView/CMakeLists.txt | 1 + unittests/DebugInfo/CodeView/ErrorChecking.h | 9 + .../CodeView/TypeIndexDiscoveryTest.cpp | 496 +++ .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 24 +- .../DebugInfo/PDB/MappedBlockStreamTest.cpp | 51 +- unittests/IR/BasicBlockTest.cpp | 75 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/DominatorTreeTest.cpp | 127 +- unittests/Support/CrashRecoveryTest.cpp | 4 - utils/TableGen/CMakeLists.txt | 1 + utils/TableGen/GlobalISelEmitter.cpp | 92 +- utils/TableGen/TableGen.cpp | 6 + utils/TableGen/TableGenBackends.h | 1 + utils/TableGen/X86FoldTablesEmitter.cpp | 720 ++++ 
utils/{abtest => }/abtest.py | 58 +- utils/abtest/mark_aarch64fns.py | 65 - utils/abtest/mark_armfns.py | 54 - utils/git-svn/git-llvm | 10 + utils/lit/lit/TestRunner.py | 2 +- utils/release/merge-request.sh | 2 +- 641 files changed, 20467 insertions(+), 10059 deletions(-) create mode 100644 docs/Benchmarking.rst create mode 100644 include/llvm/CodeGen/GlobalISel/Localizer.h create mode 100644 include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h create mode 100644 lib/CodeGen/AsmPrinter/DIEHashAttributes.def create mode 100644 lib/CodeGen/GlobalISel/Localizer.cpp create mode 100644 lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp delete mode 100644 lib/Target/AArch64/AArch64SchedFalkorWriteRes.td create mode 100644 lib/Target/Nios2/CMakeLists.txt create mode 100644 lib/Target/Nios2/LLVMBuild.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/CMakeLists.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp create mode 100644 lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h create mode 100644 lib/Target/Nios2/Nios2.h create mode 100644 lib/Target/Nios2/Nios2.td create mode 100644 lib/Target/Nios2/Nios2InstrFormats.td create mode 100644 lib/Target/Nios2/Nios2InstrInfo.td create mode 100644 lib/Target/Nios2/Nios2RegisterInfo.td create mode 100644 lib/Target/Nios2/Nios2TargetMachine.cpp create mode 100644 lib/Target/Nios2/Nios2TargetMachine.h create mode 100644 lib/Target/Nios2/TargetInfo/CMakeLists.txt create mode 100644 lib/Target/Nios2/TargetInfo/LLVMBuild.txt create mode 100644 lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp create mode 100644 lib/Transforms/Scalar/GVNSink.cpp delete mode 100644 test/Analysis/CostModel/AArch64/falkor.ll create mode 100644 test/Analysis/Delinearization/constant_functions_multi_dim.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/localizer.mir create mode 100644 test/CodeGen/AArch64/asm-print-comments.ll create mode 100644 test/CodeGen/AMDGPU/bfe-combine.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll create mode 100644 test/CodeGen/AMDGPU/shl-add-to-add-shl.ll delete mode 100644 test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir create mode 100644 test/CodeGen/Nios2/lit.local.cfg create mode 100644 test/CodeGen/Nios2/target_support.ll create mode 100644 test/CodeGen/PowerPC/atomics-constant.ll create mode 100644 test/CodeGen/PowerPC/livephysregs.mir create mode 100644 test/CodeGen/PowerPC/vec_sldwi.ll create mode 100644 test/CodeGen/Thumb/machine-cse-physreg.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-leaf-constant.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v256.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v512.mir create mode 100644 test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.cpp create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.o create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp create mode 100644 test/DebugInfo/MIR/X86/empty-inline.mir create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-2.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-and-types-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-and-types-2.yaml rename test/DebugInfo/PDB/Inputs/{merge1.yaml => merge-types-1.yaml} (100%) rename test/DebugInfo/PDB/Inputs/{merge2.yaml => 
merge-types-2.yaml} (100%) create mode 100644 test/DebugInfo/PDB/Inputs/source-names-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/source-names-2.yaml create mode 100644 test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test create mode 100644 test/DebugInfo/PDB/pdbdump-mergeids.test create mode 100644 test/DebugInfo/PDB/pdbdump-objfilename.yaml create mode 100644 test/DebugInfo/PDB/pdbdump-source-names.test create mode 100644 test/DebugInfo/X86/gnu-public-names-gmlt.ll create mode 100644 test/DebugInfo/X86/split-dwarf-multiple-cu-hash.ll create mode 100644 test/DebugInfo/X86/split-dwarf-omit-empty.ll create mode 100644 test/DebugInfo/omit-empty.ll create mode 100644 test/ExecutionEngine/OrcMCJIT/pr32650.ll create mode 100644 test/Instrumentation/SanitizerCoverage/chains.ll create mode 100644 test/Instrumentation/SanitizerCoverage/postdominator_check.ll create mode 100644 test/LTO/Resolution/X86/linkonce.ll create mode 100644 test/LTO/Resolution/X86/type-checked-load.ll create mode 100644 test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt create mode 100644 test/MC/WebAssembly/unnamed-data.ll create mode 100644 test/MC/X86/pr22004.s create mode 100644 test/MC/X86/x86-64-avx512vpopcntdq.s create mode 100644 test/Transforms/Coroutines/coro-debug.ll create mode 100644 test/Transforms/Coroutines/coro-materialize.ll create mode 100644 test/Transforms/EarlyCSE/const-speculation.ll create mode 100644 test/Transforms/GVN/PRE/phi-translate-2.ll create mode 100644 test/Transforms/GVNSink/dither.ll create mode 100644 test/Transforms/GVNSink/indirect-call.ll create mode 100644 test/Transforms/GVNSink/sink-common-code.ll create mode 100644 test/Transforms/GVNSink/struct.ll create mode 100644 test/Transforms/Inline/prof-update-instr.ll rename test/Transforms/Inline/{prof-update.ll => prof-update-sample.ll} (100%) delete mode 100644 test/Transforms/InstCombine/2008-07-10-ICmpBinOp.ll delete mode 100644 test/Transforms/InstCombine/2008-08-17-ICmpXorSignbit.ll create mode 100644 test/Transforms/InstCombine/icmp-xor-signbit.ll create mode 100644 test/Transforms/LoopIdiom/pr33114.ll rename test/Transforms/LoopStrengthReduce/{ => X86}/lsr-expand-quadratic.ll (51%) create mode 100644 test/Transforms/LoopStrengthReduce/nonintegral.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll create mode 100644 test/Transforms/LoopVectorize/SystemZ/addressing.ll create mode 100644 test/Transforms/NewGVN/pr32836.ll create mode 100644 test/tools/gold/X86/relocation-model-pic.ll create mode 100644 test/tools/llvm-nm/X86/Inputs/example.lib create mode 100644 test/tools/llvm-nm/X86/importlibrary.test create mode 100644 unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp create mode 100644 unittests/IR/BasicBlockTest.cpp create mode 100644 utils/TableGen/X86FoldTablesEmitter.cpp rename utils/{abtest => }/abtest.py (81%) delete mode 100755 utils/abtest/mark_aarch64fns.py delete mode 100755 utils/abtest/mark_armfns.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 78e2e0166257..a5b96569f9c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,13 @@ if (NOT PACKAGE_VERSION) "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}") endif() +if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL "")) + message(WARNING "Visual Studio generators use the x86 host compiler by " + "default, even for 64-bit targets. This can result in linker " + "instability and out of memory errors. 
To use the 64-bit " + "host compiler, pass -Thost=x64 on the CMake command line.") +endif() + project(LLVM ${cmake_3_0_PROJ_VERSION} ${cmake_3_0_LANGUAGES} diff --git a/docs/Benchmarking.rst b/docs/Benchmarking.rst new file mode 100644 index 000000000000..0f88db745a68 --- /dev/null +++ b/docs/Benchmarking.rst @@ -0,0 +1,87 @@ +================================== +Benchmarking tips +================================== + + +Introduction +============ + +For benchmarking a patch we want to reduce all possible sources of +noise as much as possible. How to do that is very OS dependent. + +Note that low noise is required, but not sufficient. It does not +exclude measurement bias. See +https://www.cis.upenn.edu/~cis501/papers/producing-wrong-data.pdf for +example. + +General +================================ + +* Use a high resolution timer, e.g. perf under linux. + +* Run the benchmark multiple times to be able to recognize noise. + +* Disable as many processes or services as possible on the target system. + +* Disable frequency scaling, turbo boost and address space + randomization (see OS specific section). + +* Static link if the OS supports it. That avoids any variation that + might be introduced by loading dynamic libraries. This can be done + by passing ``-DLLVM_BUILD_STATIC=ON`` to cmake. + +* Try to avoid storage. On some systems you can use tmpfs. Putting the + program, inputs and outputs on tmpfs avoids touching a real storage + system, which can have a pretty big variability. + + To mount it (on linux and freebsd at least):: + + mount -t tmpfs -o size=<XX>g none dir_to_mount + +Linux +===== + +* Disable address space randomization:: + + echo 0 > /proc/sys/kernel/randomize_va_space + +* Set scaling_governor to performance:: + + for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + do + echo performance > $i + done + +* Use https://github.com/lpechacek/cpuset to reserve cpus for just the + program you are benchmarking. If using perf, leave at least 2 cores + so that perf runs in one and your program in another:: + + cset shield -c N1,N2 -k on + + This will move all threads out of N1 and N2. The ``-k on`` means + that even kernel threads are moved out. + +* Disable the SMT pair of the cpus you will use for the benchmark. The + pair of cpu N can be found in + ``/sys/devices/system/cpu/cpuN/topology/thread_siblings_list`` and + disabled with:: + + echo 0 > /sys/devices/system/cpu/cpuX/online + + +* Run the program with:: + + cset shield --exec -- perf stat -r 10 <cmd> + + This will run the command after ``--`` in the isolated cpus. The + particular perf command runs the ``<cmd>`` 10 times and reports + statistics. + +With these in place you can expect perf variations of less than 0.1%. + +Linux Intel +----------- + +* Disable turbo mode:: + + echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst index 1e4676767939..50f7aa123c55 100644 --- a/docs/GettingStartedVS.rst +++ b/docs/GettingStartedVS.rst @@ -100,6 +100,10 @@ Here's the short story for getting up and running quickly with LLVM: * CMake generates project files for all build types. To select a specific build type, use the Configuration manager from the VS IDE or the ``/property:Configuration`` command line option when using MSBuild. + * By default, the Visual Studio project files generated by CMake use the + 32-bit toolset. 
If you are developing on a 64-bit version of Windows and + want to use the 64-bit toolset, pass the ``-Thost=x64`` flag when + generating the Visual Studio solution. This requires CMake 3.8.0 or later. 6. Start Visual Studio diff --git a/docs/LangRef.rst b/docs/LangRef.rst index b205cae9b118..2e339183ef11 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -4415,12 +4415,6 @@ The current supported vocabulary is limited: address space identifier. - ``DW_OP_stack_value`` marks a constant value. -DIExpression nodes that contain a ``DW_OP_stack_value`` operator are standalone -location descriptions that describe constant values. This form is used to -describe global constants that have been optimized away. All other expressions -are modifiers to another location: A debug intrinsic ties a location and a -DIExpression together. - DWARF specifies three kinds of simple location descriptions: Register, memory, and implicit location descriptions. Register and memory location descriptions describe the *location* of a source variable (in the sense that a debugger might @@ -12722,7 +12716,7 @@ Syntax: declare @llvm.experimental.constrained.fadd( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12759,7 +12753,7 @@ Syntax: declare @llvm.experimental.constrained.fsub( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12796,7 +12790,7 @@ Syntax: declare @llvm.experimental.constrained.fmul( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12833,7 +12827,7 @@ Syntax: declare @llvm.experimental.constrained.fdiv( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12870,7 +12864,7 @@ Syntax: declare @llvm.experimental.constrained.frem( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12899,6 +12893,461 @@ value operands and has the same type as the operands. The remainder has the same sign as the dividend. +Constrained libm-equivalent Intrinsics +-------------------------------------- + +In addition to the basic floating point operations for which constrained +intrinsics are described above, there are constrained versions of various +operations which provide equivalent behavior to a corresponding libm function. +These intrinsics allow the precise behavior of these operations with respect to +rounding mode and exception behavior to be controlled. + +As with the basic constrained floating point intrinsics, the rounding mode +and exception behavior arguments only control the behavior of the optimizer. +They do not change the runtime floating point environment. + + +'``llvm.experimental.constrained.sqrt``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sqrt( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sqrt``' intrinsic returns the square root +of the specified value, returning the same value as the libm '``sqrt``' +functions would, but without setting ``errno``. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the nonnegative square root of the specified value. +If the value is less than negative zero, a floating point exception occurs +and the the return value is architecture specific. 
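As an illustrative sketch only (the function name here is hypothetical; the metadata strings are those defined for the basic constrained intrinsics earlier in this section), a call to the f64 overload of the constrained ``sqrt`` intrinsic could look like this::

    ; Hypothetical caller: square root under dynamic rounding with strict
    ; floating point exception semantics.
    define double @sample_constrained_sqrt(double %x) {
    entry:
      %r = call double @llvm.experimental.constrained.sqrt.f64(
                           double %x,
                           metadata !"round.dynamic",
                           metadata !"fpexcept.strict")
      ret double %r
    }

    ; Overloaded intrinsics are declared with the mangled type suffix.
    declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)

As with the basic constrained intrinsics, the metadata arguments only constrain the optimizer; the actual rounding mode and exception state still come from the runtime floating point environment.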
+ + +'``llvm.experimental.constrained.pow``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.pow( , , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.pow``' intrinsic returns the first operand +raised to the (positive or negative) power specified by the second operand. + +Arguments: +"""""""""" + +The first two arguments and the return value are floating point numbers of the +same type. The second argument specifies the power to which the first argument +should be raised. + +The third and fourth arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the first value raised to the second power, +returning the same values as the libm ``pow`` functions would, and +handles error conditions in the same way. + + +'``llvm.experimental.constrained.powi``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.powi( , i32 , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand +raised to the (positive or negative) power specified by the second operand. The +order of evaluation of multiplications is not defined. When a vector of floating +point type is used, the second argument remains a scalar integer value. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. The second argument is a 32-bit signed integer specifying the power to +which the first argument should be raised. + +The third and fourth arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the first value raised to the second power with an +unspecified sequence of rounding operations. + + +'``llvm.experimental.constrained.sin``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sin( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sin``' intrinsic returns the sine of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the sine of the specified operand, returning the +same values as the libm ``sin`` functions would, and handles error +conditions in the same way. + + +'``llvm.experimental.constrained.cos``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.cos( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.cos``' intrinsic returns the cosine of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the cosine of the specified operand, returning the +same values as the libm ``cos`` functions would, and handles error +conditions in the same way. 
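The ``powi`` description above makes one point that is easy to miss: even when the value operand and result are vectors, the exponent stays a scalar ``i32``. A hedged sketch (hypothetical function name, metadata strings as defined for the basic constrained intrinsics) of the ``<4 x float>`` overload::

    ; Hypothetical caller: vector powi; the exponent is a scalar i32 even
    ; though the value operand and result are <4 x float>.
    define <4 x float> @sample_constrained_powi(<4 x float> %v) {
    entry:
      %r = call <4 x float> @llvm.experimental.constrained.powi.v4f32(
                                <4 x float> %v, i32 3,
                                metadata !"round.dynamic",
                                metadata !"fpexcept.strict")
      ret <4 x float> %r
    }

    declare <4 x float> @llvm.experimental.constrained.powi.v4f32(<4 x float>, i32, metadata, metadata)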
+ + +'``llvm.experimental.constrained.exp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.exp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.exp``' intrinsic computes the base-e +exponential of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``exp`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.exp2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.exp2( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.exp2``' intrinsic computes the base-2 +exponential of the specified value. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``exp2`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log``' intrinsic computes the base-e +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + + +Semantics: +"""""""""" + +This function returns the same values as the libm ``log`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log10``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log10( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log10``' intrinsic computes the base-10 +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``log10`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log2( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log2``' intrinsic computes the base-2 +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. 
+ +Semantics: +"""""""""" + +This function returns the same values as the libm ``log2`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.rint``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.rint( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.rint``' intrinsic returns the first +operand rounded to the nearest integer. It may raise an inexact floating point +exception if the operand is not an integer. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``rint`` functions +would, and handles error conditions in the same way. The rounding mode is +described, not determined, by the rounding mode argument. The actual rounding +mode is determined by the runtime floating point environment. The rounding +mode argument is only intended as information to the compiler. + + +'``llvm.experimental.constrained.nearbyint``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.nearbyint( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first +operand rounded to the nearest integer. It will not raise an inexact floating +point exception if the operand is not an integer. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``nearbyint`` functions +would, and handles error conditions in the same way. The rounding mode is +described, not determined, by the rounding mode argument. The actual rounding +mode is determined by the runtime floating point environment. The rounding +mode argument is only intended as information to the compiler. + + General Intrinsics ------------------ diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst index 65c19aa2bc0c..a909d458c317 100644 --- a/docs/Vectorizers.rst +++ b/docs/Vectorizers.rst @@ -99,7 +99,9 @@ Optimization remarks are enabled using: indicates if vectorization was specified. ``-Rpass-analysis=loop-vectorize`` identifies the statements that caused -vectorization to fail. +vectorization to fail. If in addition ``-fsave-optimization-record`` is +provided, multiple causes of vectorization failure may be listed (this behavior +might change in the future). Consider the following loop: diff --git a/docs/index.rst b/docs/index.rst index fe47eb1bcb7f..becbe48e7ec7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -90,6 +90,7 @@ representation. CodeOfConduct CompileCudaWithLLVM ReportingGuide + Benchmarking :doc:`GettingStarted` Discusses how to get up and running quickly with the LLVM infrastructure. 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h index a14fd1dc20ec..847662cc11be 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -44,7 +44,7 @@ private: IRCompileLayer CompileLayer; public: - typedef decltype(CompileLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(CompileLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h index 2039be4571a5..a5ac2f017b74 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -47,13 +47,13 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h index d22d41855072..7acb9c748880 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" @@ -49,8 +50,8 @@ 
private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -58,7 +59,7 @@ private: CompileOnDemandLayer CODLayer; public: - typedef decltype(CODLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(CODLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h index e0a78410f713..03e42230ae71 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,10 +17,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" @@ -76,8 +76,8 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -85,7 +85,7 @@ private: std::unique_ptr IndirectStubsMgr; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), @@ -106,7 +106,6 @@ public: TargetMachine &getTargetMachine() { return *TM; } ModuleHandle addModule(std::unique_ptr M) { - // Build our symbol resolver: // Lambda 1: Look back into the JIT itself to find symbols that are part of // the same "logical dylib". 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h index 70a896fe8f00..0ee9d094ab82 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,9 +20,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" -#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" @@ -73,7 +72,7 @@ namespace llvm { namespace orc { // Typedef the remote-client API. -typedef remote::OrcRemoteTargetClient MyRemote; +using MyRemote = remote::OrcRemoteTargetClient; class KaleidoscopeJIT { private: @@ -82,8 +81,8 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -92,7 +91,7 @@ private: MyRemote &Remote; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT(MyRemote &Remote) : TM(EngineBuilder().selectTarget(Triple(Remote.getTargetTriple()), "", @@ -124,7 +123,6 @@ public: TargetMachine &getTargetMachine() { return *TM; } ModuleHandle addModule(std::unique_ptr M) { - // Build our symbol resolver: // Lambda 1: Look back into the JIT itself to find symbols that are part of // the same "logical dylib". 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp index da6e8ac65234..e50a7ecf96bc 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp @@ -1,17 +1,19 @@ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Support/TargetSelect.h" +#include "../RemoteJITUtils.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h" #include "llvm/ExecutionEngine/Orc/OrcABISupport.h" - -#include "../RemoteJITUtils.h" - +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetSelect.h" +#include +#include #include -#include +#include #include #include - using namespace llvm; using namespace llvm::orc; @@ -22,7 +24,7 @@ cl::opt Port("port", ExitOnError ExitOnErr; -typedef int (*MainFun)(int, const char*[]); +using MainFun = int (*)(int, const char*[]); template NativePtrT MakeNative(uint64_t P) { @@ -36,7 +38,6 @@ void printExprResult(double Val) { // --- LAZY COMPILE TEST --- int main(int argc, char* argv[]) { - if (argc == 0) ExitOnErr.setBanner("jit_server: "); else @@ -59,14 +60,14 @@ int main(int argc, char* argv[]) { int sockfd = socket(PF_INET, SOCK_STREAM, 0); sockaddr_in servAddr, clientAddr; socklen_t clientAddrLen = sizeof(clientAddr); - bzero(&servAddr, sizeof(servAddr)); + memset(&servAddr, 0, sizeof(servAddr)); servAddr.sin_family = PF_INET; servAddr.sin_family = INADDR_ANY; servAddr.sin_port = htons(Port); { // avoid "Address already in use" error. - int yes=1; + int yes = 1; if (setsockopt(sockfd,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) { errs() << "Error calling setsockopt.\n"; return 1; @@ -98,7 +99,8 @@ int main(int argc, char* argv[]) { }; FDRPCChannel TCPChannel(newsockfd, newsockfd); - typedef remote::OrcRemoteTargetServer MyServerT; + + using MyServerT = remote::OrcRemoteTargetServer; MyServerT Server(TCPChannel, SymbolLookup, RegisterEHFrames, DeregisterEHFrames); diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h index 1dca39deba3c..9a682f7ab744 100644 --- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,7 +19,6 @@ #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -40,9 +39,9 @@ namespace orc { class KaleidoscopeJIT { public: - typedef RTDyldObjectLinkingLayer<> ObjLayerT; - typedef IRCompileLayer CompileLayerT; - typedef CompileLayerT::ModuleSetHandleT ModuleHandleT; + using ObjLayerT = RTDyldObjectLinkingLayer<>; + using CompileLayerT = IRCompileLayer; + using ModuleHandleT = CompileLayerT::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/include/llvm/ADT/Triple.h 
b/include/llvm/ADT/Triple.h index 3a4a37017d61..07626982d289 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -59,6 +59,7 @@ public: mips64, // MIPS64: mips64 mips64el, // MIPS64EL: mips64el msp430, // MSP430: msp430 + nios2, // NIOSII: nios2 ppc, // PPC: powerpc ppc64, // PPC64: powerpc64, ppu ppc64le, // PPC64LE: powerpc64le diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h index bf73e099a2bf..ca48b5483512 100644 --- a/include/llvm/Analysis/InstructionSimplify.h +++ b/include/llvm/Analysis/InstructionSimplify.h @@ -70,174 +70,173 @@ struct SimplifyQuery { Copy.CxtI = I; return Copy; } - }; +}; - // NOTE: the explicit multiple argument versions of these functions are - // deprecated. - // Please use the SimplifyQuery versions in new code. +// NOTE: the explicit multiple argument versions of these functions are +// deprecated. +// Please use the SimplifyQuery versions in new code. - /// Given operands for an Add, fold the result or return null. - Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, +/// Given operands for an Add, fold the result or return null. +Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); - /// Given operands for a Sub, fold the result or return null. - Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, - const SimplifyQuery &Q); +/// Given operands for a Sub, fold the result or return null. +Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, + const SimplifyQuery &Q); - /// Given operands for an FAdd, fold the result or return null. - Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, +/// Given operands for an FAdd, fold the result or return null. +Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an FSub, fold the result or return null. +Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an FMul, fold the result or return null. +Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for a Mul, fold the result or return null. +Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an SDiv, fold the result or return null. +Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for a UDiv, fold the result or return null. +Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an FDiv, fold the result or return null. +Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an SRem, fold the result or return null. +Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for a URem, fold the result or return null. +Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an FRem, fold the result or return null. +Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for a Shl, fold the result or return null. +Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Q); + +/// Given operands for a LShr, fold the result or return null. 
+Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + +/// Given operands for a AShr, fold the result or return nulll. +Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + +/// Given operands for an And, fold the result or return null. +Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an Or, fold the result or return null. +Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an Xor, fold the result or return null. +Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an ICmpInst, fold the result or return null. +Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q); + +/// Given operands for an FCmpInst, fold the result or return null. +Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); + +/// Given operands for a SelectInst, fold the result or return null. +Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q); - /// Given operands for an FSub, fold the result or return null. - Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); +/// Given operands for a GetElementPtrInst, fold the result or return null. +Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, + const SimplifyQuery &Q); - /// Given operands for an FMul, fold the result or return null. - Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); +/// Given operands for an InsertValueInst, fold the result or return null. +Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, + const SimplifyQuery &Q); - /// Given operands for a Mul, fold the result or return null. - Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +/// Given operands for an ExtractValueInst, fold the result or return null. +Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, + const SimplifyQuery &Q); - /// Given operands for an SDiv, fold the result or return null. - Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for a UDiv, fold the result or return null. - Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an FDiv, fold the result or return null. - Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); - - /// Given operands for an SRem, fold the result or return null. - Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for a URem, fold the result or return null. - Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an FRem, fold the result or return null. - Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); - - /// Given operands for a Shl, fold the result or return null. - Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const SimplifyQuery &Q); - - /// Given operands for a LShr, fold the result or return null. - Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, - const SimplifyQuery &Q); - - /// Given operands for a AShr, fold the result or return nulll. 
- Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, - const SimplifyQuery &Q); - - /// Given operands for an And, fold the result or return null. - Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an Or, fold the result or return null. - Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an Xor, fold the result or return null. - Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an ICmpInst, fold the result or return null. - Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const SimplifyQuery &Q); - - /// Given operands for an FCmpInst, fold the result or return null. - Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q); - - /// Given operands for a SelectInst, fold the result or return null. - Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, - const SimplifyQuery &Q); - - /// Given operands for a GetElementPtrInst, fold the result or return null. - Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, - const SimplifyQuery &Q); - - /// Given operands for an InsertValueInst, fold the result or return null. - Value *SimplifyInsertValueInst(Value *Agg, Value *Val, - ArrayRef Idxs, - const SimplifyQuery &Q); - - /// Given operands for an ExtractValueInst, fold the result or return null. - Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, +/// Given operands for an ExtractElementInst, fold the result or return null. +Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q); - /// Given operands for an ExtractElementInst, fold the result or return null. - Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, - const SimplifyQuery &Q); +/// Given operands for a CastInst, fold the result or return null. +Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, + const SimplifyQuery &Q); - /// Given operands for a CastInst, fold the result or return null. - Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, - const SimplifyQuery &Q); +/// Given operands for a ShuffleVectorInst, fold the result or return null. +Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, + Type *RetTy, const SimplifyQuery &Q); - /// Given operands for a ShuffleVectorInst, fold the result or return null. - Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, - Type *RetTy, const SimplifyQuery &Q); +//=== Helper functions for higher up the class hierarchy. - //=== Helper functions for higher up the class hierarchy. - - - /// Given operands for a CmpInst, fold the result or return null. - Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const SimplifyQuery &Q); - - /// Given operands for a BinaryOperator, fold the result or return null. - Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +/// Given operands for a CmpInst, fold the result or return null. +Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); - /// Given operands for an FP BinaryOperator, fold the result or return null. - /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the - /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. 
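The block above replaces the old multi-argument Simplify* entry points with overloads that take a single SimplifyQuery. A hypothetical helper showing the intended calling pattern under the new signatures (the helper name is made up; inside a pass the query would normally come from getBestSimplifyQuery()):

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Try to fold an add with the query-based API; only materialize an
    // instruction when SimplifyAddInst cannot fold it.
    static Value *foldOrCreateAdd(IRBuilder<> &B, Value *L, Value *R,
                                  const SimplifyQuery &Q) {
      if (Value *V = SimplifyAddInst(L, R, /*isNSW=*/false, /*isNUW=*/false, Q))
        return V;                 // folded to an existing value or constant
      return B.CreateAdd(L, R);   // otherwise emit the instruction
    }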
- Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q); +/// Given operands for a BinaryOperator, fold the result or return null. +Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + const SimplifyQuery &Q); - /// Given a function and iterators over arguments, fold the result or return - /// null. - Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, - User::op_iterator ArgEnd, const SimplifyQuery &Q); +/// Given operands for an FP BinaryOperator, fold the result or return null. +/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the +/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. +Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); - /// Given a function and set of arguments, fold the result or return null. - Value *SimplifyCall(Value *V, ArrayRef Args, const SimplifyQuery &Q); +/// Given a function and iterators over arguments, fold the result or return +/// null. +Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, + User::op_iterator ArgEnd, const SimplifyQuery &Q); - /// See if we can compute a simplified version of this instruction. If not, - /// return null. - Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, - OptimizationRemarkEmitter *ORE = nullptr); +/// Given a function and set of arguments, fold the result or return null. +Value *SimplifyCall(Value *V, ArrayRef Args, const SimplifyQuery &Q); - /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. - /// - /// This first performs a normal RAUW of I with SimpleV. It then recursively - /// attempts to simplify those users updated by the operation. The 'I' - /// instruction must not be equal to the simplified value 'SimpleV'. - /// - /// The function returns true if any simplifications were performed. - bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); +/// See if we can compute a simplified version of this instruction. If not, +/// return null. +Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, + OptimizationRemarkEmitter *ORE = nullptr); - /// Recursively attempt to simplify an instruction. - /// - /// This routine uses SimplifyInstruction to simplify 'I', and if successful - /// replaces uses of 'I' with the simplified value. It then recurses on each - /// of the users impacted. It returns true if any simplifications were - /// performed. - bool recursivelySimplifyInstruction(Instruction *I, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); - // These helper functions return a SimplifyQuery structure that contains as - // many of the optional analysis we use as are currently valid. This is the - // strongly preferred way of constructing SimplifyQuery in passes. - const SimplifyQuery getBestSimplifyQuery(Pass &, Function &); - template - const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, - Function &); - const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &, - const DataLayout &); +/// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. +/// +/// This first performs a normal RAUW of I with SimpleV. It then recursively +/// attempts to simplify those users updated by the operation. 
The 'I' +/// instruction must not be equal to the simplified value 'SimpleV'. +/// +/// The function returns true if any simplifications were performed. +bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, + AssumptionCache *AC = nullptr); + +/// Recursively attempt to simplify an instruction. +/// +/// This routine uses SimplifyInstruction to simplify 'I', and if successful +/// replaces uses of 'I' with the simplified value. It then recurses on each +/// of the users impacted. It returns true if any simplifications were +/// performed. +bool recursivelySimplifyInstruction(Instruction *I, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, + AssumptionCache *AC = nullptr); + +// These helper functions return a SimplifyQuery structure that contains as +// many of the optional analysis we use as are currently valid. This is the +// strongly preferred way of constructing SimplifyQuery in passes. +const SimplifyQuery getBestSimplifyQuery(Pass &, Function &); +template +const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, + Function &); +const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &, + const DataLayout &); } // end namespace llvm #endif diff --git a/include/llvm/Analysis/LoopPass.h b/include/llvm/Analysis/LoopPass.h index 496ae189e57b..75e7688bbdc2 100644 --- a/include/llvm/Analysis/LoopPass.h +++ b/include/llvm/Analysis/LoopPass.h @@ -126,9 +126,8 @@ public: } public: - // Add a new loop into the loop queue as a child of the given parent, or at - // the top level if \c ParentLoop is null. - Loop &addLoop(Loop *ParentLoop); + // Add a new loop into the loop queue. + void addLoop(Loop &L); //===--------------------------------------------------------------------===// /// SimpleAnalysis - Provides simple interface to update analysis info diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index ac54bd4cfffb..4a6fc245c225 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -1533,6 +1533,12 @@ public: /// specified loop. bool isLoopInvariant(const SCEV *S, const Loop *L); + /// Determine if the SCEV can be evaluated at loop's entry. It is true if it + /// doesn't depend on a SCEVUnknown of an instruction which is dominated by + /// the header of loop L. + bool isAvailableAtLoopEntry(const SCEV *S, const Loop *L, DominatorTree &DT, + LoopInfo &LI); + /// Return true if the given SCEV changes value in a known way in the /// specified loop. This property being true implies that the value is /// variant in the loop AND that we can emit an expression to compute the diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 0a0af384c3e6..6cbe3a1f515e 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -396,6 +396,9 @@ public: bool isLegalMaskedScatter(Type *DataType) const; bool isLegalMaskedGather(Type *DataType) const; + /// Return true if target doesn't mind addresses in vectors. + bool prefersVectorizedAddressing() const; + /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. 
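The prefersVectorizedAddressing() query added above is threaded through TargetTransformInfo in the usual way: a pure-virtual hook in the internal Concept and a forwarding override in the Model follow in the next hunk, with a default of true in TargetTransformInfoImpl.h. A stand-alone miniature of that type-erasure shape, with invented names:

    #include <memory>
    #include <utility>

    struct Concept {
      virtual ~Concept() = default;
      virtual bool prefersVectorizedAddressing() = 0;
    };

    template <typename ImplT> struct Model final : Concept {
      ImplT Impl;
      explicit Model(ImplT I) : Impl(std::move(I)) {}
      bool prefersVectorizedAddressing() override {
        return Impl.prefersVectorizedAddressing();
      }
    };

    // Mirrors the default added to TargetTransformInfoImpl.h.
    struct DefaultImpl {
      bool prefersVectorizedAddressing() { return true; }
    };

    class MiniTTI {
      std::unique_ptr<Concept> C;
    public:
      template <typename ImplT>
      explicit MiniTTI(ImplT Impl) : C(new Model<ImplT>(std::move(Impl))) {}
      bool prefersVectorizedAddressing() const {
        return C->prefersVectorizedAddressing();
      }
    };

    int main() {
      return MiniTTI(DefaultImpl()).prefersVectorizedAddressing() ? 0 : 1;
    }

A target that wants addresses kept scalar supplies an implementation whose hook returns false; every other target inherits the default.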
@@ -807,6 +810,7 @@ public: virtual bool isLegalMaskedLoad(Type *DataType) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; virtual bool isLegalMaskedGather(Type *DataType) = 0; + virtual bool prefersVectorizedAddressing() = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) = 0; @@ -1000,6 +1004,9 @@ public: bool isLegalMaskedGather(Type *DataType) override { return Impl.isLegalMaskedGather(DataType); } + bool prefersVectorizedAddressing() override { + return Impl.prefersVectorizedAddressing(); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 550e84ad90c4..ad1a7cb748fe 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -237,6 +237,8 @@ public: bool isLegalMaskedGather(Type *DataType) { return false; } + bool prefersVectorizedAddressing() { return true; } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { // Guess that all legal addressing mode are free. diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index cf24062e46f8..b1ee76159c4b 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -60,7 +60,8 @@ template class ArrayRef; KnownBits computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, - const DominatorTree *DT = nullptr); + const DominatorTree *DT = nullptr, + OptimizationRemarkEmitter *ORE = nullptr); /// Compute known bits from the range metadata. /// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index 180c0b579248..c898667f1474 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -34,6 +34,7 @@ namespace llvm { class AsmPrinterHandler; +class BasicBlock; class BlockAddress; class Constant; class ConstantArray; @@ -43,6 +44,7 @@ class DIEAbbrev; class DwarfDebug; class GCMetadataPrinter; class GlobalIndirectSymbol; +class GlobalObject; class GlobalValue; class GlobalVariable; class GCStrategy; @@ -65,6 +67,8 @@ class MCSubtargetInfo; class MCSymbol; class MCTargetOptions; class MDNode; +class Module; +class raw_ostream; class TargetLoweringObjectFile; class TargetMachine; @@ -109,7 +113,7 @@ public: /// Map global GOT equivalent MCSymbols to GlobalVariables and keep track of /// its number of uses by other globals. 
- typedef std::pair GOTEquivUsePair; + using GOTEquivUsePair = std::pair; MapVector GlobalGOTEquivs; /// Enable print [latency:throughput] in output diff --git a/include/llvm/CodeGen/AtomicExpandUtils.h b/include/llvm/CodeGen/AtomicExpandUtils.h index ac18eac8a1ce..1f9c96b18e1b 100644 --- a/include/llvm/CodeGen/AtomicExpandUtils.h +++ b/include/llvm/CodeGen/AtomicExpandUtils.h @@ -1,4 +1,4 @@ -//===-- AtomicExpandUtils.h - Utilities for expanding atomic instructions -===// +//===- AtomicExpandUtils.h - Utilities for expanding atomic instructions --===// // // The LLVM Compiler Infrastructure // @@ -7,19 +7,24 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_ATOMICEXPANDUTILS_H +#define LLVM_CODEGEN_ATOMICEXPANDUTILS_H + #include "llvm/ADT/STLExtras.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Support/AtomicOrdering.h" namespace llvm { -class Value; -class AtomicRMWInst; +class AtomicRMWInst; +class Value; /// Parameters (see the expansion example below): /// (the builder, %addr, %loaded, %new_val, ordering, /// /* OUT */ %success, /* OUT */ %new_loaded) -typedef function_ref &, Value *, Value *, Value *, - AtomicOrdering, Value *&, Value *&)> CreateCmpXchgInstFun; +using CreateCmpXchgInstFun = + function_ref &, Value *, Value *, Value *, AtomicOrdering, + Value *&, Value *&)>; /// \brief Expand an atomic RMW instruction into a loop utilizing /// cmpxchg. You'll want to make sure your target machine likes cmpxchg @@ -42,7 +47,8 @@ typedef function_ref &, Value *, Value *, Value *, /// loop: /// %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] /// %new = some_op iN %loaded, %incr -/// ; This is what -atomic-expand will produce using this function on i686 targets: +/// ; This is what -atomic-expand will produce using this function on i686 +/// targets: /// %pair = cmpxchg iN* %addr, iN %loaded, iN %new_val /// %new_loaded = extractvalue { iN, i1 } %pair, 0 /// %success = extractvalue { iN, i1 } %pair, 1 @@ -52,6 +58,8 @@ typedef function_ref &, Value *, Value *, Value *, /// [...] /// /// Returns true if the containing function was modified. -bool -expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun Factory); -} +bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun Factory); + +} // end namespace llvm + +#endif // LLVM_CODEGEN_ATOMICEXPANDUTILS_H diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index a40147336fe2..4be44e62fa92 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -1,4 +1,4 @@ -//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===// +//===- lib/CodeGen/DIE.h - DWARF Info Entries -------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -31,6 +31,7 @@ #include #include #include +#include #include namespace llvm { @@ -53,11 +54,11 @@ class DIEAbbrevData { dwarf::Form Form; /// Dwarf attribute value for DW_FORM_implicit_const - int64_t Value; + int64_t Value = 0; public: DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) - : Attribute(A), Form(F), Value(0) {} + : Attribute(A), Form(F) {} DIEAbbrevData(dwarf::Attribute A, int64_t V) : Attribute(A), Form(dwarf::DW_FORM_implicit_const), Value(V) {} @@ -136,13 +137,14 @@ class DIEAbbrevSet { /// storage container. BumpPtrAllocator &Alloc; /// \brief FoldingSet that uniques the abbreviations. - llvm::FoldingSet AbbreviationsSet; + FoldingSet AbbreviationsSet; /// A list of all the unique abbreviations in use. 
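The AtomicExpandUtils.h hunk above cleans up the CreateCmpXchgInstFun callback type consumed by expandAtomicRMWToCmpXchg(). A sketch of such a callback, roughly the shape used by the in-tree AtomicExpand pass (the function name here is illustrative): it emits the cmpxchg and hands the loaded value and the success flag back to the expansion loop shown in the comment.

    #include "llvm/CodeGen/AtomicExpandUtils.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void createCmpXchgSketch(IRBuilder<> &Builder, Value *Addr,
                                    Value *Loaded, Value *NewVal,
                                    AtomicOrdering MemOpOrder, Value *&Success,
                                    Value *&NewLoaded) {
      Value *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
      Success = Builder.CreateExtractValue(Pair, 1, "success");
    }

A target pass would then call expandAtomicRMWToCmpXchg(AI, createCmpXchgSketch) for each atomicrmw it wants lowered this way.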
std::vector Abbreviations; public: DIEAbbrevSet(BumpPtrAllocator &A) : Alloc(A) {} ~DIEAbbrevSet(); + /// Generate the abbreviation declaration for a DIE and return a pointer to /// the generated abbreviation. /// @@ -289,13 +291,11 @@ public: /// A pointer to another debug information entry. An instance of this class can /// also be used as a proxy for a debug information entry not yet defined /// (ie. types.) -class DIE; class DIEEntry { DIE *Entry; - DIEEntry() = delete; - public: + DIEEntry() = delete; explicit DIEEntry(DIE &E) : Entry(&E) {} DIE &getEntry() const { return *Entry; } @@ -348,10 +348,10 @@ private: /// /// All values that aren't standard layout (or are larger than 8 bytes) /// should be stored by reference instead of by value. - typedef AlignedCharArrayUnion - ValTy; + using ValTy = AlignedCharArrayUnion; + static_assert(sizeof(ValTy) <= sizeof(uint64_t) || sizeof(ValTy) <= sizeof(void *), "Expected all large types to be stored via pointer"); @@ -486,10 +486,12 @@ struct IntrusiveBackListNode { }; struct IntrusiveBackListBase { - typedef IntrusiveBackListNode Node; + using Node = IntrusiveBackListNode; + Node *Last = nullptr; bool empty() const { return !Last; } + void push_back(Node &N) { assert(N.Next.getPointer() == &N && "Expected unlinked node"); assert(N.Next.getInt() == true && "Expected unlinked node"); @@ -505,6 +507,7 @@ struct IntrusiveBackListBase { template class IntrusiveBackList : IntrusiveBackListBase { public: using IntrusiveBackListBase::empty; + void push_back(T &N) { IntrusiveBackListBase::push_back(N); } T &back() { return *static_cast(Last); } const T &back() const { return *static_cast(Last); } @@ -513,6 +516,7 @@ public: class iterator : public iterator_facade_base { friend class const_iterator; + Node *N = nullptr; public: @@ -585,10 +589,12 @@ public: class DIEValueList { struct Node : IntrusiveBackListNode { DIEValue V; + explicit Node(DIEValue V) : V(V) {} }; - typedef IntrusiveBackList ListTy; + using ListTy = IntrusiveBackList; + ListTy List; public: @@ -597,9 +603,10 @@ public: : public iterator_adaptor_base { friend class const_value_iterator; - typedef iterator_adaptor_base iterator_adaptor; + + using iterator_adaptor = + iterator_adaptor_base; public: value_iterator() = default; @@ -612,9 +619,9 @@ public: class const_value_iterator : public iterator_adaptor_base< const_value_iterator, ListTy::const_iterator, std::forward_iterator_tag, const DIEValue> { - typedef iterator_adaptor_base iterator_adaptor; + using iterator_adaptor = + iterator_adaptor_base; public: const_value_iterator() = default; @@ -627,8 +634,8 @@ public: const DIEValue &operator*() const { return wrapped()->V; } }; - typedef iterator_range value_range; - typedef iterator_range const_value_range; + using value_range = iterator_range; + using const_value_range = iterator_range; value_iterator addValue(BumpPtrAllocator &Alloc, const DIEValue &V) { List.push_back(*new (Alloc) Node(V)); @@ -657,15 +664,15 @@ class DIE : IntrusiveBackListNode, public DIEValueList { friend class DIEUnit; /// Dwarf unit relative offset. - unsigned Offset; + unsigned Offset = 0; /// Size of instance + children. - unsigned Size; + unsigned Size = 0; unsigned AbbrevNumber = ~0u; /// Dwarf tag code. dwarf::Tag Tag = (dwarf::Tag)0; /// Set to true to force a DIE to emit an abbreviation that says it has /// children even when it doesn't. This is used for unit testing purposes. - bool ForceChildren; + bool ForceChildren = false; /// Children DIEs. 
IntrusiveBackList Children; @@ -673,20 +680,19 @@ class DIE : IntrusiveBackListNode, public DIEValueList { /// DIEUnit which contains this DIE as its unit DIE. PointerUnion Owner; - DIE() = delete; - explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag), - ForceChildren(false) {} + explicit DIE(dwarf::Tag Tag) : Tag(Tag) {} public: + DIE() = delete; + DIE(const DIE &RHS) = delete; + DIE(DIE &&RHS) = delete; + DIE &operator=(const DIE &RHS) = delete; + DIE &operator=(const DIE &&RHS) = delete; + static DIE *get(BumpPtrAllocator &Alloc, dwarf::Tag Tag) { return new (Alloc) DIE(Tag); } - DIE(const DIE &RHS) = delete; - DIE(DIE &&RHS) = delete; - void operator=(const DIE &RHS) = delete; - void operator=(const DIE &&RHS) = delete; - // Accessors. unsigned getAbbrevNumber() const { return AbbrevNumber; } dwarf::Tag getTag() const { return Tag; } @@ -696,10 +702,10 @@ public: bool hasChildren() const { return ForceChildren || !Children.empty(); } void setForceChildren(bool B) { ForceChildren = B; } - typedef IntrusiveBackList::iterator child_iterator; - typedef IntrusiveBackList::const_iterator const_child_iterator; - typedef iterator_range child_range; - typedef iterator_range const_child_range; + using child_iterator = IntrusiveBackList::iterator; + using const_child_iterator = IntrusiveBackList::const_iterator; + using child_range = iterator_range; + using const_child_range = iterator_range; child_range children() { return make_range(Children.begin(), Children.end()); @@ -838,10 +844,10 @@ struct BasicDIEUnit final : DIEUnit { /// DIELoc - Represents an expression location. // class DIELoc : public DIEValueList { - mutable unsigned Size; // Size in bytes excluding size header. + mutable unsigned Size = 0; // Size in bytes excluding size header. public: - DIELoc() : Size(0) {} + DIELoc() = default; /// ComputeSize - Calculate the size of the location expression. /// @@ -872,10 +878,10 @@ public: /// DIEBlock - Represents a block of values. // class DIEBlock : public DIEValueList { - mutable unsigned Size; // Size in bytes excluding size header. + mutable unsigned Size = 0; // Size in bytes excluding size header. public: - DIEBlock() : Size(0) {} + DIEBlock() = default; /// ComputeSize - Calculate the size of the location expression. /// diff --git a/include/llvm/CodeGen/FaultMaps.h b/include/llvm/CodeGen/FaultMaps.h index 0f0005b83c54..98ff526dfe94 100644 --- a/include/llvm/CodeGen/FaultMaps.h +++ b/include/llvm/CodeGen/FaultMaps.h @@ -56,7 +56,7 @@ private: HandlerOffsetExpr(HandlerOffset) {} }; - typedef std::vector FunctionFaultInfos; + using FunctionFaultInfos = std::vector; // We'd like to keep a stable iteration order for FunctionInfos to help // FileCheck based testing. @@ -78,20 +78,17 @@ private: /// generated by the version of LLVM that includes it. No guarantees are made /// with respect to forward or backward compatibility. 
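Most of the DIE.h churn above is mechanical C++11 cleanup: default member initializers replace constructor-initializer boilerplate, trivial constructors become = default, and the deleted special members are grouped in the public section. In miniature, with names invented for the sketch:

    struct Entry {
      unsigned Offset = 0;          // was ": Offset(0)" in every constructor
      unsigned Size = 0;
      bool ForceChildren = false;
      int Tag;

      Entry() = delete;             // must be constructed with a tag
      explicit Entry(int T) : Tag(T) {}
      Entry(const Entry &) = delete;
      Entry &operator=(const Entry &) = delete;
    };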
class FaultMapParser { - typedef uint8_t FaultMapVersionType; - static const size_t FaultMapVersionOffset = 0; + using FaultMapVersionType = uint8_t; + using Reserved0Type = uint8_t; + using Reserved1Type = uint16_t; + using NumFunctionsType = uint32_t; - typedef uint8_t Reserved0Type; + static const size_t FaultMapVersionOffset = 0; static const size_t Reserved0Offset = FaultMapVersionOffset + sizeof(FaultMapVersionType); - - typedef uint16_t Reserved1Type; static const size_t Reserved1Offset = Reserved0Offset + sizeof(Reserved0Type); - - typedef uint32_t NumFunctionsType; static const size_t NumFunctionsOffset = Reserved1Offset + sizeof(Reserved1Type); - static const size_t FunctionInfosOffset = NumFunctionsOffset + sizeof(NumFunctionsType); @@ -105,14 +102,13 @@ class FaultMapParser { public: class FunctionFaultInfoAccessor { - typedef uint32_t FaultKindType; - static const size_t FaultKindOffset = 0; + using FaultKindType = uint32_t; + using FaultingPCOffsetType = uint32_t; + using HandlerPCOffsetType = uint32_t; - typedef uint32_t FaultingPCOffsetType; + static const size_t FaultKindOffset = 0; static const size_t FaultingPCOffsetOffset = FaultKindOffset + sizeof(FaultKindType); - - typedef uint32_t HandlerPCOffsetType; static const size_t HandlerPCOffsetOffset = FaultingPCOffsetOffset + sizeof(FaultingPCOffsetType); @@ -140,20 +136,17 @@ public: }; class FunctionInfoAccessor { - typedef uint64_t FunctionAddrType; - static const size_t FunctionAddrOffset = 0; + using FunctionAddrType = uint64_t; + using NumFaultingPCsType = uint32_t; + using ReservedType = uint32_t; - typedef uint32_t NumFaultingPCsType; + static const size_t FunctionAddrOffset = 0; static const size_t NumFaultingPCsOffset = FunctionAddrOffset + sizeof(FunctionAddrType); - - typedef uint32_t ReservedType; static const size_t ReservedOffset = NumFaultingPCsOffset + sizeof(NumFaultingPCsType); - static const size_t FunctionFaultInfosOffset = ReservedOffset + sizeof(ReservedType); - static const size_t FunctionInfoHeaderSize = FunctionFaultInfosOffset; const uint8_t *P = nullptr; diff --git a/include/llvm/CodeGen/GlobalISel/Localizer.h b/include/llvm/CodeGen/GlobalISel/Localizer.h new file mode 100644 index 000000000000..0a46eb9e7840 --- /dev/null +++ b/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -0,0 +1,78 @@ +//== llvm/CodeGen/GlobalISel/Localizer.h - Localizer -------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file describes the interface of the Localizer pass. +/// This pass moves/duplicates constant-like instructions close to their uses. +/// Its primarily goal is to workaround the deficiencies of the fast register +/// allocator. +/// With GlobalISel constants are all materialized in the entry block of +/// a function. However, the fast allocator cannot rematerialize constants and +/// has a lot more live-ranges to deal with and will most likely end up +/// spilling a lot. +/// By pushing the constants close to their use, we only create small +/// live-ranges. 
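The FaultMapParser hunk above only regroups the typedefs and offset constants; the encoded layout itself is unchanged. Spelled out, the section header is one byte of version, three reserved bytes, and a 32-bit function count, so the per-function records start at byte 8. A small self-contained check of that arithmetic:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t FaultMapVersionOffset = 0;
    constexpr size_t Reserved0Offset     = FaultMapVersionOffset + sizeof(uint8_t);  // 1
    constexpr size_t Reserved1Offset     = Reserved0Offset + sizeof(uint8_t);        // 2
    constexpr size_t NumFunctionsOffset  = Reserved1Offset + sizeof(uint16_t);       // 4
    constexpr size_t FunctionInfosOffset = NumFunctionsOffset + sizeof(uint32_t);    // 8
    static_assert(FunctionInfosOffset == 8, "fault-map header is 8 bytes");

    // Each FunctionInfo record: u64 function address, u32 fault count, u32 reserved,
    // then the per-fault records.
    constexpr size_t FunctionFaultInfosOffset =
        sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint32_t);                      // 16
    static_assert(FunctionFaultInfosOffset == 16, "per-function header is 16 bytes");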
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_GLOBALISEL_LOCALIZER_H +#define LLVM_CODEGEN_GLOBALISEL_LOCALIZER_H + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +namespace llvm { +// Forward declarations. +class MachineRegisterInfo; + +/// This pass implements the localization mechanism described at the +/// top of this file. One specificity of the implementation is that +/// it will materialize one and only one instance of a constant per +/// basic block, thus enabling reuse of that constant within that block. +/// Moreover, it only materializes constants in blocks where they +/// are used. PHI uses are considered happening at the end of the +/// related predecessor. +class Localizer : public MachineFunctionPass { +public: + static char ID; + +private: + /// MRI contains all the register class/bank information that this + /// pass uses and updates. + MachineRegisterInfo *MRI; + + /// Check whether or not \p MI needs to be moved close to its uses. + static bool shouldLocalize(const MachineInstr &MI); + + /// Check if \p MOUse is used in the same basic block as \p Def. + /// If the use is in the same block, we say it is local. + /// When the use is not local, \p InsertMBB will contain the basic + /// block when to insert \p Def to have a local use. + static bool isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, + MachineBasicBlock *&InsertMBB); + + /// Initialize the field members using \p MF. + void init(MachineFunction &MF); + +public: + Localizer(); + + StringRef getPassName() const override { return "Localizer"; } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::Legalized) + .set(MachineFunctionProperties::Property::RegBankSelected); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // End namespace llvm. + +#endif diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index f2a9a9f73ca6..2300a106c358 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -264,6 +264,14 @@ namespace ISD { /// optimized. STRICT_FADD, STRICT_FSUB, STRICT_FMUL, STRICT_FDIV, STRICT_FREM, + /// Constrained versions of libm-equivalent floating point intrinsics. + /// These will be lowered to the equivalent non-constrained pseudo-op + /// (or expanded to the equivalent library call) before final selection. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS, + STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2, + STRICT_FRINT, STRICT_FNEARBYINT, + /// FMA - Perform a * b + c with no intermediate rounding step. 
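The STRICT_* opcodes above are the SelectionDAG counterparts of the constrained floating-point intrinsics: the intrinsic keeps the operation opaque to IR-level optimization, and the node is only rewritten to the ordinary opcode (or a libcall) late in selection. A hedged sketch of building one such call with IRBuilder, assuming the matching intrinsic is exposed as Intrinsic::experimental_constrained_sqrt and takes the usual rounding-mode and exception-behavior metadata operands:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // For a double %x this requests @llvm.experimental.constrained.sqrt.f64 with
    // !"round.dynamic" and !"fpexcept.strict" metadata arguments.
    static Value *emitConstrainedSqrt(IRBuilder<> &B, Module &M, Value *X) {
      Function *F = Intrinsic::getDeclaration(
          &M, Intrinsic::experimental_constrained_sqrt, {X->getType()});
      LLVMContext &Ctx = B.getContext();
      Value *Round = MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
      Value *Except = MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
      return B.CreateCall(F, {X, Round, Except});
    }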
FMA, diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index b792cba4b78a..40cd146f88f8 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/LiveInterval.h - Interval representation ---*- C++ -*-===// +//===- llvm/CodeGen/LiveInterval.h - Interval representation ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,22 +21,30 @@ #ifndef LLVM_CODEGEN_LIVEINTERVAL_H #define LLVM_CODEGEN_LIVEINTERVAL_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Allocator.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include #include -#include +#include +#include +#include #include +#include +#include namespace llvm { + class CoalescerPair; class LiveIntervals; - class MachineInstr; class MachineRegisterInfo; - class TargetRegisterInfo; class raw_ostream; - template class SmallPtrSet; /// VNInfo - Value Number Information. /// This class holds information about a machine level values, including @@ -44,7 +52,7 @@ namespace llvm { /// class VNInfo { public: - typedef BumpPtrAllocator Allocator; + using Allocator = BumpPtrAllocator; /// The ID number of this value. unsigned id; @@ -53,14 +61,10 @@ namespace llvm { SlotIndex def; /// VNInfo constructor. - VNInfo(unsigned i, SlotIndex d) - : id(i), def(d) - { } + VNInfo(unsigned i, SlotIndex d) : id(i), def(d) {} /// VNInfo constructor, copies values from orig, except for the value number. - VNInfo(unsigned i, const VNInfo &orig) - : id(i), def(orig.def) - { } + VNInfo(unsigned i, const VNInfo &orig) : id(i), def(orig.def) {} /// Copy from the parameter into this VNInfo. void copyFrom(VNInfo &src) { @@ -152,16 +156,16 @@ namespace llvm { /// segment with a new value number is used. class LiveRange { public: - /// This represents a simple continuous liveness interval for a value. /// The start point is inclusive, the end point exclusive. These intervals /// are rendered as [start,end). struct Segment { SlotIndex start; // Start point of the interval (inclusive) SlotIndex end; // End point of the interval (exclusive) - VNInfo *valno; // identifier for the value contained in this segment. + VNInfo *valno = nullptr; // identifier for the value contained in this + // segment. - Segment() : valno(nullptr) {} + Segment() = default; Segment(SlotIndex S, SlotIndex E, VNInfo *V) : start(S), end(E), valno(V) { @@ -189,8 +193,8 @@ namespace llvm { void dump() const; }; - typedef SmallVector Segments; - typedef SmallVector VNInfoList; + using Segments = SmallVector; + using VNInfoList = SmallVector; Segments segments; // the liveness segments VNInfoList valnos; // value#'s @@ -198,22 +202,24 @@ namespace llvm { // The segment set is used temporarily to accelerate initial computation // of live ranges of physical registers in computeRegUnitRange. // After that the set is flushed to the segment vector and deleted. 
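LiveRange::Segment above keeps the long-standing half-open convention: start is inclusive, end is exclusive, so an index is covered iff start <= idx < end and back-to-back segments never overlap. A tiny stand-alone illustration (plain structs, not the LLVM types):

    #include <cassert>

    struct Seg { unsigned Start, End; };   // models [Start, End)

    static bool contains(const Seg &S, unsigned Idx) {
      return S.Start <= Idx && Idx < S.End;
    }
    static bool overlaps(const Seg &A, const Seg &B) {
      return A.Start < B.End && B.Start < A.End;
    }

    int main() {
      Seg A{4, 8}, B{8, 12};
      assert(contains(A, 4) && !contains(A, 8));  // the end point is excluded
      assert(!overlaps(A, B));                    // touching segments do not overlap
      return 0;
    }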
- typedef std::set SegmentSet; + using SegmentSet = std::set; std::unique_ptr segmentSet; - typedef Segments::iterator iterator; + using iterator = Segments::iterator; + using const_iterator = Segments::const_iterator; + iterator begin() { return segments.begin(); } iterator end() { return segments.end(); } - typedef Segments::const_iterator const_iterator; const_iterator begin() const { return segments.begin(); } const_iterator end() const { return segments.end(); } - typedef VNInfoList::iterator vni_iterator; + using vni_iterator = VNInfoList::iterator; + using const_vni_iterator = VNInfoList::const_iterator; + vni_iterator vni_begin() { return valnos.begin(); } vni_iterator vni_end() { return valnos.end(); } - typedef VNInfoList::const_iterator const_vni_iterator; const_vni_iterator vni_begin() const { return valnos.begin(); } const_vni_iterator vni_end() const { return valnos.end(); } @@ -631,40 +637,37 @@ namespace llvm { /// or stack slot. class LiveInterval : public LiveRange { public: - typedef LiveRange super; + using super = LiveRange; /// A live range for subregisters. The LaneMask specifies which parts of the /// super register are covered by the interval. /// (@sa TargetRegisterInfo::getSubRegIndexLaneMask()). class SubRange : public LiveRange { public: - SubRange *Next; + SubRange *Next = nullptr; LaneBitmask LaneMask; /// Constructs a new SubRange object. - SubRange(LaneBitmask LaneMask) - : Next(nullptr), LaneMask(LaneMask) { - } + SubRange(LaneBitmask LaneMask) : LaneMask(LaneMask) {} /// Constructs a new SubRange object by copying liveness from @p Other. SubRange(LaneBitmask LaneMask, const LiveRange &Other, BumpPtrAllocator &Allocator) - : LiveRange(Other, Allocator), Next(nullptr), LaneMask(LaneMask) { - } + : LiveRange(Other, Allocator), LaneMask(LaneMask) {} void print(raw_ostream &OS) const; void dump() const; }; private: - SubRange *SubRanges; ///< Single linked list of subregister live ranges. + SubRange *SubRanges = nullptr; ///< Single linked list of subregister live + /// ranges. public: const unsigned reg; // the register or stack slot of this interval. float weight; // weight of this interval - LiveInterval(unsigned Reg, float Weight) - : SubRanges(nullptr), reg(Reg), weight(Weight) {} + LiveInterval(unsigned Reg, float Weight) : reg(Reg), weight(Weight) {} ~LiveInterval() { clearSubRanges(); @@ -673,8 +676,10 @@ namespace llvm { template class SingleLinkedListIterator { T *P; + public: SingleLinkedListIterator(T *P) : P(P) {} + SingleLinkedListIterator &operator++() { P = P->Next; return *this; @@ -698,7 +703,9 @@ namespace llvm { } }; - typedef SingleLinkedListIterator subrange_iterator; + using subrange_iterator = SingleLinkedListIterator; + using const_subrange_iterator = SingleLinkedListIterator; + subrange_iterator subrange_begin() { return subrange_iterator(SubRanges); } @@ -706,7 +713,6 @@ namespace llvm { return subrange_iterator(nullptr); } - typedef SingleLinkedListIterator const_subrange_iterator; const_subrange_iterator subrange_begin() const { return const_subrange_iterator(SubRanges); } @@ -759,12 +765,12 @@ namespace llvm { /// isSpillable - Can this interval be spilled? 
bool isSpillable() const { - return weight != llvm::huge_valf; + return weight != huge_valf; } /// markNotSpillable - Mark interval as not spillable void markNotSpillable() { - weight = llvm::huge_valf; + weight = huge_valf; } /// For a given lane mask @p LaneMask, compute indexes at which the @@ -931,5 +937,7 @@ namespace llvm { void Distribute(LiveInterval &LI, LiveInterval *LIV[], MachineRegisterInfo &MRI); }; -} -#endif + +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEINTERVAL_H diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index 181cb375de86..820e88362483 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.h - Live Interval Analysis ---------*- C++ -*-===// +//===- LiveIntervalAnalysis.h - Live Interval Analysis ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,6 +20,7 @@ #ifndef LLVM_CODEGEN_LIVEINTERVALANALYSIS_H #define LLVM_CODEGEN_LIVEINTERVALANALYSIS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -27,27 +28,29 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/Support/Allocator.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetRegisterInfo.h" -#include +#include +#include +#include namespace llvm { extern cl::opt UseSegmentSetForPhysRegs; - class BitVector; - class BlockFrequency; - class LiveRangeCalc; - class LiveVariables; - class MachineDominatorTree; - class MachineLoopInfo; - class TargetRegisterInfo; - class MachineRegisterInfo; - class TargetInstrInfo; - class TargetRegisterClass; - class VirtRegMap; - class MachineBlockFrequencyInfo; +class BitVector; +class LiveRangeCalc; +class MachineBlockFrequencyInfo; +class MachineDominatorTree; +class MachineFunction; +class MachineInstr; +class MachineRegisterInfo; +class raw_ostream; +class TargetInstrInfo; +class VirtRegMap; class LiveIntervals : public MachineFunctionPass { MachineFunction* MF; @@ -56,8 +59,8 @@ extern cl::opt UseSegmentSetForPhysRegs; const TargetInstrInfo* TII; AliasAnalysis *AA; SlotIndexes* Indexes; - MachineDominatorTree *DomTree; - LiveRangeCalc *LRCalc; + MachineDominatorTree *DomTree = nullptr; + LiveRangeCalc *LRCalc = nullptr; /// Special pool allocator for VNInfo's (LiveInterval val#). 
VNInfo::Allocator VNInfoAllocator; @@ -95,6 +98,7 @@ extern cl::opt UseSegmentSetForPhysRegs; public: static char ID; + LiveIntervals(); ~LiveIntervals() override; @@ -466,6 +470,7 @@ extern cl::opt UseSegmentSetForPhysRegs; class HMEditor; }; -} // End llvm namespace -#endif +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEINTERVALANALYSIS_H diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h index 57e3deb038af..b922e543c856 100644 --- a/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/include/llvm/CodeGen/LiveIntervalUnion.h @@ -26,12 +26,14 @@ namespace llvm { +class raw_ostream; class TargetRegisterInfo; #ifndef NDEBUG // forward declaration template class SparseBitVector; -typedef SparseBitVector<128> LiveVirtRegBitSet; + +using LiveVirtRegBitSet = SparseBitVector<128>; #endif /// Union of live intervals that are strong candidates for coalescing into a @@ -42,19 +44,19 @@ class LiveIntervalUnion { // A set of live virtual register segments that supports fast insertion, // intersection, and removal. // Mapping SlotIndex intervals to virtual register numbers. - typedef IntervalMap LiveSegments; + using LiveSegments = IntervalMap; public: // SegmentIter can advance to the next segment ordered by starting position // which may belong to a different live virtual register. We also must be able // to reach the current segment's containing virtual register. - typedef LiveSegments::iterator SegmentIter; + using SegmentIter = LiveSegments::iterator; /// Const version of SegmentIter. - typedef LiveSegments::const_iterator ConstSegmentIter; + using ConstSegmentIter = LiveSegments::const_iterator; // LiveIntervalUnions share an external allocator. - typedef LiveSegments::Allocator Allocator; + using Allocator = LiveSegments::Allocator; private: unsigned Tag = 0; // unique tag for current contents. @@ -76,7 +78,7 @@ public: SlotIndex startIndex() const { return Segments.start(); } // Provide public access to the underlying map to allow overlap iteration. - typedef LiveSegments Map; + using Map = LiveSegments; const Map &getMap() const { return Segments; } /// getTag - Return an opaque tag representing the current state of the union. diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h index 9e04c467fadc..f9c741dd75b2 100644 --- a/include/llvm/CodeGen/LivePhysRegs.h +++ b/include/llvm/CodeGen/LivePhysRegs.h @@ -7,23 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LivePhysRegs utility for tracking liveness of -// physical registers. This can be used for ad-hoc liveness tracking after -// register allocation. You can start with the live-ins/live-outs at the -// beginning/end of a block and update the information while walking the -// instructions inside the block. This implementation tracks the liveness on a -// sub-register granularity. -// -// We assume that the high bits of a physical super-register are not preserved -// unless the instruction has an implicit-use operand reading the super- -// register. -// -// X86 Example: -// %YMM0 = ... -// %XMM0 = ... (Kills %XMM0, all %XMM0s sub-registers, and %YMM0) -// -// %YMM0 = ... -// %XMM0 = ..., %YMM0 (%YMM0 and all its sub-registers are alive) +/// \file +/// This file implements the LivePhysRegs utility for tracking liveness of +/// physical registers. This can be used for ad-hoc liveness tracking after +/// register allocation. 
You can start with the live-ins/live-outs at the +/// beginning/end of a block and update the information while walking the +/// instructions inside the block. This implementation tracks the liveness on a +/// sub-register granularity. +/// +/// We assume that the high bits of a physical super-register are not preserved +/// unless the instruction has an implicit-use operand reading the super- +/// register. +/// +/// X86 Example: +/// %YMM0 = ... +/// %XMM0 = ... (Kills %XMM0, all %XMM0s sub-registers, and %YMM0) +/// +/// %YMM0 = ... +/// %XMM0 = ..., %YMM0 (%YMM0 and all its sub-registers are alive) //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_LIVEPHYSREGS_H @@ -39,40 +40,42 @@ namespace llvm { class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class raw_ostream; -/// \brief A set of live physical registers with functions to track liveness +/// \brief A set of physical registers with utility functions to track liveness /// when walking backward/forward through a basic block. class LivePhysRegs { const TargetRegisterInfo *TRI = nullptr; SparseSet LiveRegs; +public: + /// Constructs an unitialized set. init() needs to be called to initialize it. + LivePhysRegs() = default; + + /// Constructs and initializes an empty set. + LivePhysRegs(const TargetRegisterInfo &TRI) : TRI(&TRI) { + LiveRegs.setUniverse(TRI.getNumRegs()); + } + LivePhysRegs(const LivePhysRegs&) = delete; LivePhysRegs &operator=(const LivePhysRegs&) = delete; -public: - /// \brief Constructs a new empty LivePhysRegs set. - LivePhysRegs() = default; - - /// \brief Constructs and initialize an empty LivePhysRegs set. - LivePhysRegs(const TargetRegisterInfo *TRI) : TRI(TRI) { - assert(TRI && "Invalid TargetRegisterInfo pointer."); - LiveRegs.setUniverse(TRI->getNumRegs()); - } - - /// \brief Clear and initialize the LivePhysRegs set. + /// (re-)initializes and clears the set. void init(const TargetRegisterInfo &TRI) { this->TRI = &TRI; LiveRegs.clear(); LiveRegs.setUniverse(TRI.getNumRegs()); } - /// \brief Clears the LivePhysRegs set. + /// Clears the set. void clear() { LiveRegs.clear(); } - /// \brief Returns true if the set is empty. + /// Returns true if the set is empty. bool empty() const { return LiveRegs.empty(); } - /// \brief Adds a physical register and all its sub-registers to the set. + /// Adds a physical register and all its sub-registers to the set. void addReg(unsigned Reg) { assert(TRI && "LivePhysRegs is not initialized."); assert(Reg <= TRI->getNumRegs() && "Expected a physical register."); @@ -90,12 +93,13 @@ public: LiveRegs.erase(*R); } - /// \brief Removes physical registers clobbered by the regmask operand @p MO. + /// Removes physical registers clobbered by the regmask operand \p MO. void removeRegsInMask(const MachineOperand &MO, - SmallVectorImpl> *Clobbers); + SmallVectorImpl> *Clobbers = + nullptr); - /// \brief Returns true if register @p Reg is contained in the set. This also - /// works if only the super register of @p Reg has been defined, because + /// \brief Returns true if register \p Reg is contained in the set. This also + /// works if only the super register of \p Reg has been defined, because /// addReg() always adds all sub-registers to the set as well. /// Note: Returns false if just some sub registers are live, use available() /// when searching a free register. @@ -104,48 +108,48 @@ public: /// Returns true if register \p Reg and no aliasing register is in the set. 
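The recommended use of this class is the backward walk described just below: seed the set with a block's live-outs, then call stepBackward() over the instructions in reverse order so defs are removed and uses become live. A sketch built only from the API in these hunks (the helper name is invented; TRI and MBB are assumed to come from the surrounding pass):

    #include "llvm/CodeGen/LivePhysRegs.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    using namespace llvm;

    static bool isLiveAtBlockTop(const TargetRegisterInfo &TRI,
                                 MachineBasicBlock &MBB, unsigned Reg) {
      LivePhysRegs LiveRegs(TRI);      // empty set, universe sized for this target
      LiveRegs.addLiveOuts(MBB);       // successors' live-ins plus pristines
      for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
        LiveRegs.stepBackward(*I);     // remove defs, add uses
      return LiveRegs.contains(Reg);   // liveness at the top of the block
    }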
bool available(const MachineRegisterInfo &MRI, unsigned Reg) const; - /// \brief Simulates liveness when stepping backwards over an - /// instruction(bundle): Remove Defs, add uses. This is the recommended way of - /// calculating liveness. + /// Simulates liveness when stepping backwards over an instruction(bundle). + /// Remove Defs, add uses. This is the recommended way of calculating + /// liveness. void stepBackward(const MachineInstr &MI); - /// \brief Simulates liveness when stepping forward over an - /// instruction(bundle): Remove killed-uses, add defs. This is the not - /// recommended way, because it depends on accurate kill flags. If possible - /// use stepBackward() instead of this function. - /// The clobbers set will be the list of registers either defined or clobbered - /// by a regmask. The operand will identify whether this is a regmask or - /// register operand. + /// Simulates liveness when stepping forward over an instruction(bundle). + /// Remove killed-uses, add defs. This is the not recommended way, because it + /// depends on accurate kill flags. If possible use stepBackward() instead of + /// this function. The clobbers set will be the list of registers either + /// defined or clobbered by a regmask. The operand will identify whether this + /// is a regmask or register operand. void stepForward(const MachineInstr &MI, SmallVectorImpl> &Clobbers); - /// Adds all live-in registers of basic block @p MBB. + /// Adds all live-in registers of basic block \p MBB. /// Live in registers are the registers in the blocks live-in list and the /// pristine registers. void addLiveIns(const MachineBasicBlock &MBB); - /// Adds all live-out registers of basic block @p MBB. + /// Adds all live-out registers of basic block \p MBB. /// Live out registers are the union of the live-in registers of the successor /// blocks and pristine registers. Live out registers of the end block are the /// callee saved registers. void addLiveOuts(const MachineBasicBlock &MBB); - /// Like addLiveOuts() but does not add pristine registers/callee saved + /// Adds all live-out registers of basic block \p MBB but skips pristine /// registers. void addLiveOutsNoPristines(const MachineBasicBlock &MBB); - typedef SparseSet::const_iterator const_iterator; + using const_iterator = SparseSet::const_iterator; + const_iterator begin() const { return LiveRegs.begin(); } const_iterator end() const { return LiveRegs.end(); } - /// \brief Prints the currently live registers to @p OS. + /// Prints the currently live registers to \p OS. void print(raw_ostream &OS) const; - /// \brief Dumps the currently live registers to the debug output. + /// Dumps the currently live registers to the debug output. void dump() const; private: - /// Adds live-in registers from basic block @p MBB, taking associated + /// \brief Adds live-in registers from basic block \p MBB, taking associated /// lane masks into consideration. void addBlockLiveIns(const MachineBasicBlock &MBB); }; @@ -155,11 +159,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const LivePhysRegs& LR) { return OS; } -/// Compute the live-in list for \p MBB assuming all of its successors live-in -/// lists are up-to-date. Uses the given LivePhysReg instance \p LiveRegs; This -/// is just here to avoid repeated heap allocations when calling this multiple -/// times in a pass. -void computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +/// \brief Computes the live-in list for \p MBB assuming all of its successors +/// live-in lists are up-to-date. 
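A minimal sketch of the backward liveness walk that the LivePhysRegs comments above recommend (illustrative only, not part of the upstream change): the set is seeded with the block's live-outs and stepped backward over every instruction, after which it holds the block's live-ins. MBB is assumed to be a post-register-allocation MachineBasicBlock, and the helper name is invented for the example.

    // Assumes the usual CodeGen includes (LivePhysRegs.h, MachineFunction.h,
    // TargetSubtargetInfo.h) and llvm::reverse from ADT/STLExtras.h.
    static void computeBlockLiveInsSketch(MachineBasicBlock &MBB) {
      const TargetRegisterInfo &TRI =
          *MBB.getParent()->getSubtarget().getRegisterInfo();
      LivePhysRegs LiveRegs(TRI);    // construct-and-initialize in one step
      LiveRegs.addLiveOuts(MBB);     // seed with live-outs (incl. pristine regs)
      for (MachineInstr &MI : llvm::reverse(MBB))
        LiveRegs.stepBackward(MI);   // remove defs, add uses
      // LiveRegs now holds the live-ins of MBB.
    }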
Uses the given LivePhysReg instance \p +/// LiveRegs; This is just here to avoid repeated heap allocations when calling +/// this multiple times in a pass. +void computeLiveIns(LivePhysRegs &LiveRegs, const MachineRegisterInfo &MRI, MachineBasicBlock &MBB); } // end namespace llvm diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h index 4250777682ba..362d9854a271 100644 --- a/include/llvm/CodeGen/LiveRangeEdit.h +++ b/include/llvm/CodeGen/LiveRangeEdit.h @@ -1,4 +1,4 @@ -//===---- LiveRangeEdit.h - Basic tools for split and spill -----*- C++ -*-===// +//===- LiveRangeEdit.h - Basic tools for split and spill --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,19 +19,28 @@ #define LLVM_CODEGEN_LIVERANGEEDIT_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include namespace llvm { class LiveIntervals; class MachineBlockFrequencyInfo; +class MachineInstr; class MachineLoopInfo; +class MachineOperand; +class TargetInstrInfo; +class TargetRegisterInfo; class VirtRegMap; class LiveRangeEdit : private MachineRegisterInfo::Delegate { @@ -39,7 +48,10 @@ public: /// Callback methods for LiveRangeEdit owners. class Delegate { virtual void anchor(); + public: + virtual ~Delegate() = default; + /// Called immediately before erasing a dead machine instruction. virtual void LRE_WillEraseInstruction(MachineInstr *MI) {} @@ -53,8 +65,6 @@ public: /// Called after cloning a virtual register. /// This is used for new registers representing connected components of Old. virtual void LRE_DidCloneVirtReg(unsigned New, unsigned Old) {} - - virtual ~Delegate() {} }; private: @@ -70,7 +80,7 @@ private: const unsigned FirstNew; /// ScannedRemattable - true when remattable values have been identified. - bool ScannedRemattable; + bool ScannedRemattable = false; /// DeadRemats - The saved instructions which have already been dead after /// rematerialization but not deleted yet -- to be done in postOptimization. @@ -78,11 +88,11 @@ private: /// Remattable - Values defined by remattable instructions as identified by /// tii.isTriviallyReMaterializable(). - SmallPtrSet Remattable; + SmallPtrSet Remattable; /// Rematted - Values that were actually rematted, and so need to have their /// live range trimmed or entirely removed. - SmallPtrSet Rematted; + SmallPtrSet Rematted; /// scanRemattable - Identify the Parent values that may rematerialize. void scanRemattable(AliasAnalysis *aa); @@ -94,11 +104,11 @@ private: /// foldAsLoad - If LI has a single use and a single def that can be folded as /// a load, eliminate the register by folding the def into the use. - bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead); + bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead); + + using ToShrinkSet = SetVector, + SmallPtrSet>; - typedef SetVector, - SmallPtrSet > ToShrinkSet; /// Helper for eliminateDeadDefs. 
void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, AliasAnalysis *AA); @@ -129,26 +139,26 @@ public: SmallPtrSet *deadRemats = nullptr) : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis), VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate), - FirstNew(newRegs.size()), ScannedRemattable(false), - DeadRemats(deadRemats) { + FirstNew(newRegs.size()), DeadRemats(deadRemats) { MRI.setDelegate(this); } ~LiveRangeEdit() override { MRI.resetDelegate(this); } LiveInterval &getParent() const { - assert(Parent && "No parent LiveInterval"); - return *Parent; + assert(Parent && "No parent LiveInterval"); + return *Parent; } + unsigned getReg() const { return getParent().reg; } /// Iterator for accessing the new registers added by this edit. - typedef SmallVectorImpl::const_iterator iterator; - iterator begin() const { return NewRegs.begin()+FirstNew; } + using iterator = SmallVectorImpl::const_iterator; + iterator begin() const { return NewRegs.begin() + FirstNew; } iterator end() const { return NewRegs.end(); } - unsigned size() const { return NewRegs.size()-FirstNew; } + unsigned size() const { return NewRegs.size() - FirstNew; } bool empty() const { return size() == 0; } - unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; } + unsigned get(unsigned idx) const { return NewRegs[idx + FirstNew]; } /// pop_back - It allows LiveRangeEdit users to drop new registers. /// The context is when an original def instruction of a register is @@ -176,26 +186,25 @@ public: return createEmptyIntervalFrom(getReg()); } - unsigned create() { - return createFrom(getReg()); - } + unsigned create() { return createFrom(getReg()); } /// anyRematerializable - Return true if any parent values may be /// rematerializable. /// This function must be called before any rematerialization is attempted. - bool anyRematerializable(AliasAnalysis*); + bool anyRematerializable(AliasAnalysis *); /// checkRematerializable - Manually add VNI to the list of rematerializable /// values if DefMI may be rematerializable. bool checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, - AliasAnalysis*); + AliasAnalysis *); /// Remat - Information needed to rematerialize at a specific location. struct Remat { - VNInfo *ParentVNI; // parent_'s value at the remat location. - MachineInstr *OrigMI; // Instruction defining OrigVNI. It contains the - // real expr for remat. - explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {} + VNInfo *ParentVNI; // parent_'s value at the remat location. + MachineInstr *OrigMI = nullptr; // Instruction defining OrigVNI. It contains + // the real expr for remat. + + explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} }; /// canRematerializeAt - Determine if ParentVNI can be rematerialized at @@ -209,10 +218,8 @@ public: /// liveness is not updated. /// Return the SlotIndex of the new instruction. SlotIndex rematerializeAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, - const Remat &RM, - const TargetRegisterInfo&, + MachineBasicBlock::iterator MI, unsigned DestReg, + const Remat &RM, const TargetRegisterInfo &, bool Late = false); /// markRematerialized - explicitly mark a value as rematerialized after doing @@ -248,11 +255,10 @@ public: /// calculateRegClassAndHint - Recompute register class and hint for each new /// register. 
- void calculateRegClassAndHint(MachineFunction&, - const MachineLoopInfo&, - const MachineBlockFrequencyInfo&); + void calculateRegClassAndHint(MachineFunction &, const MachineLoopInfo &, + const MachineBlockFrequencyInfo &); }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_LIVERANGEEDIT_H diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStackAnalysis.h index 3ffbe3d775b4..c90ae7b184f4 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStackAnalysis.h @@ -1,4 +1,4 @@ -//===-- LiveStackAnalysis.h - Live Stack Slot Analysis ----------*- C++ -*-===// +//===- LiveStackAnalysis.h - Live Stack Slot Analysis -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,13 +18,16 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include #include #include namespace llvm { +class TargetRegisterClass; +class TargetRegisterInfo; + class LiveStacks : public MachineFunctionPass { const TargetRegisterInfo *TRI; @@ -33,8 +36,7 @@ class LiveStacks : public MachineFunctionPass { VNInfo::Allocator VNInfoAllocator; /// S2IMap - Stack slot indices to live interval mapping. - /// - typedef std::unordered_map SS2IntervalMap; + using SS2IntervalMap = std::unordered_map; SS2IntervalMap S2IMap; /// S2RCMap - Stack slot indices to register class mapping. @@ -42,12 +44,14 @@ class LiveStacks : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid + LiveStacks() : MachineFunctionPass(ID) { initializeLiveStacksPass(*PassRegistry::getPassRegistry()); } - typedef SS2IntervalMap::iterator iterator; - typedef SS2IntervalMap::const_iterator const_iterator; + using iterator = SS2IntervalMap::iterator; + using const_iterator = SS2IntervalMap::const_iterator; + const_iterator begin() const { return S2IMap.begin(); } const_iterator end() const { return S2IMap.end(); } iterator begin() { return S2IMap.begin(); } @@ -93,6 +97,7 @@ public: /// print - Implement the dump method. 
void print(raw_ostream &O, const Module * = nullptr) const override; }; -} -#endif /* LLVM_CODEGEN_LIVESTACK_ANALYSIS_H */ +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVESTACK_ANALYSIS_H diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 18d40564856d..8da48c379d00 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineBasicBlock.h ------------------------*- C++ -*-===// +//===- llvm/CodeGen/MachineBasicBlock.h -------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,41 +15,50 @@ #define LLVM_CODEGEN_MACHINEBASICBLOCK_H #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/simple_ilist.h" #include "llvm/CodeGen/MachineInstrBundleIterator.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/BranchProbability.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/DataTypes.h" +#include +#include #include +#include +#include +#include namespace llvm { -class Pass; class BasicBlock; class MachineFunction; class MCSymbol; -class MIPrinter; +class ModuleSlotTracker; +class Pass; class SlotIndexes; class StringRef; class raw_ostream; -class MachineBranchProbabilityInfo; +class TargetRegisterClass; +class TargetRegisterInfo; template <> struct ilist_traits { private: friend class MachineBasicBlock; // Set by the owning MachineBasicBlock. + MachineBasicBlock *Parent; - typedef simple_ilist>::iterator - instr_iterator; + using instr_iterator = + simple_ilist>::iterator; public: void addNodeToList(MachineInstr *N); void removeNodeFromList(MachineInstr *N); void transferNodesFromList(ilist_traits &OldList, instr_iterator First, instr_iterator Last); - void deleteNode(MachineInstr *MI); }; @@ -69,7 +78,8 @@ public: }; private: - typedef ilist> Instructions; + using Instructions = ilist>; + Instructions Insts; const BasicBlock *BB; int Number; @@ -83,12 +93,12 @@ private: /// same order as Successors, or it is empty if we don't use it (disable /// optimization). std::vector Probs; - typedef std::vector::iterator probability_iterator; - typedef std::vector::const_iterator - const_probability_iterator; + using probability_iterator = std::vector::iterator; + using const_probability_iterator = + std::vector::const_iterator; /// Keep track of the physical registers that are livein of the basicblock. - typedef std::vector LiveInVector; + using LiveInVector = std::vector; LiveInVector LiveIns; /// Alignment of the basic block. 
Zero if the basic block does not need to be @@ -113,7 +123,7 @@ private: mutable MCSymbol *CachedMCSymbol = nullptr; // Intrusive list support - MachineBasicBlock() {} + MachineBasicBlock() = default; explicit MachineBasicBlock(MachineFunction &MF, const BasicBlock *BB); @@ -145,16 +155,16 @@ public: const MachineFunction *getParent() const { return xParent; } MachineFunction *getParent() { return xParent; } - typedef Instructions::iterator instr_iterator; - typedef Instructions::const_iterator const_instr_iterator; - typedef Instructions::reverse_iterator reverse_instr_iterator; - typedef Instructions::const_reverse_iterator const_reverse_instr_iterator; + using instr_iterator = Instructions::iterator; + using const_instr_iterator = Instructions::const_iterator; + using reverse_instr_iterator = Instructions::reverse_iterator; + using const_reverse_instr_iterator = Instructions::const_reverse_iterator; - typedef MachineInstrBundleIterator iterator; - typedef MachineInstrBundleIterator const_iterator; - typedef MachineInstrBundleIterator reverse_iterator; - typedef MachineInstrBundleIterator - const_reverse_iterator; + using iterator = MachineInstrBundleIterator; + using const_iterator = MachineInstrBundleIterator; + using reverse_iterator = MachineInstrBundleIterator; + using const_reverse_iterator = + MachineInstrBundleIterator; unsigned size() const { return (unsigned)Insts.size(); } bool empty() const { return Insts.empty(); } @@ -178,8 +188,8 @@ public: reverse_instr_iterator instr_rend () { return Insts.rend(); } const_reverse_instr_iterator instr_rend () const { return Insts.rend(); } - typedef iterator_range instr_range; - typedef iterator_range const_instr_range; + using instr_range = iterator_range; + using const_instr_range = iterator_range; instr_range instrs() { return instr_range(instr_begin(), instr_end()); } const_instr_range instrs() const { return const_instr_range(instr_begin(), instr_end()); @@ -213,18 +223,18 @@ public: } // Machine-CFG iterators - typedef std::vector::iterator pred_iterator; - typedef std::vector::const_iterator const_pred_iterator; - typedef std::vector::iterator succ_iterator; - typedef std::vector::const_iterator const_succ_iterator; - typedef std::vector::reverse_iterator - pred_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_pred_reverse_iterator; - typedef std::vector::reverse_iterator - succ_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_succ_reverse_iterator; + using pred_iterator = std::vector::iterator; + using const_pred_iterator = std::vector::const_iterator; + using succ_iterator = std::vector::iterator; + using const_succ_iterator = std::vector::const_iterator; + using pred_reverse_iterator = + std::vector::reverse_iterator; + using const_pred_reverse_iterator = + std::vector::const_reverse_iterator; + using succ_reverse_iterator = + std::vector::reverse_iterator; + using const_succ_reverse_iterator = + std::vector::const_reverse_iterator; pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } pred_iterator pred_end() { return Predecessors.end(); } @@ -307,7 +317,7 @@ public: // Iteration support for live in sets. These sets are kept in sorted // order by their register number. - typedef LiveInVector::const_iterator livein_iterator; + using livein_iterator = LiveInVector::const_iterator; #ifndef NDEBUG /// Unlike livein_begin, this method does not check that the liveness /// information is accurate. 
Still for debug purposes it may be useful @@ -455,7 +465,6 @@ public: /// other block. bool isLayoutSuccessor(const MachineBasicBlock *MBB) const; - /// Return the fallthrough block if the block can implicitly /// transfer control to the block after it by falling off the end of /// it. This should return null if it can reach the block after @@ -695,7 +704,7 @@ public: LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, unsigned Reg, const_iterator Before, - unsigned Neighborhood=10) const; + unsigned Neighborhood = 10) const; // Debugging methods. void dump() const; @@ -714,7 +723,6 @@ public: /// Return the MCSymbol for this basic block. MCSymbol *getSymbol() const; - private: /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); @@ -764,8 +772,8 @@ struct MBB2NumberFunctor : // template <> struct GraphTraits { - typedef MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::succ_iterator ChildIteratorType; + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::succ_iterator; static NodeRef getEntryNode(MachineBasicBlock *BB) { return BB; } static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } @@ -773,8 +781,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits { - typedef const MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::const_succ_iterator ChildIteratorType; + using NodeRef = const MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::const_succ_iterator; static NodeRef getEntryNode(const MachineBasicBlock *BB) { return BB; } static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } @@ -787,28 +795,30 @@ template <> struct GraphTraits { // to be when traversing the predecessor edges of a MBB // instead of the successor edges. 
// -template <> struct GraphTraits > { - typedef MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::pred_iterator ChildIteratorType; +template <> struct GraphTraits> { + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::pred_iterator; + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } static ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; -template <> struct GraphTraits > { - typedef const MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::const_pred_iterator ChildIteratorType; +template <> struct GraphTraits> { + using NodeRef = const MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::const_pred_iterator; + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } static ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; - - /// MachineInstrSpan provides an interface to get an iteration range /// containing the instruction it was initialized with, along with all /// those instructions inserted prior to or following that instruction @@ -816,6 +826,7 @@ template <> struct GraphTraits > { class MachineInstrSpan { MachineBasicBlock &MBB; MachineBasicBlock::iterator I, B, E; + public: MachineInstrSpan(MachineBasicBlock::iterator I) : MBB(*I->getParent()), @@ -854,6 +865,6 @@ inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin) { return It; } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEBASICBLOCK_H diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index cd1c204981ed..cba79c818a76 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -1,4 +1,4 @@ -//===- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -*- C++ -*-----===// +//===- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,26 +17,28 @@ #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/BlockFrequency.h" -#include +#include +#include namespace llvm { +template class BlockFrequencyInfoImpl; class MachineBasicBlock; class MachineBranchProbabilityInfo; +class MachineFunction; class MachineLoopInfo; -template class BlockFrequencyInfoImpl; +class raw_ostream; /// MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation /// to estimate machine basic block frequencies. 
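The GraphTraits changes in the MachineBasicBlock.h hunk above are purely mechanical (typedef rewritten as using), so behaviour is unchanged. For readers unfamiliar with these specializations, a rough sketch of how generic graph algorithms consume them; depth_first and inverse_depth_first come from llvm/ADT/DepthFirstIterator.h, and Entry/Exit are assumed blocks of the current function.

    // Forward walk over successor edges (GraphTraits<MachineBasicBlock *>).
    static unsigned countReachable(MachineBasicBlock *Entry) {
      unsigned N = 0;
      for (MachineBasicBlock *MBB : depth_first(Entry)) {
        (void)MBB;
        ++N;
      }
      return N;
    }

    // Backward walk over predecessor edges, via the Inverse<> specializations.
    static unsigned countReaching(MachineBasicBlock *Exit) {
      unsigned N = 0;
      for (MachineBasicBlock *MBB : inverse_depth_first(Exit)) {
        (void)MBB;
        ++N;
      }
      return N;
    }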
class MachineBlockFrequencyInfo : public MachineFunctionPass { - typedef BlockFrequencyInfoImpl ImplType; + using ImplType = BlockFrequencyInfoImpl; std::unique_ptr MBFI; public: static char ID; MachineBlockFrequencyInfo(); - ~MachineBlockFrequencyInfo() override; void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -74,9 +76,8 @@ public: const MachineBasicBlock *MBB) const; uint64_t getEntryFreq() const; - }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEBLOCKFREQUENCYINFO_H diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h index 4131194a0c0f..370ffbe4862e 100644 --- a/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -11,23 +11,28 @@ #define LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H #include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Analysis/DominanceFrontierImpl.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" - +#include "llvm/Support/GenericDomTree.h" +#include namespace llvm { class MachineDominanceFrontier : public MachineFunctionPass { ForwardDominanceFrontierBase Base; -public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef DominanceFrontierBase::DomSetType DomSetType; - typedef DominanceFrontierBase::iterator iterator; - typedef DominanceFrontierBase::const_iterator const_iterator; - void operator=(const MachineDominanceFrontier &) = delete; +public: + using DomTreeT = DominatorTreeBase; + using DomTreeNodeT = DomTreeNodeBase; + using DomSetType = DominanceFrontierBase::DomSetType; + using iterator = DominanceFrontierBase::iterator; + using const_iterator = + DominanceFrontierBase::const_iterator; + MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier & + operator=(const MachineDominanceFrontier &) = delete; static char ID; @@ -104,6 +109,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 30b6cfdd1c36..74a7c3ea04ae 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -1,4 +1,4 @@ -//=- llvm/CodeGen/MachineDominators.h - Machine Dom Calculation --*- C++ -*-==// +//==- llvm/CodeGen/MachineDominators.h - Machine Dom Calculation -*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -16,12 +16,15 @@ #define LLVM_CODEGEN_MACHINEDOMINATORS_H #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/GenericDomTreeConstruction.h" +#include #include +#include namespace llvm { @@ -33,7 +36,7 @@ inline void DominatorTreeBase::addRoot(MachineBasicBlock* MBB extern template class DomTreeNodeBase; extern template class DominatorTreeBase; -typedef DomTreeNodeBase MachineDomTreeNode; +using MachineDomTreeNode = DomTreeNodeBase; //===------------------------------------- /// DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to @@ -52,6 +55,7 @@ class MachineDominatorTree : public MachineFunctionPass { /// The splitting of a critical edge is local and thus, it is possible /// to apply several of 
those changes at the same time. mutable SmallVector CriticalEdgesToSplit; + /// \brief Remember all the basic blocks that are inserted during /// edge splitting. /// Invariant: NewBBs == all the basic blocks contained in the NewBB @@ -259,8 +263,8 @@ public: template struct MachineDomTreeGraphTraitsBase { - typedef Node *NodeRef; - typedef ChildIterator ChildIteratorType; + using NodeRef = Node *; + using ChildIteratorType = ChildIterator; static NodeRef getEntryNode(NodeRef N) { return N; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } @@ -287,6 +291,6 @@ template <> struct GraphTraits } }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEDOMINATORS_H diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index e7e728c1be28..8d040beff7a6 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -826,20 +826,12 @@ public: getOperand(0).getSubReg() == getOperand(1).getSubReg(); } - /// Return true if this is a transient instruction that is - /// either very likely to be eliminated during register allocation (such as - /// copy-like instructions), or if this instruction doesn't have an - /// execution-time cost. - bool isTransient() const { - switch(getOpcode()) { - default: return false; - // Copy-like instructions are usually eliminated during register allocation. - case TargetOpcode::PHI: - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - // Pseudo-instructions that don't produce any real output. + /// Return true if this instruction doesn't produce any output in the form of + /// executable instructions. + bool isMetaInstruction() const { + switch (getOpcode()) { + default: + return false; case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::CFI_INSTRUCTION: @@ -850,6 +842,23 @@ public: } } + /// Return true if this is a transient instruction that is either very likely + /// to be eliminated during register allocation (such as copy-like + /// instructions), or if this instruction doesn't have an execution-time cost. + bool isTransient() const { + switch (getOpcode()) { + default: + return isMetaInstruction(); + // Copy-like instructions are usually eliminated during register allocation. + case TargetOpcode::PHI: + case TargetOpcode::COPY: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + return true; + } + } + /// Return the number of instructions inside the MI bundle, excluding the /// bundle header. /// diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 6e5c6473ff4a..1026654da3d7 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -642,6 +642,11 @@ public: /// void setRegBank(unsigned Reg, const RegisterBank &RegBank); + void setRegClassOrRegBank(unsigned Reg, + const RegClassOrRegBank &RCOrRB){ + VRegInfo[Reg].first = RCOrRB; + } + /// constrainRegClass - Constrain the register class of the specified virtual /// register to be a common subclass of RC and the current register class, /// but only if the new class has at least MinNumRegs registers. 
Return the diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h index e92bb7f74967..d991e4c216d9 100644 --- a/include/llvm/CodeGen/MachineValueType.h +++ b/include/llvm/CodeGen/MachineValueType.h @@ -26,7 +26,7 @@ namespace llvm { /// Machine Value Type. Every type that is supported natively by some /// processor targeted by LLVM occurs here. This means that any legal value /// type can be represented by an MVT. -class MVT { + class MVT { public: enum SimpleValueType : uint8_t { // Simple value types that aren't explicitly part of this enumeration diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 99afd8c5c9ab..97aa2aace822 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -52,14 +52,14 @@ class TargetRegisterInfo; /// These are the different kinds of scheduling dependencies. enum Kind { Data, ///< Regular data dependence (aka true-dependence). - Anti, ///< A register anti-dependedence (aka WAR). + Anti, ///< A register anti-dependence (aka WAR). Output, ///< A register output-dependence (aka WAW). Order ///< Any other ordering dependency. }; // Strong dependencies must be respected by the scheduler. Artificial // dependencies may be removed only if they are redundant with another - // strong depedence. + // strong dependence. // // Weak dependencies may be violated by the scheduling strategy, but only if // the strategy can prove it is correct to do so. @@ -342,7 +342,7 @@ class TargetRegisterInfo; /// BoundaryNodes can have DAG edges, including Data edges, but they do not /// correspond to schedulable entities (e.g. instructions) and do not have a /// valid ID. Consequently, always check for boundary nodes before accessing - /// an assoicative data structure keyed on node ID. + /// an associative data structure keyed on node ID. bool isBoundaryNode() const { return NodeNum == BoundaryID; } /// Assigns the representative SDNode for this SUnit. This may be used diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 21e1740aa6b8..f5f5bfd45e79 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -18,6 +18,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SparseMultiSet.h" #include "llvm/ADT/SparseSet.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Compiler.h" @@ -224,7 +225,7 @@ namespace llvm { MachineInstr *FirstDbgValue; /// Set of live physical registers for updating kill flags. - BitVector LiveRegs; + LivePhysRegs LiveRegs; public: explicit ScheduleDAGInstrs(MachineFunction &mf, @@ -311,7 +312,7 @@ namespace llvm { std::string getDAGName() const override; /// Fixes register kill flags that scheduling has made invalid. - void fixupKills(MachineBasicBlock *MBB); + void fixupKills(MachineBasicBlock &MBB); protected: void initSUnits(); diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index d761661f763e..493122b15704 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -1070,6 +1070,11 @@ public: SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops); + /// Mutate the specified strict FP node to its non-strict equivalent, + /// unlinking the node from its chain and dropping the metadata arguments. + /// The node must be a strict FP node. 
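Referring back to the MachineInstr.h hunk above: isTransient() is split so that the guaranteed-no-output opcodes move into the new isMetaInstruction() predicate, while the copy-like opcodes stay in isTransient(). A small illustrative sketch (not part of the change) of how a code-size style count might use the stricter predicate; the helper name is invented here.

    // Count only instructions that will actually emit something. PHIs and
    // COPYs still count here: they are merely likely to disappear
    // (isTransient()), not guaranteed to produce no output
    // (isMetaInstruction()).
    static unsigned countEmittedInstrs(const MachineBasicBlock &MBB) {
      unsigned N = 0;
      for (const MachineInstr &MI : MBB)
        if (!MI.isMetaInstruction())
          ++N;
      return N;
    }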
+ SDNode *mutateStrictFPToFP(SDNode *Node); + /// These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 35ddcf80c91f..973c5aac5281 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -612,6 +612,32 @@ public: SDNodeBits.IsMemIntrinsic; } + /// Test if this node is a strict floating point pseudo-op. + bool isStrictFPOpcode() { + switch (NodeType) { + default: + return false; + case ISD::STRICT_FADD: + case ISD::STRICT_FSUB: + case ISD::STRICT_FMUL: + case ISD::STRICT_FDIV: + case ISD::STRICT_FREM: + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + return true; + } + } + /// Test if this node has a post-isel opcode, directly /// corresponding to a MachineInstr opcode. bool isMachineOpcode() const { return NodeType < 0; } diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h index 71ea82b6a9ab..68ad09982202 100644 --- a/include/llvm/DebugInfo/CodeView/CVRecord.h +++ b/include/llvm/DebugInfo/CodeView/CVRecord.h @@ -14,6 +14,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" @@ -50,6 +51,13 @@ public: Optional Hash; }; +template struct RemappedRecord { + explicit RemappedRecord(const CVRecord &R) : OriginalRecord(R) {} + + CVRecord OriginalRecord; + SmallVector, 8> Mappings; +}; + } // end namespace codeview template diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 4bc8fbefd5d8..70ccc867cd38 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -46,6 +46,7 @@ Error visitMemberRecordStream(ArrayRef FieldList, TypeVisitorCallbacks &Callbacks); Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source = VDS_BytesPresent, TypeServerHandler *TS = nullptr); Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks, TypeServerHandler *TS = nullptr); diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index 2142d4a2dec7..a9c5cf42fc5b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -40,6 +40,17 @@ class TypeDeserializer : public TypeVisitorCallbacks { public: TypeDeserializer() = default; + template static Error deserializeAs(CVType &CVT, T &Record) { + MappingInfo I(CVT.content()); + if (auto EC = I.Mapping.visitTypeBegin(CVT)) + return EC; + if (auto EC = I.Mapping.visitKnownRecord(CVT, Record)) + return EC; + if (auto EC = I.Mapping.visitTypeEnd(CVT)) + return EC; + return Error::success(); + } + Error visitTypeBegin(CVType &Record) override { assert(!Mapping && "Already in a type mapping!"); Mapping = llvm::make_unique(Record.content()); diff --git 
a/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h b/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h new file mode 100644 index 000000000000..82ceb5038316 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h @@ -0,0 +1,33 @@ +//===- TypeIndexDiscovery.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H +#define LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace codeview { +enum class TiRefKind { TypeRef, IndexRef }; +struct TiReference { + TiRefKind Kind; + uint32_t Offset; + uint32_t Count; +}; + +void discoverTypeIndices(ArrayRef RecordData, + SmallVectorImpl &Refs); +void discoverTypeIndices(const CVType &Type, + SmallVectorImpl &Refs); +} +} + +#endif diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 1f10872c8768..92745ebfcded 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -35,6 +35,7 @@ using support::ulittle16_t; using support::ulittle32_t; typedef CVRecord CVType; +typedef RemappedRecord RemappedType; struct CVMemberRecord { TypeLeafKind Kind; @@ -278,15 +279,9 @@ public: Attrs(calcAttrs(PK, PM, PO, Size)) {} PointerRecord(TypeIndex ReferentType, PointerKind PK, PointerMode PM, - PointerOptions PO, uint8_t Size, - const MemberPointerInfo &Member) + PointerOptions PO, uint8_t Size, const MemberPointerInfo &MPI) : TypeRecord(TypeRecordKind::Pointer), ReferentType(ReferentType), - Attrs(calcAttrs(PK, PM, PO, Size)), MemberInfo(Member) {} - - PointerRecord(TypeIndex ReferentType, uint32_t Attrs, - const MemberPointerInfo &Member) - : TypeRecord(TypeRecordKind::Pointer), ReferentType(ReferentType), - Attrs(Attrs), MemberInfo(Member) {} + Attrs(calcAttrs(PK, PM, PO, Size)), MemberInfo(MPI) {} TypeIndex getReferentType() const { return ReferentType; } diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h index 6dad98247136..435c43f7edcb 100644 --- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h @@ -17,7 +17,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" @@ -26,6 +25,8 @@ namespace llvm { namespace codeview { +class TypeHasher; + class TypeSerializer : public TypeVisitorCallbacks { struct SubRecord { SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {} @@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisitorCallbacks { } }; - typedef SmallVector, 2> RecordList; + typedef SmallVector, 2> MutableRecordList; static constexpr uint8_t ContinuationLength = 8; BumpPtrAllocator &RecordStorage; RecordSegment CurrentSegment; - RecordList FieldListSegments; + MutableRecordList FieldListSegments; - TypeIndex LastTypeIndex; Optional TypeKind; Optional MemberKind; std::vector RecordBuffer; @@ -60,28 +60,35 @@ class TypeSerializer : public TypeVisitorCallbacks { BinaryStreamWriter Writer; TypeRecordMapping Mapping; - RecordList 
SeenRecords; - StringMap HashedRecords; + /// Private type record hashing implementation details are handled here. + std::unique_ptr Hasher; + + /// Contains a list of all records indexed by TypeIndex.toArrayIndex(). + SmallVector, 2> SeenRecords; + + /// Temporary storage that we use to copy a record's data while re-writing + /// its type indices. + SmallVector RemapStorage; + + TypeIndex nextTypeIndex() const; bool isInFieldList() const; - TypeIndex calcNextTypeIndex() const; - TypeIndex incrementTypeIndex(); MutableArrayRef getCurrentSubRecordData(); MutableArrayRef getCurrentRecordData(); Error writeRecordPrefix(TypeLeafKind Kind); - TypeIndex insertRecordBytesPrivate(MutableArrayRef Record); - TypeIndex insertRecordBytesWithCopy(CVType &Record, - MutableArrayRef Data); Expected> addPadding(MutableArrayRef Record); public: - explicit TypeSerializer(BumpPtrAllocator &Storage); + explicit TypeSerializer(BumpPtrAllocator &Storage, bool Hash = true); + ~TypeSerializer(); - ArrayRef> records() const; - TypeIndex getLastTypeIndex() const; - TypeIndex insertRecordBytes(MutableArrayRef Record); + void reset(); + + ArrayRef> records() const; + TypeIndex insertRecordBytes(ArrayRef &Record); + TypeIndex insertRecord(const RemappedType &Record); Expected visitTypeEndGetIndex(CVType &Record); Error visitTypeBegin(CVType &Record) override; diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 65bcf9812e68..3ad2b4e9c92f 100644 --- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -22,12 +22,75 @@ class TypeIndex; class TypeServerHandler; class TypeTableBuilder; -/// Merges one type stream into another. Returns true on success. -Error mergeTypeStreams(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, +/// \brief Merge one set of type records into another. This method assumes +/// that all records are type records, and there are no Id records present. +/// +/// \param Dest The table to store the re-written type records into. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// type stream, that contains the index of the corresponding type record +/// in the destination stream. +/// +/// \param Handler (optional) If non-null, an interface that gets invoked +/// to handle type server records. +/// +/// \param Types The collection of types to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, const CVTypeArray &Types); +/// \brief Merge one set of id records into another. This method assumes +/// that all records are id records, and there are no Type records present. +/// However, since Id records can refer back to Type records, this method +/// assumes that the referenced type records have also been merged into +/// another type stream (for example using the above method), and accepts +/// the mapping from source to dest for that stream so that it can re-write +/// the type record mappings accordingly. +/// +/// \param Dest The table to store the re-written id records into. +/// +/// \param Types The mapping to use for the type records that these id +/// records refer to. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// id stream, that contains the index of the corresponding id record +/// in the destination stream. 
+/// +/// \param Ids The collection of id records to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, + SmallVectorImpl &SourceToDest, + const CVTypeArray &Ids); + +/// \brief Merge a unified set of type and id records, splitting them into +/// separate output streams. +/// +/// \param DestIds The table to store the re-written id records into. +/// +/// \param DestTypes the table to store the re-written type records into. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// id stream, that contains the index of the corresponding id record +/// in the destination stream. +/// +/// \param Handler (optional) If non-null, an interface that gets invoked +/// to handle type server records. +/// +/// \param IdsAndTypes The collection of id records to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeTypeAndIdRecords(TypeTableBuilder &DestIds, + TypeTableBuilder &DestTypes, + SmallVectorImpl &SourceToDest, + TypeServerHandler *Handler, + const CVTypeArray &IdsAndTypes); + } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h index 102bee4b0801..7bdc9ecb20cf 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h @@ -64,10 +64,14 @@ public: return *ExpectedIndex; } - TypeIndex writeSerializedRecord(MutableArrayRef Record) { + TypeIndex writeSerializedRecord(ArrayRef Record) { return Serializer.insertRecordBytes(Record); } + TypeIndex writeSerializedRecord(const RemappedType &Record) { + return Serializer.insertRecord(Record); + } + template void ForEachRecord(TFunc Func) { uint32_t Index = TypeIndex::FirstNonSimpleIndex; @@ -77,23 +81,24 @@ public: } } - ArrayRef> records() const { - return Serializer.records(); - } + ArrayRef> records() const { return Serializer.records(); } }; class FieldListRecordBuilder { TypeTableBuilder &TypeTable; + BumpPtrAllocator Allocator; TypeSerializer TempSerializer; CVType Type; public: explicit FieldListRecordBuilder(TypeTableBuilder &TypeTable) - : TypeTable(TypeTable), TempSerializer(TypeTable.getAllocator()) { + : TypeTable(TypeTable), TempSerializer(Allocator, false) { Type.Type = TypeLeafKind::LF_FIELDLIST; } void begin() { + TempSerializer.reset(); + if (auto EC = TempSerializer.visitTypeBegin(Type)) consumeError(std::move(EC)); } @@ -109,23 +114,19 @@ public: consumeError(std::move(EC)); } - TypeIndex end() { + TypeIndex end(bool Write) { + TypeIndex Index; if (auto EC = TempSerializer.visitTypeEnd(Type)) { consumeError(std::move(EC)); return TypeIndex(); } - TypeIndex Index; - for (auto Record : TempSerializer.records()) { - Index = TypeTable.writeSerializedRecord(Record); + if (Write) { + for (auto Record : TempSerializer.records()) + Index = TypeTable.writeSerializedRecord(Record); } - return Index; - } - /// Stop building the record. 
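A rough sketch of how the merge entry points documented in the TypeStreamMerger.h hunk above are meant to be driven for a combined type/id stream (illustrative only; the destination builders and the input array are assumed to be supplied by the caller, the element type of SourceToDest is assumed to be codeview::TypeIndex, and no type server handler is used).

    static llvm::Error
    mergeCombinedStream(llvm::codeview::TypeTableBuilder &DestIds,
                        llvm::codeview::TypeTableBuilder &DestTypes,
                        const llvm::codeview::CVTypeArray &IdsAndTypes) {
      // Source-to-destination index mapping, filled in by the merger.
      llvm::SmallVector<llvm::codeview::TypeIndex, 128> SourceToDest;
      return llvm::codeview::mergeTypeAndIdRecords(
          DestIds, DestTypes, SourceToDest, /*Handler=*/nullptr, IdsAndTypes);
    }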
- void reset() { - if (auto EC = TempSerializer.visitTypeEnd(Type)) - consumeError(std::move(EC)); + return Index; } }; diff --git a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h index 7de562a19a74..42b62ba2b6ce 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h @@ -18,7 +18,7 @@ namespace codeview { class TypeTableCollection : public TypeCollection { public: - explicit TypeTableCollection(ArrayRef> Records); + explicit TypeTableCollection(ArrayRef> Records); Optional getFirst() override; Optional getNext(TypeIndex Prev) override; @@ -33,7 +33,7 @@ private: bool hasCapacityFor(TypeIndex Index) const; void ensureTypeExists(TypeIndex Index); - ArrayRef> Records; + ArrayRef> Records; TypeDatabase Database; }; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index d3a63edf10ff..7fa68f3f2314 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -46,7 +46,8 @@ class raw_ostream; /// Reads a value from data extractor and applies a relocation to the result if /// one exists for the given offset. uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs); + uint32_t *Off, const RelocAddrMap *Relocs, + uint64_t *SecNdx = nullptr); /// DWARFContext /// This data structure is the top level entity that deals with dwarf debug @@ -71,6 +72,14 @@ class DWARFContext : public DIContext { std::unique_ptr AbbrevDWO; std::unique_ptr LocDWO; + struct DWOFile { + object::OwningBinary File; + std::unique_ptr Context; + }; + StringMap> DWOFiles; + std::weak_ptr DWP; + bool CheckedForDWP = false; + /// Read compile units from the debug_info section (if necessary) /// and store them in CUs. void parseCompileUnits(); @@ -165,6 +174,8 @@ public: return DWOCUs[index].get(); } + DWARFCompileUnit *getDWOCompileUnitForHash(uint64_t Hash); + /// Get a DIE given an exact offset. DWARFDie getDIEForOffset(uint32_t Offset); @@ -206,6 +217,7 @@ public: DIInliningInfo getInliningInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override; + virtual StringRef getFileName() const = 0; virtual bool isLittleEndian() const = 0; virtual uint8_t getAddressSize() const = 0; virtual const DWARFSection &getInfoSection() = 0; @@ -248,6 +260,8 @@ public: return version == 2 || version == 3 || version == 4 || version == 5; } + std::shared_ptr getDWOContext(StringRef AbsolutePath); + private: /// Return the compile unit that includes an offset (relative to .debug_info). 
DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset); @@ -263,6 +277,7 @@ private: class DWARFContextInMemory : public DWARFContext { virtual void anchor(); + StringRef FileName; bool IsLittleEndian; uint8_t AddressSize; DWARFSection InfoSection; @@ -316,6 +331,7 @@ public: uint8_t AddrSize, bool isLittleEndian = sys::IsLittleEndianHost); + StringRef getFileName() const override { return FileName; } bool isLittleEndian() const override { return IsLittleEndian; } uint8_t getAddressSize() const override { return AddressSize; } const DWARFSection &getInfoSection() override { return InfoSection; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index 95ec1be62a79..b436711ae6ed 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -25,6 +25,7 @@ class raw_ostream; struct DWARFAddressRange { uint64_t LowPC; uint64_t HighPC; + uint64_t SectionIndex; }; /// DWARFAddressRangesVector - represents a set of absolute address ranges. @@ -44,6 +45,8 @@ public: /// address past the end of the address range. The ending address must /// be greater than or equal to the beginning address. uint64_t EndAddress; + /// A section index this range belongs to. + uint64_t SectionIndex; /// The end of any given range list is marked by an end of list entry, /// which consists of a 0 for the beginning address offset diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index ca94a90fabfc..fa41b9e293c0 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -195,7 +195,8 @@ public: /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU. /// Returns true if both attributes are present. - bool getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const; + bool getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC, + uint64_t &SectionIndex) const; /// Get the address ranges for this DIE. /// diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index a30e0be9c3c3..3a781dde8929 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -47,6 +47,7 @@ private: const char *cstr; }; const uint8_t *data = nullptr; + uint64_t SectionIndex; /// Section index for reference forms. }; dwarf::Form Form; /// Form for this value. @@ -58,6 +59,7 @@ public: dwarf::Form getForm() const { return Form; } uint64_t getRawUValue() const { return Value.uval; } + uint64_t getSectionIndex() const { return Value.SectionIndex; } void setForm(dwarf::Form F) { Form = F; } void setUValue(uint64_t V) { Value.uval = V; } void setSValue(int64_t V) { Value.sval = V; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h index fabacc0abcea..f143de334737 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h +++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h @@ -16,7 +16,10 @@ namespace llvm { +/// RelocAddrEntry contains relocated value and section index. +/// Section index is -1LL if relocation points to absolute symbol. 
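An illustrative sketch (not part of the change) of how a split-DWARF consumer might use the DWARFContext additions above: the DWO context is now obtained through a shared, cached handle rather than being owned by each unit. DWOPath and DWOId are assumed to come from a skeleton compile unit, and the shared_ptr element type is assumed to be DWARFContext as declared in that hunk.

    static DWARFCompileUnit *findDWOUnit(DWARFContext &Ctx, StringRef DWOPath,
                                         uint64_t DWOId) {
      // Shared, cached context for the .dwo (or .dwp) file, if it can be opened.
      if (std::shared_ptr<DWARFContext> DWOCtx = Ctx.getDWOContext(DWOPath))
        return DWOCtx->getDWOCompileUnitForHash(DWOId);
      return nullptr;
    }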
struct RelocAddrEntry { + uint64_t SectionIndex; uint64_t Value; }; diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index ae7fd24ce5bb..d0f7bd0d623f 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -143,17 +143,7 @@ class DWARFUnit { typedef iterator_range::iterator> die_iterator_range; - class DWOHolder { - object::OwningBinary DWOFile; - std::unique_ptr DWOContext; - DWARFUnit *DWOU = nullptr; - - public: - DWOHolder(StringRef DWOPath, uint64_t DWOId); - - DWARFUnit *getUnit() const { return DWOU; } - }; - std::unique_ptr DWO; + std::shared_ptr DWO; const DWARFUnitIndex::Entry *IndexEntry; diff --git a/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/include/llvm/DebugInfo/MSF/MappedBlockStream.h index c91f6f725c80..d68f5f70c83e 100644 --- a/include/llvm/DebugInfo/MSF/MappedBlockStream.h +++ b/include/llvm/DebugInfo/MSF/MappedBlockStream.h @@ -43,8 +43,8 @@ class MappedBlockStream : public BinaryStream { friend class WritableMappedBlockStream; public: static std::unique_ptr - createStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &Layout, BinaryStreamRef MsfData); + createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, + BinaryStreamRef MsfData); static std::unique_ptr createIndexedStream(const MSFLayout &Layout, BinaryStreamRef MsfData, @@ -74,12 +74,11 @@ public: void invalidateCache(); uint32_t getBlockSize() const { return BlockSize; } - uint32_t getNumBlocks() const { return NumBlocks; } + uint32_t getNumBlocks() const { return StreamLayout.Blocks.size(); } uint32_t getStreamLength() const { return StreamLayout.Length; } protected: - MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &StreamLayout, + MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &StreamLayout, BinaryStreamRef MsfData); private: @@ -91,7 +90,6 @@ private: ArrayRef &Buffer); const uint32_t BlockSize; - const uint32_t NumBlocks; const MSFStreamLayout StreamLayout; BinaryStreamRef MsfData; @@ -103,8 +101,8 @@ private: class WritableMappedBlockStream : public WritableBinaryStream { public: static std::unique_ptr - createStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData); + createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, + WritableBinaryStreamRef MsfData); static std::unique_ptr createIndexedStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData, @@ -139,7 +137,7 @@ public: uint32_t getStreamLength() const { return ReadInterface.getStreamLength(); } protected: - WritableMappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, + WritableMappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &StreamLayout, WritableBinaryStreamRef MsfData); diff --git a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h index bcac182e2145..e116f314ac0e 100644 --- a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h @@ -82,6 +82,7 @@ private: Error finalize(); uint32_t calculateModiSubstreamSize() const; + uint32_t calculateNamesOffset() const; uint32_t calculateSectionContribsStreamSize() const; uint32_t calculateSectionMapStreamSize() const; uint32_t calculateFileInfoSubstreamSize() const; diff --git a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h index bfd38b6c80ec..196ba4d6ffbd 100644 
--- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h +++ b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h @@ -11,8 +11,7 @@ #define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" @@ -39,7 +38,7 @@ private: bool RevisitAlways; std::unique_ptr Session; - SmallVector, 4> SearchPaths; + StringSet<> SearchPaths; }; } } diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h index c5549983ed43..17fba9991c2e 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -21,6 +21,9 @@ #include "llvm/Support/Error.h" namespace llvm { +namespace codeview { +class LazyRandomTypeCollection; +} namespace msf { class MappedBlockStream; } @@ -53,12 +56,16 @@ public: codeview::CVTypeRange types(bool *HadError) const; const codeview::CVTypeArray &typeArray() const { return TypeRecords; } + codeview::LazyRandomTypeCollection &typeCollection() { return *Types; } + Error commit(); private: const PDBFile &Pdb; std::unique_ptr Stream; + std::unique_ptr Types; + codeview::CVTypeArray TypeRecords; std::unique_ptr HashStream; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index d4a896c01867..ace309ed95a4 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -322,7 +322,7 @@ template <> struct DenseMapInfo { /// the AttributeList object. The function attributes are at index /// `AttributeList::FunctionIndex', the return value is at index /// `AttributeList::ReturnIndex', and the attributes for the parameters start at -/// index `1'. +/// index `AttributeList::FirstArgIndex'. class AttributeList { public: enum AttrIndex : unsigned { @@ -347,8 +347,8 @@ public: /// \brief Create an AttributeList with the specified parameters in it. static AttributeList get(LLVMContext &C, ArrayRef> Attrs); - static AttributeList - get(LLVMContext &C, ArrayRef> Attrs); + static AttributeList get(LLVMContext &C, + ArrayRef> Attrs); /// \brief Create an AttributeList from attribute sets for a function, its /// return value, and all of its arguments. @@ -356,13 +356,11 @@ public: AttributeSet RetAttrs, ArrayRef ArgAttrs); - static AttributeList - getImpl(LLVMContext &C, - ArrayRef> Attrs); - private: explicit AttributeList(AttributeListImpl *LI) : pImpl(LI) {} + static AttributeList getImpl(LLVMContext &C, ArrayRef AttrSets); + public: AttributeList() = default; @@ -521,39 +519,31 @@ public: /// \brief Return the attributes at the index as a string. std::string getAsString(unsigned Index, bool InAttrGrp = false) const; - using iterator = ArrayRef::iterator; + //===--------------------------------------------------------------------===// + // AttributeList Introspection + //===--------------------------------------------------------------------===// - iterator begin(unsigned Slot) const; - iterator end(unsigned Slot) const; + typedef const AttributeSet *iterator; + iterator begin() const; + iterator end() const; + + unsigned getNumAttrSets() const; + + /// Use these to iterate over the valid attribute indices. 
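The Attributes.h hunk above replaces the slot-based introspection (getNumSlots/getSlotAttributes) with direct iteration over attribute sets plus index_begin()/index_end(), which are declared immediately below this point. A small sketch of both styles (illustrative only; AL is an assumed AttributeList, and getAttributes(unsigned) is assumed to be the existing per-index accessor).

    static bool hasAnyAttributes(const AttributeList &AL) {
      // Style 1: walk the attribute sets directly via the new begin()/end().
      for (AttributeSet AS : AL)
        if (AS.hasAttributes())
          return true;
      return false;
    }

    static void visitAllIndices(const AttributeList &AL) {
      // Style 2: walk the valid indices. index_begin() is FunctionIndex, and
      // incrementing wraps around to ReturnIndex and then the argument indices.
      for (unsigned I = AL.index_begin(), E = AL.index_end(); I != E; ++I) {
        AttributeSet AS = AL.getAttributes(I);
        (void)AS;
      }
    }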
+ unsigned index_begin() const { return AttributeList::FunctionIndex; } + unsigned index_end() const { return getNumAttrSets() - 1; } /// operator==/!= - Provide equality predicates. bool operator==(const AttributeList &RHS) const { return pImpl == RHS.pImpl; } bool operator!=(const AttributeList &RHS) const { return pImpl != RHS.pImpl; } - //===--------------------------------------------------------------------===// - // AttributeList Introspection - //===--------------------------------------------------------------------===// - /// \brief Return a raw pointer that uniquely identifies this attribute list. void *getRawPointer() const { return pImpl; } /// \brief Return true if there are no attributes. - bool isEmpty() const { - return getNumSlots() == 0; - } - - /// \brief Return the number of slots used in this attribute list. This is - /// the number of arguments that have an attribute set on them (including the - /// function itself). - unsigned getNumSlots() const; - - /// \brief Return the index for the given slot. - unsigned getSlotIndex(unsigned Slot) const; - - /// \brief Return the attributes at the given slot. - AttributeSet getSlotAttributes(unsigned Slot) const; + bool isEmpty() const { return pImpl == nullptr; } void dump() const; }; diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index c917b1f2cada..235cb57cfd09 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -33,6 +33,7 @@ class Function; class LandingPadInst; class LLVMContext; class Module; +class PHINode; class TerminatorInst; class ValueSymbolTable; @@ -261,6 +262,50 @@ public: inline const Instruction &back() const { return InstList.back(); } inline Instruction &back() { return InstList.back(); } + /// Iterator to walk just the phi nodes in the basic block. + template + class phi_iterator_impl + : public iterator_facade_base, + std::forward_iterator_tag, PHINodeT> { + friend BasicBlock; + + PHINodeT *PN; + + phi_iterator_impl(PHINodeT *PN) : PN(PN) {} + + public: + // Allow default construction to build variables, but this doesn't build + // a useful iterator. + phi_iterator_impl() = default; + + // Allow conversion between instantiations where valid. + template + phi_iterator_impl(const phi_iterator_impl &Arg) + : PN(Arg.PN) {} + + bool operator==(const phi_iterator_impl &Arg) const { return PN == Arg.PN; } + + PHINodeT &operator*() const { return *PN; } + + using phi_iterator_impl::iterator_facade_base::operator++; + phi_iterator_impl &operator++() { + assert(PN && "Cannot increment the end iterator!"); + PN = dyn_cast(std::next(BBIteratorT(PN))); + return *this; + } + }; + typedef phi_iterator_impl<> phi_iterator; + typedef phi_iterator_impl + const_phi_iterator; + + /// Returns a range that iterates over the phis in the basic block. + /// + /// Note that this cannot be used with basic blocks that have no terminator. + iterator_range phis() const { + return const_cast(this)->phis(); + } + iterator_range phis(); + /// \brief Return the underlying instruction list container. 
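A short sketch (not part of the patch; the helper name is illustrative) of the phis() range added to BasicBlock above. It walks only the leading PHI nodes and, as the doc comment notes, requires the block to have a terminator.

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Count how many PHIs in BB have an incoming value from Pred.
  static unsigned countPhisFrom(BasicBlock &BB, BasicBlock *Pred) {
    unsigned N = 0;
    for (PHINode &PN : BB.phis())
      if (PN.getBasicBlockIndex(Pred) >= 0)
        ++N;
    return N;
  }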
/// /// Currently you need to access the underlying instruction list container diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h index 05e3315cbab2..2ae98d9e35b0 100644 --- a/include/llvm/IR/IntrinsicInst.h +++ b/include/llvm/IR/IntrinsicInst.h @@ -171,6 +171,7 @@ namespace llvm { ebStrict }; + bool isUnaryOp() const; RoundingMode getRoundingMode() const; ExceptionBehavior getExceptionBehavior() const; @@ -182,6 +183,18 @@ namespace llvm { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: return true; default: return false; } diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 19f6045568f4..291d16fb0d9b 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -489,8 +489,64 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + + // These intrinsics are sensitive to the rounding mode so we need constrained + // versions of each of them. When strict rounding and exception control are + // not required the non-constrained versions of these intrinsics should be + // used. 
+ def int_experimental_constrained_sqrt : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_powi : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_i32_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_sin : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_cos : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_pow : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log10: Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log2 : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_exp : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_exp2 : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_rint : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_nearbyint : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; } -// FIXME: Add intrinsic for fcmp, fptrunc, fpext, fptoui and fptosi. +// FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi. +// FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round? //===------------------------- Expect Intrinsics --------------------------===// diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index d7413fe9e56f..e1928546607a 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -566,6 +566,16 @@ def int_amdgcn_s_getreg : [IntrReadMem, IntrSpeculatable] >; +// int_amdgcn_s_getpc is provided to allow a specific style of position +// independent code to determine the high part of its address when it is +// known (through convention) that the code and any data of interest does +// not cross a 4Gb address boundary. Use for any other purpose may not +// produce the desired results as optimizations may cause code movement, +// especially as we explicitly use IntrNoMem to allow optimizations. +def int_amdgcn_s_getpc : + GCCBuiltin<"__builtin_amdgcn_s_getpc">, + Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; + // __builtin_amdgcn_interp_mov , , , // param values: 0 = P10, 1 = P20, 2 = P0 def int_amdgcn_interp_mov : diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 92f701e01ff3..3c753260190e 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -1223,6 +1223,7 @@ public: // FIXME: Fix callers and remove condition on N. unsigned size() const { return N ? N->getNumOperands() : 0u; } + bool empty() const { return N ? N->getNumOperands() == 0 : true; } T *operator[](unsigned I) const { return cast_or_null(N->getOperand(I)); } // FIXME: Fix callers and remove condition on N. 
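A hedged sketch of constructing a call to one of the constrained intrinsics defined above (sqrt, as an example). The metadata strings "round.dynamic" and "fpexcept.strict" follow the convention of the existing constrained arithmetic intrinsics and are an assumption here, as is the helper name.

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Emits: call @llvm.experimental.constrained.sqrt.* (%x,
  //        metadata !"round.dynamic", metadata !"fpexcept.strict")
  static Value *emitConstrainedSqrt(IRBuilder<> &B, Module &M, Value *X) {
    LLVMContext &Ctx = M.getContext();
    Function *Sqrt = Intrinsic::getDeclaration(
        &M, Intrinsic::experimental_constrained_sqrt, {X->getType()});
    Value *Rounding =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
    Value *Except =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
    return B.CreateCall(Sqrt, {X, Rounding, Except});
  }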
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 3024d9e27a2f..5e1f680c5b36 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -139,9 +139,12 @@ public: /// during the append operation. AppendUnique = 6, + /// Takes the max of the two values, which are required to be integers. + Max = 7, + // Markers: ModFlagBehaviorFirstVal = Error, - ModFlagBehaviorLastVal = AppendUnique + ModFlagBehaviorLastVal = Max }; /// Checks if Metadata represents a valid ModFlagBehavior, and stores the diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 3df5244a0bd6..3ca21c15577b 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -144,6 +144,7 @@ void initializeGCMachineCodeAnalysisPass(PassRegistry&); void initializeGCModuleInfoPass(PassRegistry&); void initializeGCOVProfilerLegacyPassPass(PassRegistry&); void initializeGVNHoistLegacyPassPass(PassRegistry&); +void initializeGVNSinkLegacyPassPass(PassRegistry&); void initializeGVNLegacyPassPass(PassRegistry&); void initializeGlobalDCELegacyPassPass(PassRegistry&); void initializeGlobalMergePass(PassRegistry&); @@ -193,6 +194,7 @@ void initializeLiveVariablesPass(PassRegistry&); void initializeLoadCombinePass(PassRegistry&); void initializeLoadStoreVectorizerPass(PassRegistry&); void initializeLoaderPassPass(PassRegistry&); +void initializeLocalizerPass(PassRegistry&); void initializeLocalStackSlotPassPass(PassRegistry&); void initializeLoopAccessLegacyAnalysisPass(PassRegistry&); void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&); diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h index ede6637dfa4d..5ba8492db8f5 100644 --- a/include/llvm/LTO/Config.h +++ b/include/llvm/LTO/Config.h @@ -39,7 +39,7 @@ struct Config { std::string CPU; TargetOptions Options; std::vector MAttrs; - Reloc::Model RelocModel = Reloc::PIC_; + Optional RelocModel = Reloc::PIC_; CodeModel::Model CodeModel = CodeModel::Default; CodeGenOpt::Level CGOptLevel = CodeGenOpt::Default; TargetMachine::CodeGenFileType CGFileType = TargetMachine::CGFT_ObjectFile; diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index cf5d93ee9ed7..3f5a233c1ee1 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -95,9 +95,7 @@ public: return TypeID > ID_StartObjects && TypeID < ID_EndObjects; } - bool isSymbolic() const { - return isIR() || isObject(); - } + bool isSymbolic() const { return isIR() || isObject() || isCOFFImportFile(); } bool isArchive() const { return TypeID == ID_Archive; diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index 8b9b49737170..dafd1a43cb59 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -782,6 +782,7 @@ protected: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index d8b58b8079fa..ef2abd8c52ce 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -235,6 +235,7 @@ protected: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t 
getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; @@ -645,6 +646,17 @@ uint64_t ELFObjectFile::getSectionAddress(DataRefImpl Sec) const { return getSection(Sec)->sh_addr; } +template +uint64_t ELFObjectFile::getSectionIndex(DataRefImpl Sec) const { + auto SectionsOrErr = EF.sections(); + handleAllErrors(std::move(SectionsOrErr.takeError()), + [](const ErrorInfoBase &) { + llvm_unreachable("unable to get section index"); + }); + const Elf_Shdr *First = SectionsOrErr->begin(); + return getSection(Sec) - First; +} + template uint64_t ELFObjectFile::getSectionSize(DataRefImpl Sec) const { return getSection(Sec)->sh_size; diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index 29553558f72f..a4356d5977b2 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -290,6 +290,7 @@ public: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 9a7bc618ffd0..ea6a9049bc1b 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -95,6 +95,7 @@ public: std::error_code getName(StringRef &Result) const; uint64_t getAddress() const; + uint64_t getIndex() const; uint64_t getSize() const; std::error_code getContents(StringRef &Result) const; @@ -222,6 +223,7 @@ protected: virtual std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const = 0; virtual uint64_t getSectionAddress(DataRefImpl Sec) const = 0; + virtual uint64_t getSectionIndex(DataRefImpl Sec) const = 0; virtual uint64_t getSectionSize(DataRefImpl Sec) const = 0; virtual std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const = 0; @@ -393,6 +395,10 @@ inline uint64_t SectionRef::getAddress() const { return OwningObject->getSectionAddress(SectionPimpl); } +inline uint64_t SectionRef::getIndex() const { + return OwningObject->getSectionIndex(SectionPimpl); +} + inline uint64_t SectionRef::getSize() const { return OwningObject->getSectionSize(SectionPimpl); } diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index 86579b7c3e3a..348179860f3e 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -40,13 +40,13 @@ public: // TODO: Should handle multiple applied relocations via either passing in the // previously computed value or just count paired relocations as a single // visit. 
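An illustrative sketch (not from the patch; helper name assumed) of the SectionRef::getIndex() accessor added above, which forwards to the new per-format getSectionIndex() implementations.

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;
  using namespace llvm::object;

  static void dumpSectionIndices(const ObjectFile &Obj) {
    for (const SectionRef &Sec : Obj.sections()) {
      StringRef Name;
      Sec.getName(Name); // error intentionally ignored in this sketch
      outs() << Sec.getIndex() << ": " << Name << "\n";
    }
  }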
- uint64_t visit(uint32_t RelocType, RelocationRef R, uint64_t Value = 0) { + uint64_t visit(uint32_t Rel, RelocationRef R, uint64_t Value = 0) { if (isa(ObjToVisit)) - return visitELF(RelocType, R, Value); + return visitELF(Rel, R, Value); if (isa(ObjToVisit)) - return visitCOFF(RelocType, R, Value); + return visitCOFF(Rel, R, Value); if (isa(ObjToVisit)) - return visitMachO(RelocType, R, Value); + return visitMachO(Rel, R, Value); HasError = true; return 0; @@ -58,214 +58,60 @@ private: const ObjectFile &ObjToVisit; bool HasError = false; - uint64_t visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) { + uint64_t visitELF(uint32_t Rel, RelocationRef R, uint64_t Value) { if (ObjToVisit.getBytesInAddress() == 8) { // 64-bit object file switch (ObjToVisit.getArch()) { case Triple::x86_64: - switch (RelocType) { - case ELF::R_X86_64_NONE: - return visitELF_X86_64_NONE(R); - case ELF::R_X86_64_64: - return visitELF_X86_64_64(R, Value); - case ELF::R_X86_64_PC32: - return visitELF_X86_64_PC32(R, Value); - case ELF::R_X86_64_32: - return visitELF_X86_64_32(R, Value); - case ELF::R_X86_64_32S: - return visitELF_X86_64_32S(R, Value); - default: - HasError = true; - return 0; - } + return visitX86_64(Rel, R, Value); case Triple::aarch64: case Triple::aarch64_be: - switch (RelocType) { - case ELF::R_AARCH64_ABS32: - return visitELF_AARCH64_ABS32(R, Value); - case ELF::R_AARCH64_ABS64: - return visitELF_AARCH64_ABS64(R, Value); - default: - HasError = true; - return 0; - } + return visitAarch64(Rel, R, Value); case Triple::bpfel: case Triple::bpfeb: - switch (RelocType) { - case ELF::R_BPF_64_64: - return visitELF_BPF_64_64(R, Value); - case ELF::R_BPF_64_32: - return visitELF_BPF_64_32(R, Value); - default: - HasError = true; - return 0; - } + return visitBpf(Rel, R, Value); case Triple::mips64el: case Triple::mips64: - switch (RelocType) { - case ELF::R_MIPS_32: - return visitELF_MIPS64_32(R, Value); - case ELF::R_MIPS_64: - return visitELF_MIPS64_64(R, Value); - default: - HasError = true; - return 0; - } + return visitMips64(Rel, R, Value); case Triple::ppc64le: case Triple::ppc64: - switch (RelocType) { - case ELF::R_PPC64_ADDR32: - return visitELF_PPC64_ADDR32(R, Value); - case ELF::R_PPC64_ADDR64: - return visitELF_PPC64_ADDR64(R, Value); - default: - HasError = true; - return 0; - } + return visitPPC64(Rel, R, Value); case Triple::systemz: - switch (RelocType) { - case ELF::R_390_32: - return visitELF_390_32(R, Value); - case ELF::R_390_64: - return visitELF_390_64(R, Value); - default: - HasError = true; - return 0; - } + return visitSystemz(Rel, R, Value); case Triple::sparcv9: - switch (RelocType) { - case ELF::R_SPARC_32: - case ELF::R_SPARC_UA32: - return visitELF_SPARCV9_32(R, Value); - case ELF::R_SPARC_64: - case ELF::R_SPARC_UA64: - return visitELF_SPARCV9_64(R, Value); - default: - HasError = true; - return 0; - } + return visitSparc64(Rel, R, Value); case Triple::amdgcn: - switch (RelocType) { - case ELF::R_AMDGPU_ABS32: - return visitELF_AMDGPU_ABS32(R, Value); - case ELF::R_AMDGPU_ABS64: - return visitELF_AMDGPU_ABS64(R, Value); - default: - HasError = true; - return 0; - } + return visitAmdgpu(Rel, R, Value); default: HasError = true; return 0; } - } else if (ObjToVisit.getBytesInAddress() == 4) { // 32-bit object file - switch (ObjToVisit.getArch()) { - case Triple::x86: - switch (RelocType) { - case ELF::R_386_NONE: - return visitELF_386_NONE(R); - case ELF::R_386_32: - return visitELF_386_32(R, Value); - case ELF::R_386_PC32: - return visitELF_386_PC32(R, Value); - 
default: - HasError = true; - return 0; - } - case Triple::ppc: - switch (RelocType) { - case ELF::R_PPC_ADDR32: - return visitELF_PPC_ADDR32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::arm: - case Triple::armeb: - switch (RelocType) { - default: - HasError = true; - return 0; - case ELF::R_ARM_ABS32: - return visitELF_ARM_ABS32(R, Value); - } - case Triple::lanai: - switch (RelocType) { - case ELF::R_LANAI_32: - return visitELF_Lanai_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::mipsel: - case Triple::mips: - switch (RelocType) { - case ELF::R_MIPS_32: - return visitELF_MIPS_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::sparc: - switch (RelocType) { - case ELF::R_SPARC_32: - case ELF::R_SPARC_UA32: - return visitELF_SPARC_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::hexagon: - switch (RelocType) { - case ELF::R_HEX_32: - return visitELF_HEX_32(R, Value); - default: - HasError = true; - return 0; - } - default: - HasError = true; - return 0; - } - } else { - report_fatal_error("Invalid word size in object file"); } - } - uint64_t visitCOFF(uint32_t RelocType, RelocationRef R, uint64_t Value) { + // 32-bit object file + assert(ObjToVisit.getBytesInAddress() == 4 && + "Invalid word size in object file"); + switch (ObjToVisit.getArch()) { case Triple::x86: - switch (RelocType) { - case COFF::IMAGE_REL_I386_SECREL: - return visitCOFF_I386_SECREL(R, Value); - case COFF::IMAGE_REL_I386_DIR32: - return visitCOFF_I386_DIR32(R, Value); - } - break; - case Triple::x86_64: - switch (RelocType) { - case COFF::IMAGE_REL_AMD64_SECREL: - return visitCOFF_AMD64_SECREL(R, Value); - case COFF::IMAGE_REL_AMD64_ADDR64: - return visitCOFF_AMD64_ADDR64(R, Value); - } - break; + return visitX86(Rel, R, Value); + case Triple::ppc: + return visitPPC32(Rel, R, Value); + case Triple::arm: + case Triple::armeb: + return visitARM(Rel, R, Value); + case Triple::lanai: + return visitLanai(Rel, R, Value); + case Triple::mipsel: + case Triple::mips: + return visitMips32(Rel, R, Value); + case Triple::sparc: + return visitSparc32(Rel, R, Value); + case Triple::hexagon: + return visitHexagon(Rel, R, Value); + default: + HasError = true; + return 0; } - HasError = true; - return 0; - } - - uint64_t visitMachO(uint32_t RelocType, RelocationRef R, uint64_t Value) { - switch (ObjToVisit.getArch()) { - default: break; - case Triple::x86_64: - switch (RelocType) { - default: break; - case MachO::X86_64_RELOC_UNSIGNED: - return visitMACHO_X86_64_UNSIGNED(R, Value); - } - } - HasError = true; - return 0; } int64_t getELFAddend(RelocationRef R) { @@ -275,176 +121,193 @@ private: return *AddendOrErr; } - /// Operations - - /// 386-ELF - uint64_t visitELF_386_NONE(RelocationRef R) { + uint64_t visitX86_64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_X86_64_NONE: + return 0; + case ELF::R_X86_64_64: + return Value + getELFAddend(R); + case ELF::R_X86_64_PC32: + return Value + getELFAddend(R) - R.getOffset(); + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + } + HasError = true; return 0; } - // Ideally the Addend here will be the addend in the data for - // the relocation. It's not actually the case for Rel relocations. 
- uint64_t visitELF_386_32(RelocationRef R, uint64_t Value) { - return Value; - } - - uint64_t visitELF_386_PC32(RelocationRef R, uint64_t Value) { - return Value - R.getOffset(); - } - - /// X86-64 ELF - uint64_t visitELF_X86_64_NONE(RelocationRef R) { + uint64_t visitAarch64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_AARCH64_ABS32: { + int64_t Res = Value + getELFAddend(R); + if (Res < INT32_MIN || Res > UINT32_MAX) + HasError = true; + return static_cast(Res); + } + case ELF::R_AARCH64_ABS64: + return Value + getELFAddend(R); + } + HasError = true; return 0; } - uint64_t visitELF_X86_64_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitBpf(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_BPF_64_32: + return Value & 0xFFFFFFFF; + case ELF::R_BPF_64_64: + return Value; + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_PC32(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R) - R.getOffset(); + uint64_t visitMips64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_MIPS_32: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + case ELF::R_MIPS_64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitPPC64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_PPC64_ADDR32: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + case ELF::R_PPC64_ADDR64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_32S(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitSystemz(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_390_32: { + int64_t Res = Value + getELFAddend(R); + if (Res < INT32_MIN || Res > UINT32_MAX) + HasError = true; + return static_cast(Res); + } + case ELF::R_390_64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - /// BPF ELF - uint64_t visitELF_BPF_64_32(RelocationRef R, uint64_t Value) { - return Value & 0xFFFFFFFF; + uint64_t visitSparc64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_SPARC_32: + case ELF::R_SPARC_64: + case ELF::R_SPARC_UA32: + case ELF::R_SPARC_UA64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_BPF_64_64(RelocationRef R, uint64_t Value) { - return Value; + uint64_t visitAmdgpu(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_AMDGPU_ABS32: + case ELF::R_AMDGPU_ABS64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - /// PPC64 ELF - uint64_t visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitX86(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_386_NONE: + return 0; + case ELF::R_386_32: + return Value; + case ELF::R_386_PC32: + return Value - R.getOffset(); + } + HasError = true; + return 0; } - uint64_t visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitPPC32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_PPC_ADDR32) + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + HasError = true; + return 0; } - /// PPC32 ELF - uint64_t visitELF_PPC_ADDR32(RelocationRef R, uint64_t Value) { - 
return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitARM(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_ARM_ABS32) { + if ((int64_t)Value < INT32_MIN || (int64_t)Value > UINT32_MAX) + HasError = true; + return static_cast(Value); + } + HasError = true; + return 0; } - /// Lanai ELF - uint64_t visitELF_Lanai_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitLanai(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_LANAI_32) + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + HasError = true; + return 0; } - /// MIPS ELF - uint64_t visitELF_MIPS_32(RelocationRef R, uint64_t Value) { - return Value & 0xFFFFFFFF; + uint64_t visitMips32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_MIPS_32) + return Value & 0xFFFFFFFF; + HasError = true; + return 0; } - /// MIPS64 ELF - uint64_t visitELF_MIPS64_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitSparc32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_SPARC_32 || Rel == ELF::R_SPARC_UA32) + return Value + getELFAddend(R); + HasError = true; + return 0; } - uint64_t visitELF_MIPS64_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitHexagon(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_HEX_32) + return Value + getELFAddend(R); + HasError = true; + return 0; } - // AArch64 ELF - uint64_t visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - int64_t Res = Value + Addend; - - // Overflow check allows for both signed and unsigned interpretation. - if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); + uint64_t visitCOFF(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (ObjToVisit.getArch()) { + case Triple::x86: + switch (Rel) { + case COFF::IMAGE_REL_I386_SECREL: + case COFF::IMAGE_REL_I386_DIR32: + return static_cast(Value); + } + break; + case Triple::x86_64: + switch (Rel) { + case COFF::IMAGE_REL_AMD64_SECREL: + return static_cast(Value); + case COFF::IMAGE_REL_AMD64_ADDR64: + return Value; + } + break; + } + HasError = true; + return 0; } - uint64_t visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - // SystemZ ELF - uint64_t visitELF_390_32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - int64_t Res = Value + Addend; - - // Overflow check allows for both signed and unsigned interpretation. - if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); - } - - uint64_t visitELF_390_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARC_32(RelocationRef R, uint32_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARCV9_32(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARCV9_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_ARM_ABS32(RelocationRef R, uint64_t Value) { - int64_t Res = Value; - - // Overflow check allows for both signed and unsigned interpretation. 
- if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); - } - - uint64_t visitELF_HEX_32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - uint64_t visitELF_AMDGPU_ABS32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - uint64_t visitELF_AMDGPU_ABS64(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - /// I386 COFF - uint64_t visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - uint64_t visitCOFF_I386_DIR32(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - /// AMD64 COFF - uint64_t visitCOFF_AMD64_SECREL(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - uint64_t visitCOFF_AMD64_ADDR64(RelocationRef R, uint64_t Value) { - return Value; - } - - // X86_64 MachO - uint64_t visitMACHO_X86_64_UNSIGNED(RelocationRef R, uint64_t Value) { - return Value; + uint64_t visitMachO(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (ObjToVisit.getArch() == Triple::x86_64 && + Rel == MachO::X86_64_RELOC_UNSIGNED) + return Value; + HasError = true; + return 0; } }; diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index d200d4a148e3..de54a4928cce 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -119,6 +119,7 @@ public: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Option/OptTable.h b/include/llvm/Option/OptTable.h index 390e52774fea..8a323a255ca1 100644 --- a/include/llvm/Option/OptTable.h +++ b/include/llvm/Option/OptTable.h @@ -113,6 +113,14 @@ public: return getInfo(id).MetaVar; } + /// Find flags from OptTable which starts with Cur. + /// + /// \param [in] Cur - String prefix that all returned flags need + // to start with. + /// + /// \return The vector of flags which start with Cur. + std::vector findByPrefix(StringRef Cur) const; + /// \brief Parse a single argument; returning the new argument and /// updating Index. /// diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 1b07c33746e7..0dbb2cf9f269 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -212,12 +212,12 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, /// third field is the uncompressed strings; otherwise it is the /// compressed string. When the string compression is off, the /// second field will have value zero. -Error collectPGOFuncNameStrings(const std::vector &NameStrs, +Error collectPGOFuncNameStrings(ArrayRef NameStrs, bool doCompression, std::string &Result); /// Produce \c Result string with the same format described above. The input /// is vector of PGO function name variables that are referenced. -Error collectPGOFuncNameStrings(const std::vector &NameVars, +Error collectPGOFuncNameStrings(ArrayRef NameVars, std::string &Result, bool doCompression = true); /// \c NameStrings is a string composed of one of more sub-strings encoded in @@ -967,7 +967,7 @@ struct Header { } // end namespace RawInstrProf // Parse MemOP Size range option. 
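The RelocVisitor rewrite above folds the per-relocation-type helpers into per-architecture visit functions; the public entry point keeps its shape. A hedged usage sketch, assuming the pre-existing RelocVisitor constructor and error() accessor, which this patch does not touch:

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Object/RelocVisitor.h"
  using namespace llvm;
  using namespace llvm::object;

  static uint64_t applyReloc(const ObjectFile &Obj, const RelocationRef &R,
                             uint64_t SymbolValue) {
    RelocVisitor V(Obj);
    uint64_t Result =
        V.visit(static_cast<uint32_t>(R.getType()), R, SymbolValue);
    return V.error() ? 0 : Result; // error() assumed from the existing class
  }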
-void getMemOPSizeRangeFromOption(std::string Str, int64_t &RangeStart, +void getMemOPSizeRangeFromOption(StringRef Str, int64_t &RangeStart, int64_t &RangeLast); } // end namespace llvm diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index fef5bf304566..d14a56cb87e0 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -671,7 +671,7 @@ public: /// [AL, AH, CL] - Represent a list of defs /// class ListInit final : public TypedInit, public FoldingSetNode, - public TrailingObjects { + public TrailingObjects { unsigned NumValues; public: @@ -1137,17 +1137,19 @@ public: /// to have at least one value then a (possibly empty) list of arguments. Each /// argument can have a name associated with it. /// -class DagInit : public TypedInit, public FoldingSetNode { +class DagInit final : public TypedInit, public FoldingSetNode, + public TrailingObjects { Init *Val; StringInit *ValName; - SmallVector Args; - SmallVector ArgNames; + unsigned NumArgs; + unsigned NumArgNames; - DagInit(Init *V, StringInit *VN, ArrayRef ArgRange, - ArrayRef NameRange) + DagInit(Init *V, StringInit *VN, unsigned NumArgs, unsigned NumArgNames) : TypedInit(IK_DagInit, DagRecTy::get()), Val(V), ValName(VN), - Args(ArgRange.begin(), ArgRange.end()), - ArgNames(NameRange.begin(), NameRange.end()) {} + NumArgs(NumArgs), NumArgNames(NumArgNames) {} + + friend TrailingObjects; + size_t numTrailingObjects(OverloadToken) const { return NumArgs; } public: DagInit(const DagInit &Other) = delete; @@ -1173,20 +1175,24 @@ public: return ValName ? ValName->getValue() : StringRef(); } - unsigned getNumArgs() const { return Args.size(); } + unsigned getNumArgs() const { return NumArgs; } Init *getArg(unsigned Num) const { - assert(Num < Args.size() && "Arg number out of range!"); - return Args[Num]; + assert(Num < NumArgs && "Arg number out of range!"); + return getTrailingObjects()[Num]; } StringInit *getArgName(unsigned Num) const { - assert(Num < ArgNames.size() && "Arg number out of range!"); - return ArgNames[Num]; + assert(Num < NumArgNames && "Arg number out of range!"); + return getTrailingObjects()[Num]; } StringRef getArgNameStr(unsigned Num) const { StringInit *Init = getArgName(Num); return Init ? 
Init->getValue() : StringRef(); } + ArrayRef getArgNames() const { + return makeArrayRef(getTrailingObjects(), NumArgNames); + } + Init *resolveReferences(Record &R, const RecordVal *RV) const override; std::string getAsString() const override; @@ -1194,20 +1200,20 @@ public: typedef SmallVectorImpl::const_iterator const_arg_iterator; typedef SmallVectorImpl::const_iterator const_name_iterator; - inline const_arg_iterator arg_begin() const { return Args.begin(); } - inline const_arg_iterator arg_end () const { return Args.end(); } + inline const_arg_iterator arg_begin() const { return getTrailingObjects(); } + inline const_arg_iterator arg_end () const { return arg_begin() + NumArgs; } inline iterator_range args() const { return llvm::make_range(arg_begin(), arg_end()); } - inline size_t arg_size () const { return Args.size(); } - inline bool arg_empty() const { return Args.empty(); } + inline size_t arg_size () const { return NumArgs; } + inline bool arg_empty() const { return NumArgs == 0; } - inline const_name_iterator name_begin() const { return ArgNames.begin(); } - inline const_name_iterator name_end () const { return ArgNames.end(); } + inline const_name_iterator name_begin() const { return getTrailingObjects(); } + inline const_name_iterator name_end () const { return name_begin() + NumArgNames; } - inline size_t name_size () const { return ArgNames.size(); } - inline bool name_empty() const { return ArgNames.empty(); } + inline size_t name_size () const { return NumArgNames; } + inline bool name_empty() const { return NumArgNames == 0; } Init *getBit(unsigned Bit) const override { llvm_unreachable("Illegal bit reference off dag"); diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 1ca32d4c3589..17182b958ecb 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -405,7 +405,9 @@ public: } /// Returns if it's reasonable to merge stores to MemVT size. - virtual bool canMergeStoresTo(EVT MemVT) const { return true; } + virtual bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const { + return true; + } /// \brief Return true if it is cheap to speculate a call to intrinsic cttz. virtual bool isCheapToSpeculateCttz() const { @@ -736,7 +738,7 @@ public: if (VT.isExtended()) return Expand; // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. - if (Op > array_lengthof(OpActions[0])) return Custom; + if (Op >= array_lengthof(OpActions[0])) return Custom; return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index ba0a3ee1287a..856c288a071f 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -354,6 +354,13 @@ FunctionPass *createEarlyCSEPass(bool UseMemorySSA = false); // FunctionPass *createGVNHoistPass(); +//===----------------------------------------------------------------------===// +// +// GVNSink - This pass uses an "inverted" value numbering to decide the +// similarity of expressions and sinks similar expressions into successors. +// +FunctionPass *createGVNSinkPass(); + //===----------------------------------------------------------------------===// // // MergedLoadStoreMotion - This pass merges loads and stores in diamonds. 
Loads diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index 8f05e8cdb233..3f97789cabbc 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -68,6 +68,24 @@ public: class ValueTable { DenseMap valueNumbering; DenseMap expressionNumbering; + + // Expressions is the vector of Expression. ExprIdx is the mapping from + // value number to the index of Expression in Expressions. We use it + // instead of a DenseMap because filling such mapping is faster than + // filling a DenseMap and the compile time is a little better. + uint32_t nextExprNumber; + std::vector Expressions; + std::vector ExprIdx; + // Value number to PHINode mapping. Used for phi-translate in scalarpre. + DenseMap NumberingPhi; + // Cache for phi-translate in scalarpre. + typedef DenseMap, uint32_t> + PhiTranslateMap; + PhiTranslateMap PhiTranslateTable; + // Map the block to reversed postorder traversal number. It is used to + // find back edge easily. + DenseMap BlockRPONumber; + AliasAnalysis *AA; MemoryDependenceResults *MD; DominatorTree *DT; @@ -79,6 +97,10 @@ public: Value *LHS, Value *RHS); Expression createExtractvalueExpr(ExtractValueInst *EI); uint32_t lookupOrAddCall(CallInst *C); + uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn); + std::pair assignExpNewValueNum(Expression &exp); + bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn); public: ValueTable(); @@ -87,9 +109,12 @@ public: ~ValueTable(); uint32_t lookupOrAdd(Value *V); - uint32_t lookup(Value *V) const; + uint32_t lookup(Value *V, bool Verify = true) const; uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + uint32_t phiTranslate(const BasicBlock *BB, const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn); + void assignBlockRPONumber(Function &F); bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); @@ -238,7 +263,12 @@ struct GVNHoistPass : PassInfoMixin { /// \brief Run the pass over the function. PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; - +/// \brief Uses an "inverted" value numbering to decide the similarity of +/// expressions and sinks similar expressions into successors. +struct GVNSinkPass : PassInfoMixin { + /// \brief Run the pass over the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; } #endif diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index b5a5f4c2704c..8942111307ff 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -356,6 +356,10 @@ void combineMetadata(Instruction *K, const Instruction *J, ArrayRef Kn /// Unknown metadata is removed. void combineMetadataForCSE(Instruction *K, const Instruction *J); +// Replace each use of 'From' with 'To', if that use does not belong to basic +// block where 'From' is defined. Returns the number of replacements made. +unsigned replaceNonLocalUsesWith(Instruction *From, Value *To); + /// Replace each use of 'From' with 'To' if that use is dominated by /// the given edge. Returns the number of replacements made. 
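A minimal sketch of scheduling the new GVNSink pass declared above, under both pass managers (the wrapper function is illustrative):

  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/GVN.h"
  using namespace llvm;

  static void addSinkPass(FunctionPassManager &FPM) {
    FPM.addPass(GVNSinkPass()); // new-PM wrapper declared in GVN.h above
  }
  // Legacy pass manager equivalent, using the factory from Scalar.h:
  //   PM.add(createGVNSinkPass());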
unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, @@ -406,6 +410,14 @@ bool recognizeBSwapOrBitReverseIdiom( void maybeMarkSanitizerLibraryCallNoBuiltin(CallInst *CI, const TargetLibraryInfo *TLI); +//===----------------------------------------------------------------------===// +// Transform predicates +// + +/// Given an instruction, is it legal to set operand OpIdx to a non-constant +/// value? +bool canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx); + } // End llvm namespace #endif diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 0ca712bbfe70..79517ec6a3a8 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -687,11 +687,8 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1, // bits. if (Opc == Instruction::And) { - unsigned BitWidth = DL.getTypeSizeInBits(Op0->getType()->getScalarType()); - KnownBits Known0(BitWidth); - KnownBits Known1(BitWidth); - computeKnownBits(Op0, Known0, DL); - computeKnownBits(Op1, Known1, DL); + KnownBits Known0 = computeKnownBits(Op0, DL); + KnownBits Known1 = computeKnownBits(Op1, DL); if ((Known1.One | Known0.Zero).isAllOnesValue()) { // All the bits of Op0 that the 'and' could be masking are already zero. return Op0; diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 2e72d5aa8269..122442bafb11 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -688,9 +688,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, if (isNUW) return Op0; - unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.Zero.isMaxSignedValue()) { // Op1 is either 0 or the minimum signed value. If the sub is NSW, then // Op1 must be 0 because negating the minimum signed value is undefined. @@ -1309,15 +1307,13 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If any bits in the shift amount make that value greater than or equal to // the number of bits in the type, the shift is undefined. - unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (Known.One.getLimitedValue() >= BitWidth) + KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (Known.One.getLimitedValue() >= Known.getBitWidth()) return UndefValue::get(Op0->getType()); // If all valid bits in the shift amount are known zero, the first operand is // unchanged. - unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth); + unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth()); if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; @@ -1343,9 +1339,7 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, // The low bit cannot be shifted out of an exact shift if it is set. 
if (isExact) { - unsigned BitWidth = Op0->getType()->getScalarSizeInBits(); - KnownBits Op0Known(BitWidth); - computeKnownBits(Op0, Op0Known, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } @@ -1428,6 +1422,8 @@ Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; @@ -1560,20 +1556,8 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; } -/// Commuted variants are assumed to be handled by calling this function again -/// with the parameters swapped. -static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) - return X; - - if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) - return X; - - if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) - return X; - +static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) & (icmp V, C0) - Type *ITy = Op0->getType(); ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; @@ -1587,6 +1571,7 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; + Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); @@ -1617,18 +1602,29 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// Commuted variants are assumed to be handled by calling this function again -/// with the parameters swapped. -static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) +static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) + return X; + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) return X; - if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) + if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) + return X; + if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0)) return X; - if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) + if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) return X; + if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1)) + return X; + if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0)) + return X; + + return nullptr; +} + +static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) | (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; @@ -1674,19 +1670,24 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -static Value *simplifyPossiblyCastedAndOrOfICmps(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool IsAnd, CastInst *Cast) { - Value *V = - IsAnd ? 
simplifyAndOfICmps(Cmp0, Cmp1) : simplifyOrOfICmps(Cmp0, Cmp1); - if (!V) - return nullptr; - if (!Cast) - return V; +static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) + return X; + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false)) + return X; - // If we looked through casts, we can only handle a constant simplification - // because we are not allowed to create a cast instruction here. - if (auto *C = dyn_cast(V)) - return ConstantExpr::getCast(Cast->getOpcode(), C, Cast->getType()); + if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) + return X; + if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0)) + return X; + + if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) + return X; + + if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1)) + return X; + if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0)) + return X; return nullptr; } @@ -1706,11 +1707,18 @@ static Value *simplifyAndOrOfICmps(Value *Op0, Value *Op1, bool IsAnd) { if (!Cmp0 || !Cmp1) return nullptr; - if (Value *V = simplifyPossiblyCastedAndOrOfICmps(Cmp0, Cmp1, IsAnd, Cast0)) - return V; - if (Value *V = simplifyPossiblyCastedAndOrOfICmps(Cmp1, Cmp0, IsAnd, Cast0)) + Value *V = + IsAnd ? simplifyAndOfICmps(Cmp0, Cmp1) : simplifyOrOfICmps(Cmp0, Cmp1); + if (!V) + return nullptr; + if (!Cast0) return V; + // If we looked through casts, we can only handle a constant simplification + // because we are not allowed to create a cast instruction here. + if (auto *C = dyn_cast(V)) + return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType()); + return nullptr; } @@ -1927,37 +1935,27 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, MaxRecurse)) return V; - // (A & C)|(B & D) - Value *C = nullptr, *D = nullptr; - if (match(Op0, m_And(m_Value(A), m_Value(C))) && - match(Op1, m_And(m_Value(B), m_Value(D)))) { - ConstantInt *C1 = dyn_cast(C); - ConstantInt *C2 = dyn_cast(D); - if (C1 && C2 && (C1->getValue() == ~C2->getValue())) { + // (A & C1)|(B & C2) + const APInt *C1, *C2; + if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && + match(Op1, m_And(m_Value(B), m_APInt(C2)))) { + if (*C1 == ~*C2) { // (A & C1)|(B & C2) // If we have: ((V + N) & C1) | (V & C2) // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 // replace with V+N. - Value *V1, *V2; - if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+ - match(A, m_Add(m_Value(V1), m_Value(V2)))) { + Value *N; + if (C2->isMask() && // C2 == 0+1+ + match(A, m_c_Add(m_Specific(B), m_Value(N)))) { // Add commutes, try both ways. - if (V1 == B && - MaskedValueIsZero(V2, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) - return A; - if (V2 == B && - MaskedValueIsZero(V1, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return A; } // Or commutes, try both ways. - if ((C1->getValue() & (C1->getValue() + 1)) == 0 && - match(B, m_Add(m_Value(V1), m_Value(V2)))) { + if (C1->isMask() && + match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. 
- if (V1 == A && - MaskedValueIsZero(V2, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) - return B; - if (V2 == A && - MaskedValueIsZero(V1, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; } } @@ -3372,9 +3370,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (ICmpInst::isEquality(Pred)) { const APInt *RHSVal; if (match(RHS, m_APInt(RHSVal))) { - unsigned BitWidth = RHSVal->getBitWidth(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.Zero.intersects(*RHSVal) || !LHSKnown.One.isSubsetOf(*RHSVal)) return Pred == ICmpInst::ICMP_EQ ? ConstantInt::getFalse(ITy) @@ -3539,6 +3535,10 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (V == Op) return RepOp; + // We cannot replace a constant, and shouldn't even try. + if (isa(Op)) + return nullptr; + auto *I = dyn_cast(V); if (!I) return nullptr; @@ -4444,19 +4444,21 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: { // X + undef -> undef - if (isa(RHS)) + if (isa(LHS) || isa(RHS)) return UndefValue::get(ReturnType); return nullptr; } case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: { + // 0 * X -> { 0, false } // X * 0 -> { 0, false } - if (match(RHS, m_Zero())) + if (match(LHS, m_Zero()) || match(RHS, m_Zero())) return Constant::getNullValue(ReturnType); + // undef * X -> { 0, false } // X * undef -> { 0, false } - if (match(RHS, m_Undef())) + if (match(LHS, m_Undef()) || match(RHS, m_Undef())) return Constant::getNullValue(ReturnType); return nullptr; @@ -4680,9 +4682,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, // In general, it is possible for computeKnownBits to determine all bits in a // value even when the operands are not all constants. 
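Several hunks above and below switch from the two-step pattern (pre-size a KnownBits, then populate it by reference) to the value-returning computeKnownBits overload. A minimal sketch of the new call style (helper name is illustrative):

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  static bool hasKnownOneBit(const Value *V, const DataLayout &DL) {
    // The bit width is taken from V's type; no pre-sized KnownBits needed.
    KnownBits Known = computeKnownBits(V, DL);
    return Known.One.getBoolValue();
  }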
if (!Result && I->getType()->isIntOrIntVectorTy()) { - unsigned BitWidth = I->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(I, Known, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); + KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); if (Known.isConstant()) Result = ConstantInt::get(I->getType(), Known.getConstant()); } diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 471ccb62970d..e6391792bc23 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -534,9 +534,7 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, VectorType *VecTy = dyn_cast(V->getType()); if (!VecTy) { - unsigned BitWidth = V->getType()->getIntegerBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(V, Known, DL, 0, AC, dyn_cast(V), DT); + KnownBits Known = computeKnownBits(V, DL, 0, AC, dyn_cast(V), DT); return Known.isZero(); } @@ -550,14 +548,12 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, // For a vector, KnownZero will only be true if all values are zero, so check // this per component - unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth(); for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) { Constant *Elem = C->getAggregateElement(I); if (isa(Elem)) return true; - KnownBits Known(BitWidth); - computeKnownBits(Elem, Known, DL); + KnownBits Known = computeKnownBits(Elem, DL); if (Known.isZero()) return true; } diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 0b5f6266e373..e988f6444a58 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -73,30 +73,23 @@ LPPassManager::LPPassManager() CurrentLoop = nullptr; } -// Inset loop into loop nest (LoopInfo) and loop queue (LQ). -Loop &LPPassManager::addLoop(Loop *ParentLoop) { - // Create a new loop. LI will take ownership. - Loop *L = new Loop(); - - // Insert into the loop nest and the loop queue. - if (!ParentLoop) { +// Insert loop into loop nest (LoopInfo) and loop queue (LQ). +void LPPassManager::addLoop(Loop &L) { + if (!L.getParentLoop()) { // This is the top level loop. - LI->addTopLevelLoop(L); - LQ.push_front(L); - return *L; + LQ.push_front(&L); + return; } - ParentLoop->addChildLoop(L); // Insert L into the loop queue after the parent loop. for (auto I = LQ.begin(), E = LQ.end(); I != E; ++I) { - if (*I == L->getParentLoop()) { + if (*I == L.getParentLoop()) { // deque does not support insert after. ++I; - LQ.insert(I, 1, L); - break; + LQ.insert(I, 1, &L); + return; } } - return *L; } /// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 78ded8141c08..d280fda0a162 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -2178,6 +2178,63 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, return Flags; } +bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L, + DominatorTree &DT, LoopInfo &LI) { + if (!isLoopInvariant(S, L)) + return false; + // If a value depends on a SCEVUnknown which is defined after the loop, we + // conservatively assume that we cannot calculate it at the loop's entry. 
+ struct FindDominatedSCEVUnknown { + bool Found = false; + const Loop *L; + DominatorTree &DT; + LoopInfo &LI; + + FindDominatedSCEVUnknown(const Loop *L, DominatorTree &DT, LoopInfo &LI) + : L(L), DT(DT), LI(LI) {} + + bool checkSCEVUnknown(const SCEVUnknown *SU) { + if (auto *I = dyn_cast(SU->getValue())) { + if (DT.dominates(L->getHeader(), I->getParent())) + Found = true; + else + assert(DT.dominates(I->getParent(), L->getHeader()) && + "No dominance relationship between SCEV and loop?"); + } + return false; + } + + bool follow(const SCEV *S) { + switch (static_cast(S->getSCEVType())) { + case scConstant: + return false; + case scAddRecExpr: + case scTruncate: + case scZeroExtend: + case scSignExtend: + case scAddExpr: + case scMulExpr: + case scUMaxExpr: + case scSMaxExpr: + case scUDivExpr: + return true; + case scUnknown: + return checkSCEVUnknown(cast(S)); + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + return false; + } + + bool isDone() { return Found; } + }; + + FindDominatedSCEVUnknown FSU(L, DT, LI); + SCEVTraversal ST(FSU); + ST.visitAll(S); + return !FSU.Found; +} + /// Get a canonical add expression, or something simpler if possible. const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags, @@ -2459,7 +2516,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (isLoopInvariant(Ops[i], AddRecLoop)) { + if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; @@ -2734,7 +2791,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (isLoopInvariant(Ops[i], AddRecLoop)) { + if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; @@ -4648,10 +4705,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) { if (const SCEVUnknown *U = dyn_cast(S)) { // For a SCEVUnknown, ask ValueTracking. - unsigned BitWidth = getTypeSizeInBits(U->getType()); - KnownBits Known(BitWidth); - computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC, - nullptr, &DT); + KnownBits Known = computeKnownBits(U->getValue(), getDataLayout(), 0, &AC, nullptr, &DT); return Known.countMinTrailingZeros(); } @@ -4831,8 +4885,7 @@ ScalarEvolution::getRange(const SCEV *S, const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. 
- KnownBits Known(BitWidth); - computeKnownBits(U->getValue(), Known, DL, 0, &AC, nullptr, &DT); + KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT); if (Known.One != ~Known.Zero + 1) ConservativeResult = ConservativeResult.intersectWith(ConstantRange(Known.One, @@ -9537,8 +9590,11 @@ struct SCEVCollectAddRecMultiplies { bool HasAddRec = false; SmallVector Operands; for (auto Op : Mul->operands()) { - if (isa(Op)) { + const SCEVUnknown *Unknown = dyn_cast(Op); + if (Unknown && !isa(Unknown->getValue())) { Operands.push_back(Op); + } else if (Unknown) { + HasAddRec = true; } else { bool ContainsAddRec; SCEVHasAddRec ContiansAddRec(ContainsAddRec); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 86cbd79aa84e..f9b9df2bc707 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1305,12 +1305,17 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Expand the core addrec. If we need post-loop scaling, force it to // expand to an integer type to avoid the need for additional casting. Type *ExpandTy = PostLoopScale ? IntTy : STy; + // We can't use a pointer type for the addrec if the pointer type is + // non-integral. + Type *AddRecPHIExpandTy = + DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy; + // In some cases, we decide to reuse an existing phi node but need to truncate // it and/or invert the step. Type *TruncTy = nullptr; bool InvertStep = false; - PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy, - TruncTy, InvertStep); + PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy, + IntTy, TruncTy, InvertStep); // Accommodate post-inc mode, if necessary. Value *Result; @@ -1383,8 +1388,15 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Re-apply any non-loop-dominating offset. 
if (PostLoopOffset) { if (PointerType *PTy = dyn_cast(ExpandTy)) { - const SCEV *const OffsetArray[1] = { PostLoopOffset }; - Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result); + if (Result->getType()->isIntegerTy()) { + Value *Base = expandCodeFor(PostLoopOffset, ExpandTy); + const SCEV *const OffsetArray[1] = {SE.getUnknown(Result)}; + Result = expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Base); + } else { + const SCEV *const OffsetArray[1] = {PostLoopOffset}; + Result = + expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Result); + } } else { Result = InsertNoopCastOfTo(Result, IntTy); Result = Builder.CreateAdd(Result, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 8a5d10473662..7a8d4f3be24f 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -149,6 +149,10 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const { return TTIImpl->isLegalMaskedGather(DataType); } +bool TargetTransformInfo::prefersVectorizedAddressing() const { + return TTIImpl->prefersVectorizedAddressing(); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 8e6c1096eec8..bd79cd56a18b 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -149,8 +149,10 @@ static KnownBits computeKnownBits(const Value *V, unsigned Depth, KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, - const DominatorTree *DT) { - return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); + const DominatorTree *DT, + OptimizationRemarkEmitter *ORE) { + return ::computeKnownBits(V, Depth, + Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1f8b50342c2d..c1d81ac203a1 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -660,10 +660,12 @@ void ModuleBitcodeWriter::writeAttributeTable() { SmallVector Record; for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - const AttributeList &A = Attrs[i]; - for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) - Record.push_back( - VE.getAttributeGroupID({A.getSlotIndex(i), A.getSlotAttributes(i)})); + AttributeList AL = Attrs[i]; + for (unsigned i = AL.index_begin(), e = AL.index_end(); i != e; ++i) { + AttributeSet AS = AL.getAttributes(i); + if (AS.hasAttributes()) + Record.push_back(VE.getAttributeGroupID({i, AS})); + } Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); Record.clear(); @@ -3413,30 +3415,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // Create value IDs for undefined references. forEachSummary([&](GVInfo I) { - if (auto *VS = dyn_cast(I.second)) { - for (auto &RI : VS->refs()) - assignValueId(RI.getGUID()); - return; - } - - auto *FS = dyn_cast(I.second); - if (!FS) - return; - for (auto &RI : FS->refs()) + for (auto &RI : I.second->refs()) assignValueId(RI.getGUID()); - - for (auto &EI : FS->calls()) { - GlobalValue::GUID GUID = EI.first.getGUID(); - if (!hasValueId(GUID)) { - // For SamplePGO, the indirect call targets for local functions will - // have its original name annotated in profile. 
We try to find the - // corresponding PGOFuncName as the GUID. - GUID = Index.getGUIDFromOriginalID(GUID); - if (GUID == 0 || !hasValueId(GUID)) - continue; - } - assignValueId(GUID); - } }); for (const auto &GVI : valueIds()) { diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index fd76400331d9..bb626baabd12 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -902,8 +902,11 @@ void ValueEnumerator::EnumerateAttributes(AttributeList PAL) { } // Do lookups for all attribute groups. - for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) { - IndexAndAttrSet Pair = {PAL.getSlotIndex(i), PAL.getSlotAttributes(i)}; + for (unsigned i = PAL.index_begin(), e = PAL.index_end(); i != e; ++i) { + AttributeSet AS = PAL.getAttributes(i); + if (!AS.hasAttributes()) + continue; + IndexAndAttrSet Pair = {i, AS}; unsigned &Entry = AttributeGroupMap[Pair]; if (Entry == 0) { AttributeGroups.push_back(Pair); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7ddb86d80bf0..d72cf5922987 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -628,12 +628,15 @@ void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, /// EmitFunctionHeader - This method emits the header for the current /// function. void AsmPrinter::EmitFunctionHeader() { + const Function *F = MF->getFunction(); + + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- Begin function " << F->getName() << '\n'; + // Print out constants referenced by the function EmitConstantPool(); // Print the 'header' of function. - const Function *F = MF->getFunction(); - OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(F, TM)); EmitVisibility(CurrentFnSym, F->getVisibility()); @@ -1107,6 +1110,9 @@ void AsmPrinter::EmitFunctionBody() { HI.Handler->endFunction(MF); } + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- End function\n"; + OutStreamer->AddBlankLine(); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 1b39e46ee466..114aea391a86 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1025,11 +1025,11 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { bool EmptyPrologue = true; for (const auto &MBB : *MF) { for (const auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) { PrologEndLoc = MI.getDebugLoc(); break; - } else if (!MI.isDebugValue()) { + } else if (!MI.isMetaInstruction()) { EmptyPrologue = false; } } @@ -1562,7 +1562,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) { EnumeratorCount++; } } - FTI = FLRB.end(); + FTI = FLRB.end(true); } std::string FullName = getFullyQualifiedName(Ty); @@ -1869,7 +1869,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { MemberCount++; } - TypeIndex FieldTI = FLBR.end(); + TypeIndex FieldTI = FLBR.end(true); return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount, !Info.NestedClasses.empty()); } diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index 8e3b88d0af0e..201030f0ac5c 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -116,65 +116,17 @@ void DIEHash::addParentContext(const DIE &Parent) { // Collect all of the attributes for a particular 
DIE in single structure. void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) { -#define COLLECT_ATTR(NAME) \ - case dwarf::NAME: \ - Attrs.NAME = V; \ - break for (const auto &V : Die.values()) { DEBUG(dbgs() << "Attribute: " << dwarf::AttributeString(V.getAttribute()) << " added.\n"); switch (V.getAttribute()) { - COLLECT_ATTR(DW_AT_name); - COLLECT_ATTR(DW_AT_accessibility); - COLLECT_ATTR(DW_AT_address_class); - COLLECT_ATTR(DW_AT_allocated); - COLLECT_ATTR(DW_AT_artificial); - COLLECT_ATTR(DW_AT_associated); - COLLECT_ATTR(DW_AT_binary_scale); - COLLECT_ATTR(DW_AT_bit_offset); - COLLECT_ATTR(DW_AT_bit_size); - COLLECT_ATTR(DW_AT_bit_stride); - COLLECT_ATTR(DW_AT_byte_size); - COLLECT_ATTR(DW_AT_byte_stride); - COLLECT_ATTR(DW_AT_const_expr); - COLLECT_ATTR(DW_AT_const_value); - COLLECT_ATTR(DW_AT_containing_type); - COLLECT_ATTR(DW_AT_count); - COLLECT_ATTR(DW_AT_data_bit_offset); - COLLECT_ATTR(DW_AT_data_location); - COLLECT_ATTR(DW_AT_data_member_location); - COLLECT_ATTR(DW_AT_decimal_scale); - COLLECT_ATTR(DW_AT_decimal_sign); - COLLECT_ATTR(DW_AT_default_value); - COLLECT_ATTR(DW_AT_digit_count); - COLLECT_ATTR(DW_AT_discr); - COLLECT_ATTR(DW_AT_discr_list); - COLLECT_ATTR(DW_AT_discr_value); - COLLECT_ATTR(DW_AT_encoding); - COLLECT_ATTR(DW_AT_enum_class); - COLLECT_ATTR(DW_AT_endianity); - COLLECT_ATTR(DW_AT_explicit); - COLLECT_ATTR(DW_AT_is_optional); - COLLECT_ATTR(DW_AT_location); - COLLECT_ATTR(DW_AT_lower_bound); - COLLECT_ATTR(DW_AT_mutable); - COLLECT_ATTR(DW_AT_ordering); - COLLECT_ATTR(DW_AT_picture_string); - COLLECT_ATTR(DW_AT_prototyped); - COLLECT_ATTR(DW_AT_small); - COLLECT_ATTR(DW_AT_segment); - COLLECT_ATTR(DW_AT_string_length); - COLLECT_ATTR(DW_AT_threads_scaled); - COLLECT_ATTR(DW_AT_upper_bound); - COLLECT_ATTR(DW_AT_use_location); - COLLECT_ATTR(DW_AT_use_UTF8); - COLLECT_ATTR(DW_AT_variable_parameter); - COLLECT_ATTR(DW_AT_virtuality); - COLLECT_ATTR(DW_AT_visibility); - COLLECT_ATTR(DW_AT_vtable_elem_location); - COLLECT_ATTR(DW_AT_type); +#define HANDLE_DIE_HASH_ATTR(NAME) \ + case dwarf::NAME: \ + Attrs.NAME = V; \ + break; +#include "DIEHashAttributes.def" default: break; } @@ -366,62 +318,12 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) { // Go through the attributes from \param Attrs in the order specified in 7.27.4 // and hash them. 
void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) { -#define ADD_ATTR(ATTR) \ +#define HANDLE_DIE_HASH_ATTR(NAME) \ { \ - if (ATTR) \ - hashAttribute(ATTR, Tag); \ + if (Attrs.NAME) \ + hashAttribute(Attrs.NAME, Tag); \ } - - ADD_ATTR(Attrs.DW_AT_name); - ADD_ATTR(Attrs.DW_AT_accessibility); - ADD_ATTR(Attrs.DW_AT_address_class); - ADD_ATTR(Attrs.DW_AT_allocated); - ADD_ATTR(Attrs.DW_AT_artificial); - ADD_ATTR(Attrs.DW_AT_associated); - ADD_ATTR(Attrs.DW_AT_binary_scale); - ADD_ATTR(Attrs.DW_AT_bit_offset); - ADD_ATTR(Attrs.DW_AT_bit_size); - ADD_ATTR(Attrs.DW_AT_bit_stride); - ADD_ATTR(Attrs.DW_AT_byte_size); - ADD_ATTR(Attrs.DW_AT_byte_stride); - ADD_ATTR(Attrs.DW_AT_const_expr); - ADD_ATTR(Attrs.DW_AT_const_value); - ADD_ATTR(Attrs.DW_AT_containing_type); - ADD_ATTR(Attrs.DW_AT_count); - ADD_ATTR(Attrs.DW_AT_data_bit_offset); - ADD_ATTR(Attrs.DW_AT_data_location); - ADD_ATTR(Attrs.DW_AT_data_member_location); - ADD_ATTR(Attrs.DW_AT_decimal_scale); - ADD_ATTR(Attrs.DW_AT_decimal_sign); - ADD_ATTR(Attrs.DW_AT_default_value); - ADD_ATTR(Attrs.DW_AT_digit_count); - ADD_ATTR(Attrs.DW_AT_discr); - ADD_ATTR(Attrs.DW_AT_discr_list); - ADD_ATTR(Attrs.DW_AT_discr_value); - ADD_ATTR(Attrs.DW_AT_encoding); - ADD_ATTR(Attrs.DW_AT_enum_class); - ADD_ATTR(Attrs.DW_AT_endianity); - ADD_ATTR(Attrs.DW_AT_explicit); - ADD_ATTR(Attrs.DW_AT_is_optional); - ADD_ATTR(Attrs.DW_AT_location); - ADD_ATTR(Attrs.DW_AT_lower_bound); - ADD_ATTR(Attrs.DW_AT_mutable); - ADD_ATTR(Attrs.DW_AT_ordering); - ADD_ATTR(Attrs.DW_AT_picture_string); - ADD_ATTR(Attrs.DW_AT_prototyped); - ADD_ATTR(Attrs.DW_AT_small); - ADD_ATTR(Attrs.DW_AT_segment); - ADD_ATTR(Attrs.DW_AT_string_length); - ADD_ATTR(Attrs.DW_AT_threads_scaled); - ADD_ATTR(Attrs.DW_AT_upper_bound); - ADD_ATTR(Attrs.DW_AT_use_location); - ADD_ATTR(Attrs.DW_AT_use_UTF8); - ADD_ATTR(Attrs.DW_AT_variable_parameter); - ADD_ATTR(Attrs.DW_AT_virtuality); - ADD_ATTR(Attrs.DW_AT_visibility); - ADD_ATTR(Attrs.DW_AT_vtable_elem_location); - ADD_ATTR(Attrs.DW_AT_type); - +#include "DIEHashAttributes.def" // FIXME: Add the extended attributes. } @@ -478,10 +380,12 @@ void DIEHash::computeHash(const DIE &Die) { /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. -uint64_t DIEHash::computeCUSignature(const DIE &Die) { +uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) { Numbering.clear(); Numbering[&Die] = 1; + if (!DWOName.empty()) + Hash.update(DWOName); // Hash the DIE. computeHash(Die); diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h index 996cd7ef3d2e..29337ae38a99 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/lib/CodeGen/AsmPrinter/DIEHash.h @@ -28,64 +28,15 @@ class CompileUnit; class DIEHash { // Collection of all attributes used in hashing a particular DIE. 
struct DIEAttrs { - DIEValue DW_AT_name; - DIEValue DW_AT_accessibility; - DIEValue DW_AT_address_class; - DIEValue DW_AT_allocated; - DIEValue DW_AT_artificial; - DIEValue DW_AT_associated; - DIEValue DW_AT_binary_scale; - DIEValue DW_AT_bit_offset; - DIEValue DW_AT_bit_size; - DIEValue DW_AT_bit_stride; - DIEValue DW_AT_byte_size; - DIEValue DW_AT_byte_stride; - DIEValue DW_AT_const_expr; - DIEValue DW_AT_const_value; - DIEValue DW_AT_containing_type; - DIEValue DW_AT_count; - DIEValue DW_AT_data_bit_offset; - DIEValue DW_AT_data_location; - DIEValue DW_AT_data_member_location; - DIEValue DW_AT_decimal_scale; - DIEValue DW_AT_decimal_sign; - DIEValue DW_AT_default_value; - DIEValue DW_AT_digit_count; - DIEValue DW_AT_discr; - DIEValue DW_AT_discr_list; - DIEValue DW_AT_discr_value; - DIEValue DW_AT_encoding; - DIEValue DW_AT_enum_class; - DIEValue DW_AT_endianity; - DIEValue DW_AT_explicit; - DIEValue DW_AT_is_optional; - DIEValue DW_AT_location; - DIEValue DW_AT_lower_bound; - DIEValue DW_AT_mutable; - DIEValue DW_AT_ordering; - DIEValue DW_AT_picture_string; - DIEValue DW_AT_prototyped; - DIEValue DW_AT_small; - DIEValue DW_AT_segment; - DIEValue DW_AT_string_length; - DIEValue DW_AT_threads_scaled; - DIEValue DW_AT_upper_bound; - DIEValue DW_AT_use_location; - DIEValue DW_AT_use_UTF8; - DIEValue DW_AT_variable_parameter; - DIEValue DW_AT_virtuality; - DIEValue DW_AT_visibility; - DIEValue DW_AT_vtable_elem_location; - DIEValue DW_AT_type; - - // Insert any additional ones here... +#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME; +#include "DIEHashAttributes.def" }; public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} /// \brief Computes the CU signature. - uint64_t computeCUSignature(const DIE &Die); + uint64_t computeCUSignature(StringRef DWOName, const DIE &Die); /// \brief Computes the type signature. 
uint64_t computeTypeSignature(const DIE &Die); diff --git a/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/lib/CodeGen/AsmPrinter/DIEHashAttributes.def new file mode 100644 index 000000000000..28a02390fccb --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DIEHashAttributes.def @@ -0,0 +1,55 @@ +#ifndef HANDLE_DIE_HASH_ATTR +#error "Missing macro definition of HANDLE_DIE_HASH_ATTR" +#endif + +HANDLE_DIE_HASH_ATTR(DW_AT_name) +HANDLE_DIE_HASH_ATTR(DW_AT_accessibility) +HANDLE_DIE_HASH_ATTR(DW_AT_address_class) +HANDLE_DIE_HASH_ATTR(DW_AT_allocated) +HANDLE_DIE_HASH_ATTR(DW_AT_artificial) +HANDLE_DIE_HASH_ATTR(DW_AT_associated) +HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_size) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_size) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_const_expr) +HANDLE_DIE_HASH_ATTR(DW_AT_const_value) +HANDLE_DIE_HASH_ATTR(DW_AT_containing_type) +HANDLE_DIE_HASH_ATTR(DW_AT_count) +HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_data_location) +HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign) +HANDLE_DIE_HASH_ATTR(DW_AT_default_value) +HANDLE_DIE_HASH_ATTR(DW_AT_digit_count) +HANDLE_DIE_HASH_ATTR(DW_AT_discr) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_list) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_value) +HANDLE_DIE_HASH_ATTR(DW_AT_encoding) +HANDLE_DIE_HASH_ATTR(DW_AT_enum_class) +HANDLE_DIE_HASH_ATTR(DW_AT_endianity) +HANDLE_DIE_HASH_ATTR(DW_AT_explicit) +HANDLE_DIE_HASH_ATTR(DW_AT_is_optional) +HANDLE_DIE_HASH_ATTR(DW_AT_location) +HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_mutable) +HANDLE_DIE_HASH_ATTR(DW_AT_ordering) +HANDLE_DIE_HASH_ATTR(DW_AT_picture_string) +HANDLE_DIE_HASH_ATTR(DW_AT_prototyped) +HANDLE_DIE_HASH_ATTR(DW_AT_small) +HANDLE_DIE_HASH_ATTR(DW_AT_segment) +HANDLE_DIE_HASH_ATTR(DW_AT_string_length) +HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled) +HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_use_location) +HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8) +HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter) +HANDLE_DIE_HASH_ATTR(DW_AT_virtuality) +HANDLE_DIE_HASH_ATTR(DW_AT_visibility) +HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location) +HANDLE_DIE_HASH_ATTR(DW_AT_type) + +#undef HANDLE_DIE_HASH_ATTR diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 826162ad47c4..0971c5942203 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,7 +115,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { return getBaseTypeSize(BaseType); } -bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) { +static bool hasDebugInfo(const MachineModuleInfo *MMI, + const MachineFunction *MF) { if (!MMI->hasDebugInfo()) return false; auto *SP = MF->getFunction()->getSubprogram(); @@ -223,9 +224,9 @@ void DebugHandlerBase::endInstruction() { return; assert(CurMI != nullptr); - // Don't create a new label after DBG_VALUE instructions. - // They don't generate code. - if (!CurMI->isDebugValue()) { + // Don't create a new label after DBG_VALUE and other instructions that don't + // generate code. 
+ if (!CurMI->isMetaInstruction()) { PrevLabel = nullptr; PrevInstBB = CurMI->getParent(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index e172712cf889..04073b3aed68 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -760,7 +760,7 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) { /// addGlobalName - Add a new global name to the compile unit. void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); GlobalNames[FullName] = &Die; @@ -768,7 +768,7 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); // Insert, allowing the entry to remain as-is if it's already present @@ -781,7 +781,7 @@ void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, /// Add a new global type to the unit. void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); GlobalTypes[FullName] = &Die; @@ -789,7 +789,7 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); // Insert, allowing the entry to remain as-is if it's already present diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 77e9e671529f..b8f57472f17c 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -77,8 +77,6 @@ class DwarfCompileUnit final : public DwarfUnit { bool isDwoUnit() const override; - bool includeMinimalInlineScopes() const; - DenseMap &getAbstractSPDies() { if (isDwoUnit() && !DD->shareAcrossDWOCUs()) return AbstractSPDies; @@ -101,6 +99,8 @@ public: return Skeleton; } + bool includeMinimalInlineScopes() const; + void initStmtList(); /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 3410b98d7776..bf27516e1ccd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -252,12 +252,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) // Handle split DWARF. HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty(); - // Pubnames/pubtypes on by default for GDB. - if (DwarfPubSections == Default) - HasDwarfPubSections = tuneForGDB(); - else - HasDwarfPubSections = DwarfPubSections == Enable; - // SCE defaults to linkage names only for abstract subprograms. 
if (DwarfLinkageNames == DefaultLinkageNames) UseAllLinkageNames = !tuneForSCE(); @@ -380,19 +374,35 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - auto &CU = *CUMap.lookup(SP->getUnit()); - if (auto *SkelCU = CU.getSkeleton()) { - (shareAcrossDWOCUs() ? CU : SrcCU) - .constructAbstractSubprogramScopeDIE(Scope); - if (CU.getCUNode()->getSplitDebugInlining()) - SkelCU->constructAbstractSubprogramScopeDIE(Scope); - } else { - CU.constructAbstractSubprogramScopeDIE(Scope); + if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining()) + // Avoid building the original CU if it won't be used + SrcCU.constructAbstractSubprogramScopeDIE(Scope); + else { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else + CU.constructAbstractSubprogramScopeDIE(Scope); } } -void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { - if (!GenerateGnuPubSections) +bool DwarfDebug::hasDwarfPubSections(bool includeMinimalInlineScopes) const { + // Opting in to GNU Pubnames/types overrides the default to ensure these are + // generated for things like Gold's gdb_index generation. + if (GenerateGnuPubSections) + return true; + + if (DwarfPubSections == Default) + return tuneForGDB() && !includeMinimalInlineScopes; + + return DwarfPubSections == Enable; +} + +void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const { + if (!hasDwarfPubSections(U.includeMinimalInlineScopes())) return; U.addFlag(D, dwarf::DW_AT_GNU_pubnames); @@ -401,7 +411,9 @@ void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { // Create new DwarfCompileUnit for the given metadata node with tag // DW_TAG_compile_unit. 
DwarfCompileUnit & -DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { +DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { + if (auto *CU = CUMap.lookup(DIUnit)) + return *CU; StringRef FN = DIUnit->getFilename(); CompilationDir = DIUnit->getDirectory(); @@ -534,7 +546,12 @@ void DwarfDebug::beginModule() { } for (DICompileUnit *CUNode : M->debug_compile_units()) { - DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); + if (CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() && + CUNode->getGlobalVariables().empty() && + CUNode->getImportedEntities().empty() && CUNode->getMacros().empty()) + continue; + + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) CU.addImportedEntity(IE); @@ -581,11 +598,12 @@ void DwarfDebug::finishVariableDefinitions() { } void DwarfDebug::finishSubprogramDefinitions() { - for (const DISubprogram *SP : ProcessedSPNodes) - if (SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug) - forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) { - CU.finishSubprogramDefinition(SP); - }); + for (const DISubprogram *SP : ProcessedSPNodes) { + assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug); + forBothCUs( + getOrCreateDwarfCompileUnit(SP->getUnit()), + [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); }); + } } void DwarfDebug::finalizeModuleInfo() { @@ -595,6 +613,13 @@ void DwarfDebug::finalizeModuleInfo() { finishVariableDefinitions(); + // Include the DWO file name in the hash if there's more than one CU. + // This handles ThinLTO's situation where imported CUs may very easily be + // duplicate with the same CU partially imported into another ThinLTO unit. + StringRef DWOName; + if (CUMap.size() > 1) + DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile; + // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -609,7 +634,8 @@ void DwarfDebug::finalizeModuleInfo() { auto *SkCU = TheCU.getSkeleton(); if (useSplitDwarf()) { // Emit a unique identifier for this CU. - uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie()); + uint64_t ID = + DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie()); TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, ID); SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, @@ -718,7 +744,9 @@ void DwarfDebug::endModule() { } // Emit the pubnames and pubtypes sections if requested. - if (HasDwarfPubSections) { + // The condition is optimistically correct - any CU not using GMLT (& + // implicit/default pubnames state) might still have pubnames. + if (hasDwarfPubSections(/* gmlt */ false)) { emitDebugPubNames(GenerateGnuPubSections); emitDebugPubTypes(GenerateGnuPubSections); } @@ -1028,8 +1056,12 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); assert(CurMI); + const auto *SP = MI->getParent()->getParent()->getFunction()->getSubprogram(); + if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) + return; + // Check if source location changes, but ignore DBG_VALUE and CFI locations. 
- if (MI->isDebugValue() || MI->isCFIInstruction()) + if (MI->isMetaInstruction()) return; const DebugLoc &DL = MI->getDebugLoc(); // When we emit a line-0 record, we don't update PrevInstLoc; so look at @@ -1111,7 +1143,7 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // the beginning of the function body. for (const auto &MBB : *MF) for (const auto &MI : MBB) - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) return MI.getDebugLoc(); return DebugLoc(); @@ -1122,40 +1154,28 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { CurFn = MF; - if (LScopes.empty()) + auto *SP = MF->getFunction()->getSubprogram(); + assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode()); + if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) return; + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function // belongs to so that we add to the correct per-cu line table in the // non-asm case. - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - // FnScope->getScopeNode() and DI->second should represent the same function, - // though they may not be the same MDNode due to inline functions merged in - // LTO where the debug info metadata still differs (either due to distinct - // written differences - two versions of a linkonce_odr function - // written/copied into two separate files, or some sub-optimal metadata that - // isn't structurally identical (see: file path/name info from clang, which - // includes the directory of the cpp file being built, even when the file name - // is absolute (such as an <> lookup header))) - auto *SP = cast(FnScope->getScopeNode()); - DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit()); - if (!TheCU) { - assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug && - "DICompileUnit missing from llvm.dbg.cu?"); - return; - } if (Asm->OutStreamer->hasRawTextSupport()) // Use a single line table if we are generating assembly. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); else - Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID()); // Record beginning of function. PrologEndLoc = findPrologueEndLoc(MF); - if (DILocation *L = PrologEndLoc) { + if (PrologEndLoc) { // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - auto *SP = L->getInlinedAtScope()->getSubprogram(); + auto *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT); } } @@ -1395,7 +1415,7 @@ void DwarfDebug::emitDebugPubSection( const auto &Globals = (TheU->*Accessor)(); - if (Globals.empty()) + if (!hasDwarfPubSections(TheU->includeMinimalInlineScopes())) continue; if (auto *Skeleton = TheU->getSkeleton()) @@ -1544,6 +1564,9 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) { // Emit locations into the debug loc section. void DwarfDebug::emitDebugLoc() { + if (DebugLocs.getLists().empty()) + return; + // Start the dwarf loc section. 
Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfLocSection()); @@ -1755,6 +1778,9 @@ void DwarfDebug::emitDebugARanges() { /// Emit address ranges into a debug ranges section. void DwarfDebug::emitDebugRanges() { + if (CUMap.empty()) + return; + // Start the dwarf ranges section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfRangesSection()); @@ -1834,6 +1860,9 @@ void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) { /// Emit macros into a debug macinfo section. void DwarfDebug::emitDebugMacinfo() { + if (CUMap.empty()) + return; + // Start the dwarf macinfo section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfMacinfoSection()); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index b9c5aa9ffb23..ebfba4cfc275 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -246,9 +246,6 @@ class DwarfDebug : public DebugHandlerBase { std::pair, const DICompositeType *>, 1> TypeUnitsUnderConstruction; - /// Whether to emit the pubnames/pubtypes sections. - bool HasDwarfPubSections; - /// Whether to use the GNU TLS opcode (instead of the standard opcode). bool UseGNUTLSOpcode; @@ -415,11 +412,11 @@ class DwarfDebug : public DebugHandlerBase { /// Flags to let the linker know we have emitted new style pubnames. Only /// emit it here if we don't have a skeleton CU for split dwarf. - void addGnuPubAttributes(DwarfUnit &U, DIE &D) const; + void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const; /// Create new DwarfCompileUnit for the given metadata node with tag /// DW_TAG_compile_unit. - DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit); + DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit); /// Construct imported_module or imported_declaration DIE. void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, @@ -556,6 +553,8 @@ public: /// A helper function to check whether the DIE for a given Scope is /// going to be null. bool isLexicalScopeDIENull(LexicalScope *Scope); + + bool hasDwarfPubSections(bool includeMinimalInlineScopes) const; }; } // End of namespace llvm diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 984973cf3a3b..344136b1f195 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -96,7 +96,7 @@ namespace { char AtomicExpand::ID = 0; char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions", +INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, false) FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index a67e194356d8..d3fced436b68 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -24,8 +24,6 @@ #include using namespace llvm; -#define DEBUG_TYPE "basictti" - // This flag is used by the template base class for BasicTTIImpl, and here to // provide a definition. 
cl::opt diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp index efdf300df850..2c41b597843c 100644 --- a/lib/CodeGen/BranchCoalescing.cpp +++ b/lib/CodeGen/BranchCoalescing.cpp @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "coal-branch" +#define DEBUG_TYPE "branch-coalescing" static cl::opt EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, @@ -193,11 +193,11 @@ public: char BranchCoalescing::ID = 0; char &llvm::BranchCoalescingID = BranchCoalescing::ID; -INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing", +INITIALIZE_PASS_BEGIN(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing", +INITIALIZE_PASS_END(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index b63d9f4a4351..03ceac10beec 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -44,7 +44,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "branchfolding" +#define DEBUG_TYPE "branch-folder" STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); @@ -89,7 +89,7 @@ namespace { char BranchFolderPass::ID = 0; char &llvm::BranchFolderPassID = BranchFolderPass::ID; -INITIALIZE_PASS(BranchFolderPass, "branch-folder", +INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE, "Control Flow Optimizer", false, false) bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { @@ -153,13 +153,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TriedMerging.clear(); + MachineRegisterInfo &MRI = MF.getRegInfo(); AfterBlockPlacement = AfterPlacement; TII = tii; TRI = tri; MMI = mmi; MLI = mli; + this->MRI = &MRI; - MachineRegisterInfo &MRI = MF.getRegInfo(); UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF); if (!UpdateLiveIns) MRI.invalidateLiveness(); @@ -351,7 +352,7 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, if (UpdateLiveIns) { NewDest->clearLiveIns(); - computeLiveIns(LiveRegs, *TRI, *NewDest); + computeLiveIns(LiveRegs, *MRI, *NewDest); } ++NumTailMerge; @@ -388,7 +389,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); if (UpdateLiveIns) - computeLiveIns(LiveRegs, *TRI, *NewMBB); + computeLiveIns(LiveRegs, *MRI, *NewMBB); // Add the new block to the funclet. const auto &FuncletI = FuncletMembership.find(&CurMBB); diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index 4852721eea10..92681137e4c6 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -108,6 +108,7 @@ namespace llvm { bool UpdateLiveIns; unsigned MinCommonTailLength; const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineLoopInfo *MLI; diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp index 7af136941661..e3de61c7816f 100644 --- a/lib/CodeGen/BranchRelaxation.cpp +++ b/lib/CodeGen/BranchRelaxation.cpp @@ -259,7 +259,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI, // Need to fix live-in lists if we track liveness. 
if (TRI->trackLivenessAfterRegAlloc(*MF)) - computeLiveIns(LiveRegs, *TRI, *NewBB); + computeLiveIns(LiveRegs, MF->getRegInfo(), *NewBB); ++NumSplit; @@ -345,6 +345,10 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { // Do it here since if there's no split, no update is needed. MBB->replaceSuccessor(FBB, &NewBB); NewBB.addSuccessor(FBB); + + // Need to fix live-in lists if we track liveness. + if (TRI->trackLivenessAfterRegAlloc(*MF)) + computeLiveIns(LiveRegs, MF->getRegInfo(), NewBB); } // We now have an appropriate fall-through block in place (either naturally or diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 3a1a3020a8d4..4e85708efafc 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -257,10 +257,10 @@ class TypePromotionTransaction; } char CodeGenPrepare::ID = 0; -INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare", +INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", +INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 7ac2e5445435..265dda16bfa7 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -23,7 +23,7 @@ using namespace llvm; -#define DEBUG_TYPE "codegen-dce" +#define DEBUG_TYPE "dead-mi-elimination" STATISTIC(NumDeletes, "Number of dead instructions deleted"); @@ -54,7 +54,7 @@ namespace { char DeadMachineInstructionElim::ID = 0; char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID; -INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination", +INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE, "Remove dead machine instructions", false, false) bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp index 6f4ea1912cf4..ab9a0592e017 100644 --- a/lib/CodeGen/DetectDeadLanes.cpp +++ b/lib/CodeGen/DetectDeadLanes.cpp @@ -132,8 +132,7 @@ private: char DetectDeadLanes::ID = 0; char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; -INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes", - false, false) +INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) /// Returns true if \p MI will get lowered to a series of COPY instructions. /// We call this a COPY-like instruction. 
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 1ef4d8660657..06ae5cd72c85 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -71,12 +71,12 @@ namespace { } // end anonymous namespace char DwarfEHPrepare::ID = 0; -INITIALIZE_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare", +INITIALIZE_PASS_BEGIN(DwarfEHPrepare, DEBUG_TYPE, "Prepare DWARF exceptions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(DwarfEHPrepare, "dwarfehprepare", +INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE, "Prepare DWARF exceptions", false, false) FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); } diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index 729172796453..402afe75b141 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -616,13 +616,13 @@ private: char EarlyIfConverter::ID = 0; char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp index 0ec79c2e69f9..88d422a0f545 100644 --- a/lib/CodeGen/ExpandISelPseudos.cpp +++ b/lib/CodeGen/ExpandISelPseudos.cpp @@ -41,7 +41,7 @@ namespace { char ExpandISelPseudos::ID = 0; char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID; -INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos", +INITIALIZE_PASS(ExpandISelPseudos, DEBUG_TYPE, "Expand ISel Pseudo-instructions", false, false) bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index e860906043dd..27cd639b2a49 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -58,7 +58,7 @@ private: char ExpandPostRA::ID = 0; char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; -INITIALIZE_PASS(ExpandPostRA, "postrapseudos", +INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE, "Post-RA pseudo instruction expansion pass", false, false) /// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered diff --git a/lib/CodeGen/FuncletLayout.cpp b/lib/CodeGen/FuncletLayout.cpp index d61afad4db57..0bdd5e64a7f2 100644 --- a/lib/CodeGen/FuncletLayout.cpp +++ b/lib/CodeGen/FuncletLayout.cpp @@ -37,7 +37,7 @@ public: char FuncletLayout::ID = 0; char &llvm::FuncletLayoutID = FuncletLayout::ID; -INITIALIZE_PASS(FuncletLayout, "funclet-layout", +INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE, "Contiguously Lay Out Funclets", false, false) bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt index 03a8c4f5f909..eba7ea8132e3 100644 --- a/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -8,6 +8,7 @@ 
set(GLOBAL_ISEL_FILES LegalizerHelper.cpp Legalizer.cpp LegalizerInfo.cpp + Localizer.cpp RegBankSelect.cpp RegisterBank.cpp RegisterBankInfo.cpp @@ -24,11 +25,11 @@ endif() # In LLVMBuild.txt files, it is not possible to mark a dependency to a # library as optional. So instead, generate an empty library if we did -# not ask for it. +# not ask for it. add_llvm_library(LLVMGlobalISel ${GLOBAL_ISEL_BUILD_FILES} GlobalISel.cpp - + DEPENDS intrinsics_gen ) diff --git a/lib/CodeGen/GlobalISel/GlobalISel.cpp b/lib/CodeGen/GlobalISel/GlobalISel.cpp index fcd2722f1c2f..29d1209bb02a 100644 --- a/lib/CodeGen/GlobalISel/GlobalISel.cpp +++ b/lib/CodeGen/GlobalISel/GlobalISel.cpp @@ -26,6 +26,7 @@ void llvm::initializeGlobalISel(PassRegistry &Registry) { void llvm::initializeGlobalISel(PassRegistry &Registry) { initializeIRTranslatorPass(Registry); initializeLegalizerPass(Registry); + initializeLocalizerPass(Registry); initializeRegBankSelectPass(Registry); initializeInstructionSelectPass(Registry); } diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp new file mode 100644 index 000000000000..bdca732b4e33 --- /dev/null +++ b/lib/CodeGen/GlobalISel/Localizer.cpp @@ -0,0 +1,125 @@ +//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the Localizer class. +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "localizer" + +using namespace llvm; + +char Localizer::ID = 0; +INITIALIZE_PASS(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", false, + false); + +Localizer::Localizer() : MachineFunctionPass(ID) { + initializeLocalizerPass(*PassRegistry::getPassRegistry()); +} + +void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); } + +bool Localizer::shouldLocalize(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Constants-like instructions should be close to their users. + // We don't want long live-ranges for them. + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + return true; + } +} + +bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, + MachineBasicBlock *&InsertMBB) { + MachineInstr &MIUse = *MOUse.getParent(); + InsertMBB = MIUse.getParent(); + if (MIUse.isPHI()) + InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB(); + return InsertMBB == Def.getParent(); +} + +bool Localizer::runOnMachineFunction(MachineFunction &MF) { + // If the ISel pipeline failed, do not bother running that pass. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + + DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n'); + + init(MF); + + bool Changed = false; + // Keep track of the instructions we localized. + // We won't need to process them if we see them later in the CFG. + SmallPtrSet LocalizedInstrs; + DenseMap, unsigned> MBBWithLocalDef; + // TODO: Do bottom up traversal. 
+ for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) + continue; + DEBUG(dbgs() << "Should localize: " << MI); + assert(MI.getDesc().getNumDefs() == 1 && + "More than one definition not supported yet"); + unsigned Reg = MI.getOperand(0).getReg(); + // Check if all the users of MI are local. + // We are going to invalidation the list of use operands, so we + // can't use range iterator. + for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); + MOIt != MOItEnd;) { + MachineOperand &MOUse = *MOIt++; + // Check if the use is already local. + MachineBasicBlock *InsertMBB; + DEBUG(MachineInstr &MIUse = *MOUse.getParent(); + dbgs() << "Checking use: " << MIUse + << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + if (isLocalUse(MOUse, MI, InsertMBB)) + continue; + DEBUG(dbgs() << "Fixing non-local use\n"); + Changed = true; + auto MBBAndReg = std::make_pair(InsertMBB, Reg); + auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); + if (NewVRegIt == MBBWithLocalDef.end()) { + // Create the localized instruction. + MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); + LocalizedInstrs.insert(LocalizedMI); + // Move it at the right place. + MachineInstr &MIUse = *MOUse.getParent(); + if (MIUse.getParent() == InsertMBB) + InsertMBB->insert(MIUse, LocalizedMI); + else + InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI); + + // Set a new register for the definition. + unsigned NewReg = + MRI->createGenericVirtualRegister(MRI->getType(Reg)); + MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + LocalizedMI->getOperand(0).setReg(NewReg); + NewVRegIt = + MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; + DEBUG(dbgs() << "Inserted: " << *LocalizedMI); + } + DEBUG(dbgs() << "Update use with: " << PrintReg(NewVRegIt->second) + << '\n'); + // Update the user reg. + MOUse.setReg(NewVRegIt->second); + } + } + } + return Changed; +} diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index 1ea534939948..23812a2a2344 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -192,10 +192,7 @@ namespace { } // end anonymous namespace char GlobalMerge::ID = 0; -INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables", - false, false) -INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables", - false, false) +INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false) bool GlobalMerge::doMerge(SmallVectorImpl &Globals, Module &M, bool isConst, unsigned AddrSpace) const { diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 628d599a3cc7..1c33f3b6800e 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -39,7 +39,7 @@ using namespace llvm; -#define DEBUG_TYPE "ifcvt" +#define DEBUG_TYPE "if-converter" // Hidden options for help debugging. 
static cl::opt IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); @@ -316,9 +316,9 @@ namespace { char &llvm::IfConverterID = IfConverter::ID; -INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF))) @@ -1588,32 +1588,22 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } - // To be able to insert code freely at the end of BBI we sometimes remove - // the branch from BBI to NextMBB temporarily. Remember if this happened. - bool RemovedBranchToNextMBB = false; if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); - // Keep the CFG updated. + // RemoveExtraEdges won't work if the block has an unanalyzable branch, so + // explicitly remove CvtBBI as a successor. BBI.BB->removeSuccessor(&CvtMBB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); - // Remove the branch from the entry of the triangle to NextBB to be able to - // do the merge below. Keep the CFG updated, but remember we removed the - // branch since we do want to execute NextMBB, either by introducing a - // branch to it again, or merging it into the entry block. - // How it's handled is decided further down. - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); - BBI.BB->removeSuccessor(&NextMBB, true); - RemovedBranchToNextMBB = true; - // Now merge the entry of the triangle with the true block. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); MergeBlocks(BBI, *CvtBBI, false); } @@ -1651,19 +1641,12 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // block. By not merging them, we make it possible to iteratively // ifcvt the blocks. if (!HasEarlyExit && - // We might have removed BBI from NextMBB's predecessor list above but - // we want it to be there, so consider that too. - (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) && - !NextBBI->HasFallThrough && + NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough && !NextMBB.hasAddressTaken()) { - // We will merge NextBBI into BBI, and thus remove the current - // fallthrough from BBI into CvtBBI. - BBI.BB->removeSuccessor(&CvtMBB, true); MergeBlocks(BBI, *NextBBI); FalseBBDead = true; } else { InsertUncondBranch(*BBI.BB, NextMBB, TII); - BBI.BB->addSuccessor(&NextMBB); BBI.HasFallThrough = false; } // Mixed predicated and unpredicated code. This cannot be iteratively @@ -1671,6 +1654,8 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { IterIfcvt = false; } + RemoveExtraEdges(BBI); + // Update block info. BB can be iteratively if-converted. 
if (!IterIfcvt) BBI.IsDone = true; diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 920c2a372a9b..24e289dd4f1b 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -674,8 +674,8 @@ void ImplicitNullChecks::rewriteNullChecks( char ImplicitNullChecks::ID = 0; char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID; -INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) diff --git a/lib/CodeGen/InterleavedAccessPass.cpp b/lib/CodeGen/InterleavedAccessPass.cpp index bb29db301a95..ee4929c91482 100644 --- a/lib/CodeGen/InterleavedAccessPass.cpp +++ b/lib/CodeGen/InterleavedAccessPass.cpp @@ -107,13 +107,11 @@ private: } // end anonymous namespace. char InterleavedAccess::ID = 0; -INITIALIZE_PASS_BEGIN( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 275d84e2c185..40ee7ea785f0 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -86,8 +86,9 @@ void LexicalScopes::extractLexicalScopes( continue; } - // Ignore DBG_VALUE. It does not contribute to any instruction in output. - if (MInsn.isDebugValue()) + // Ignore DBG_VALUE and similar instruction that do not contribute to any + // instruction in the output. + if (MInsn.isMetaInstruction()) continue; if (RangeBeginMI) { diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index f956974b1aaf..b5e705f6455d 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -43,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "live-debug-values" +#define DEBUG_TYPE "livedebugvalues" STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); @@ -283,7 +283,7 @@ public: char LiveDebugValues::ID = 0; char &llvm::LiveDebugValuesID = LiveDebugValues::ID; -INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", +INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false, false) /// Default construct and initialize the pass. 
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index bcf7c8e99c7f..bbd783367c9e 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "livedebug" +#define DEBUG_TYPE "livedebugvars" static cl::opt EnableLDV("live-debug-variables", cl::init(true), @@ -54,11 +54,11 @@ EnableLDV("live-debug-variables", cl::init(true), STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted"); char LiveDebugVariables::ID = 0; -INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 3f5b8e19d1f0..0c05dbeacba0 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===// +//===- LiveIntervalAnalysis.cpp - Live Interval Analysis ------------------===// // // The LLVM Compiler Infrastructure // @@ -14,28 +14,45 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "LiveRangeCalc.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include -#include +#include +#include +#include +#include +#include + using namespace llvm; #define DEBUG_TYPE "regalloc" @@ -59,11 +76,13 @@ static bool EnablePrecomputePhysRegs = false; #endif // NDEBUG namespace llvm { + cl::opt UseSegmentSetForPhysRegs( "use-segment-set-for-physregs", cl::Hidden, cl::init(true), cl::desc( "Use segment set for the computation of the live ranges of physregs.")); -} + +} // end namespace llvm void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -78,8 +97,7 @@ void 
LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), - DomTree(nullptr), LRCalc(nullptr) { +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } @@ -168,12 +186,10 @@ LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { #endif LiveInterval* LiveIntervals::createInterval(unsigned reg) { - float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? - llvm::huge_valf : 0.0F; + float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F; return new LiveInterval(reg, Weight); } - /// Compute the live interval of a virtual register, based on defs and uses. void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); @@ -337,7 +353,7 @@ static void createSegmentsForValues(LiveRange &LR, } } -typedef SmallVector, 16> ShrinkToUsesWorkList; +using ShrinkToUsesWorkList = SmallVector, 16>; static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, ShrinkToUsesWorkList &WorkList, @@ -593,7 +609,7 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, // Find all blocks that are reachable from KillMBB without leaving VNI's live // range. It is possible that KillMBB itself is reachable, so start a DFS // from each successor. - typedef df_iterator_default_set VisitedTy; + using VisitedTy = df_iterator_default_set; VisitedTy Visited; for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator @@ -822,7 +838,6 @@ LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) { return S; } - //===----------------------------------------------------------------------===// // Register mask functions //===----------------------------------------------------------------------===// @@ -855,7 +870,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, return false; bool Found = false; - for (;;) { + while (true) { assert(*SlotI >= LiveI->start); // Loop over all slots overlapping this segment. while (*SlotI < LiveI->end) { diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index 9f7d7cf54848..0dc1079b2ad4 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -53,7 +53,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { continue; removeReg(Reg); } else if (O->isRegMask()) - removeRegsInMask(*O, nullptr); + removeRegsInMask(*O); } // Add uses to the set. @@ -142,66 +142,85 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI, /// Add live-in registers of basic block \p MBB to \p LiveRegs. void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) { - MCSubRegIndexIterator S(LI.PhysReg, TRI); - if (LI.LaneMask.all() || (LI.LaneMask.any() && !S.isValid())) { - addReg(LI.PhysReg); + unsigned Reg = LI.PhysReg; + LaneBitmask Mask = LI.LaneMask; + MCSubRegIndexIterator S(Reg, TRI); + assert(Mask.any() && "Invalid livein mask"); + if (Mask.all() || !S.isValid()) { + addReg(Reg); continue; } for (; S.isValid(); ++S) { unsigned SI = S.getSubRegIndex(); - if ((LI.LaneMask & TRI->getSubRegIndexLaneMask(SI)).any()) + if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any()) addReg(S.getSubReg()); } } } -/// Add pristine registers to the given \p LiveRegs. This function removes -/// actually saved callee save registers when \p InPrologueEpilogue is false. 
-static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, - const MachineFrameInfo &MFI, - const TargetRegisterInfo &TRI) { +/// Adds all callee saved registers to \p LiveRegs. +static void addCalleeSavedRegs(LivePhysRegs &LiveRegs, + const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; - ++CSR) + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) LiveRegs.addReg(*CSR); +} + +/// Adds pristine registers to the given \p LiveRegs. Pristine registers are +/// callee saved registers that are unused in the function. +static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveRegs, MF); + /// Remove the ones that are not saved/restored; they are pristine. for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveRegs.removeReg(Info.getReg()); } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); + if (!MBB.succ_empty()) { + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers that are saved and + // restored (somewhere); This does not include callee saved registers that + // are unused and hence not saved and restored; they are called pristine. + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + addReg(Info.getReg()); + } + } } void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { - const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - if (MBB.isReturnBlock()) { - // The return block has no successors whose live-ins we could merge - // below. So instead we add the callee saved registers manually. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I) - addReg(*I); - } else { - addPristines(*this, MF, MFI, *TRI); - } + if (!MBB.succ_empty()) { + const MachineFunction &MF = *MBB.getParent(); + addPristines(*this, MF); + addLiveOutsNoPristines(MBB); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. 
+ const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); } - - addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addPristines(*this, MF, MFI, *TRI); + addPristines(*this, MF); addBlockLiveIns(MBB); } -void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +void llvm::computeLiveIns(LivePhysRegs &LiveRegs, + const MachineRegisterInfo &MRI, MachineBasicBlock &MBB) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); assert(MBB.livein_empty()); LiveRegs.init(TRI); LiveRegs.addLiveOutsNoPristines(MBB); @@ -209,10 +228,12 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, LiveRegs.stepBackward(MI); for (unsigned Reg : LiveRegs) { + if (MRI.isReserved(Reg)) + continue; // Skip the register if we are about to add one of its super registers. bool ContainsSuperReg = false; for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) { - if (LiveRegs.contains(*SReg)) { + if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) { ContainsSuperReg = true; break; } diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index dbf1f96102d1..b51f8b0aa6bb 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -25,10 +25,10 @@ using namespace llvm; #define DEBUG_TYPE "livestacks" char LiveStacks::ID = 0; -INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks", +INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(LiveStacks, "livestacks", +INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) char &llvm::LiveStacksID = LiveStacks::ID; diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index e189fb0dd89d..17cab0ae910e 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -103,10 +103,10 @@ namespace { char LocalStackSlotPass::ID = 0; char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID; -INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp index 5fb5b747f471..0fc48d4e0b6b 100644 --- a/lib/CodeGen/LowerEmuTLS.cpp +++ b/lib/CodeGen/LowerEmuTLS.cpp @@ -53,7 +53,7 @@ private: char LowerEmuTLS::ID = 0; -INITIALIZE_PASS(LowerEmuTLS, "loweremutls", +INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE, "Add __emutls_[vt]. 
variables for emultated TLS model", false, false) diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 9c7367b4c780..4d1ec11df46c 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; -#define DEBUG_TYPE "block-freq" +#define DEBUG_TYPE "machine-block-freq" static cl::opt ViewMachineBlockFreqPropagationDAG( @@ -149,11 +149,11 @@ struct DOTGraphTraits } // end namespace llvm -INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) char MachineBlockFrequencyInfo::ID = 0; diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index adfca9a46239..c1ca8e8e83b4 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -499,13 +499,13 @@ public: char MachineBlockPlacement::ID = 0; char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID; -INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) #ifndef NDEBUG diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 0766f465456c..34f6bbd59e9b 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -108,12 +108,12 @@ namespace { char MachineCSE::ID = 0; char &llvm::MachineCSEID = MachineCSE::ID; -INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) /// The source register of a COPY machine instruction can be propagated to all /// its users, and this propagation could increase the probability of finding @@ -180,8 +180,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, I = skipDebugInstructionsForward(I, E); if (I == E) - // Reached end of block, register is obviously dead. - return true; + // Reached end of block, we don't know if register is dead or not. 
+ return false; bool SeenDef = false; for (const MachineOperand &MO : I->operands()) { diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 50e453e4067c..c176de16b593 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -86,11 +86,11 @@ private: char MachineCombiner::ID = 0; char &llvm::MachineCombinerID = MachineCombiner::ID; -INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", +INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", +INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 7312dc5e94bd..f83b5481e0a5 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "codegen-cp" +#define DEBUG_TYPE "machine-cp" STATISTIC(NumDeletes, "Number of dead copies deleted"); @@ -79,7 +79,7 @@ namespace { char MachineCopyPropagation::ID = 0; char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; -INITIALIZE_PASS(MachineCopyPropagation, "machine-cp", +INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) /// Remove any entry in \p Map where the register is a subregister or equal to diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 7eb991744f01..95c62d820b0e 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -38,7 +38,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "machine-licm" +#define DEBUG_TYPE "machinelicm" static cl::opt AvoidSpeculation("avoid-speculation", @@ -237,13 +237,13 @@ namespace { char MachineLICM::ID = 0; char &llvm::MachineLICMID = MachineLICM::ID; -INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) /// Test if the given loop is the outer-most loop that has a unique predecessor. 
static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp index 581a8ad81149..9ea3c00a2fc4 100644 --- a/lib/CodeGen/MachineOutliner.cpp +++ b/lib/CodeGen/MachineOutliner.cpp @@ -901,7 +901,7 @@ namespace llvm { ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); } } -INITIALIZE_PASS(MachineOutliner, "machine-outliner", +INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false, false) void MachineOutliner::pruneOverlaps(std::vector &CandidateList, diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index d06c38cf4ed8..8f5ac8b3fc45 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -715,13 +715,13 @@ char MachinePipeliner::ID = 0; int MachinePipeliner::NumTries = 0; #endif char &llvm::MachinePipelinerID = MachinePipeliner::ID; -INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) /// The "main" function for implementing Swing Modulo Scheduling. diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 41e161f71e53..edc3783afa2f 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -69,7 +69,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace llvm { @@ -191,13 +191,13 @@ char MachineScheduler::ID = 0; char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() @@ -532,7 +532,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // thumb2 size reduction is currently an exception, so the PostMIScheduler // needs to do this. if (FixKillFlags) - Scheduler.fixupKills(&*MBB); + Scheduler.fixupKills(*MBB); } Scheduler.finalizeSchedule(); } @@ -3233,6 +3233,12 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) return; + // Keep clustered nodes together. + if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), + Cand.SU == DAG->getNextClusterSucc(), + TryCand, Cand, Cluster)) + return; + // Avoid critical resource consumption and balance the schedule. 
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 5f87b68123f1..7c34e71a0cce 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -173,14 +173,14 @@ namespace { char MachineSinking::ID = 0; char &llvm::MachineSinkingID = MachineSinking::ID; -INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB) { diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 998a9645e68b..01391a1a0e50 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -44,12 +44,12 @@ using namespace llvm; char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index b53b002f55a6..265f93c363ca 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -87,7 +87,6 @@ namespace { RegSet regsLive; RegVector regsDefined, regsDead, regsKilled; RegMaskVector regMasks; - RegSet regsLiveInButUnused; SlotIndex lastIndex; @@ -419,7 +418,6 @@ unsigned MachineVerifier::verify(MachineFunction &MF) { regsDead.clear(); regsKilled.clear(); regMasks.clear(); - regsLiveInButUnused.clear(); MBBInfoMap.clear(); return foundErrors; @@ -756,7 +754,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.insert(*SubRegs); } } - regsLiveInButUnused = regsLive; const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); @@ -1268,8 +1265,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Both use and def operands can read a register. 
if (MO->readsReg()) { - regsLiveInButUnused.erase(Reg); - if (MO->isKill()) addRegWithSubRegs(regsKilled, Reg); diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 2a8531f337a0..76ad668104b4 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -23,7 +23,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "phi-opt" +#define DEBUG_TYPE "opt-phis" STATISTIC(NumPHICycles, "Number of PHI cycles replaced"); STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); @@ -59,7 +59,7 @@ namespace { char OptimizePHIs::ID = 0; char &llvm::OptimizePHIsID = OptimizePHIs::ID; -INITIALIZE_PASS(OptimizePHIs, "opt-phis", +INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE, "Optimize machine instruction PHIs", false, false) bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index db2264b2439d..9c898fa40d7e 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -112,11 +112,11 @@ STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; char& llvm::PHIEliminationID = PHIElimination::ID; -INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_END(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 61dccdde8f1d..f2249f9e37e0 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -200,7 +200,7 @@ namespace { char &llvm::PostRASchedulerID = PostRAScheduler::ID; -INITIALIZE_PASS(PostRAScheduler, "post-RA-sched", +INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE, "Post RA top-down list latency scheduler", false, false) SchedulePostRATDList::SchedulePostRATDList( @@ -367,7 +367,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(&MBB); + Scheduler.fixupKills(MBB); } return true; diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index d27ea2f51867..0118580a626a 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -20,7 +20,7 @@ using namespace llvm; -#define DEBUG_TYPE "processimplicitdefs" +#define DEBUG_TYPE "processimpdefs" namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def @@ -51,9 +51,7 @@ public: char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; -INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", - "Process Implicit Definitions", false, false) -INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", +INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE, "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index aaa253fde494..a9813e534c5f 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "pei" +#define DEBUG_TYPE 
"prologepilog" typedef SmallVector MBBVector; static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, @@ -129,12 +129,12 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1), cl::desc("Warn for stack size bigger than the given" " number")); -INITIALIZE_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", false, +INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(PEI, "prologepilog", +INITIALIZE_PASS_END(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion & Frame Finalization", false, false) @@ -450,12 +450,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector &CSI = MFI.getCalleeSavedInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { for (MachineBasicBlock *MBB : Visited) { MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. - if (!MBB->isLiveIn(Reg)) + if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } } diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp index 2f7ee8bf414c..cc32e43968bb 100644 --- a/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/lib/CodeGen/RenameIndependentSubregs.cpp @@ -112,11 +112,11 @@ char RenameIndependentSubregs::ID; char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID; -INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const { diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 2771fdbd737a..8584a9b7c897 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -52,7 +52,7 @@ using namespace llvm; using namespace llvm::safestack; -#define DEBUG_TYPE "safestack" +#define DEBUG_TYPE "safe-stack" namespace llvm { @@ -820,10 +820,10 @@ public: } // anonymous namespace char SafeStackLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, "safe-stack", +INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE, "Safe Stack instrumentation pass", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(SafeStackLegacyPass, "safe-stack", +INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE, "Safe Stack instrumentation pass", false, false) FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); } diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp index dab5b91f50ad..07b43a82ca99 100644 --- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -49,12 +49,8 @@ private: } // namespace char ScalarizeMaskedMemIntrin::ID = 0; -INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin", - "Scalarize unsupported masked memory intrinsics", false, - false) -INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin", - "Scalarize unsupported masked memory intrinsics", false, - 
false) +INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE, + "Scalarize unsupported masked memory intrinsics", false, false) FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() { return new ScalarizeMaskedMemIntrin(); diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 18823b74c47f..8035ea80364b 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1057,179 +1057,71 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, loads.dump()); } -void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { - // Start with no live registers. - LiveRegs.reset(); +static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, + MachineInstr &MI, bool addToLiveRegs) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; - // Examine the live-in regs of all successors. - for (const MachineBasicBlock *Succ : BB->successors()) { - for (const auto &LI : Succ->liveins()) { - // Repeat, for reg and all subregs. - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); - } + // Things that are available after the instruction are killed by it. + bool IsKill = LiveRegs.available(MRI, Reg); + MO.setIsKill(IsKill); + if (IsKill && addToLiveRegs) + LiveRegs.addReg(Reg); } } -/// \brief If we change a kill flag on the bundle instruction implicit register -/// operands, then we also need to propagate that to any instructions inside -/// the bundle which had the same kill state. -static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, - bool NewKillState, - const TargetRegisterInfo *TRI) { - if (MI->getOpcode() != TargetOpcode::BUNDLE) - return; +void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { + DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n'); - // Walk backwards from the last instruction in the bundle to the first. - // Once we set a kill flag on an instruction, we bail out, as otherwise we - // might set it on too many operands. We will clear as many flags as we - // can though. - MachineBasicBlock::instr_iterator Begin = MI->getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (Begin != End) { - if (NewKillState) { - if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) - return; - } else - (--End)->clearRegisterKills(Reg, TRI); - } -} - -void ScheduleDAGInstrs::toggleKillFlag(MachineInstr &MI, MachineOperand &MO) { - if (MO.isDebug()) - return; - - // Setting kill flag... - if (!MO.isKill()) { - MO.setIsKill(true); - toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); - return; - } - - // If MO itself is live, clear the kill flag... - if (LiveRegs.test(MO.getReg())) { - MO.setIsKill(false); - toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); - return; - } - - // If any subreg of MO is live, then create an imp-def for that - // subreg and keep MO marked as killed. 
- MO.setIsKill(false); - toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); - bool AllDead = true; - const unsigned SuperReg = MO.getReg(); - MachineInstrBuilder MIB(MF, &MI); - for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - MIB.addReg(*SubRegs, RegState::ImplicitDefine); - AllDead = false; - } - } - - if(AllDead) { - MO.setIsKill(true); - toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); - } -} - -void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { - // FIXME: Reuse the LivePhysRegs utility for this. - DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n'); - - LiveRegs.resize(TRI->getNumRegs()); - BitVector killedRegs(TRI->getNumRegs()); - - startBlockForKills(MBB); + LiveRegs.init(*TRI); + LiveRegs.addLiveOuts(MBB); // Examine block from end to start... - unsigned Count = MBB->size(); - for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin(); - I != E; --Count) { - MachineInstr &MI = *--I; + for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { if (MI.isDebugValue()) continue; // Update liveness. Registers that are defed but not used in this // instruction are now dead. Mark register and all subregs as they // are completely defined. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isRegMask()) - LiveRegs.clearBitsNotInMask(MO.getRegMask()); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0) continue; - if (!MO.isDef()) continue; - // Ignore two-addr defs. - if (MI.isRegTiedToUseOperand(i)) continue; - - // Repeat for reg and all subregs. - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.reset(*SubRegs); + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; + if (MO.isReg()) { + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + LiveRegs.removeReg(Reg); + } else if (MO.isRegMask()) { + LiveRegs.removeRegsInMask(MO); + } } - // Examine all used registers and set/clear kill flag. When a - // register is used multiple times we only set the kill flag on - // the first use. Don't set kill flags on undef operands. - killedRegs.reset(); - - // toggleKillFlag can append new operands (implicit defs), so using - // a range-based loop is not safe. The new operands will be appended - // at the end of the operand list and they don't need to be visited, - // so iterating until the currently last operand is ok. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - bool kill = false; - if (!killedRegs.test(Reg)) { - kill = true; - // A register is not killed if any subregs are live... - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - kill = false; - break; - } - } - - // If subreg is not live, then register is killed if it became - // live in this instruction - if (kill) - kill = !LiveRegs.test(Reg); + // If there is a bundle header fix it up first. 
+ if (!MI.isBundled()) { + toggleKills(MRI, LiveRegs, MI, true); + } else { + MachineBasicBlock::instr_iterator First = MI.getIterator(); + if (MI.isBundle()) { + toggleKills(MRI, LiveRegs, MI, false); + ++First; } - - if (MO.isKill() != kill) { - DEBUG(dbgs() << "Fixing " << MO << " in "); - toggleKillFlag(MI, MO); - DEBUG(MI.dump()); - DEBUG({ - if (MI.getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI.getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (++Begin != End) - DEBUG(Begin->dump()); - } - }); - } - - killedRegs.set(Reg); - } - - // Mark any used register (that is not using undef) and subregs as - // now live... - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); + // Some targets make the (questionable) assumtion that the instructions + // inside the bundle are ordered and consequently only the last use of + // a register inside the bundle can kill it. + MachineBasicBlock::instr_iterator I = std::next(First); + while (I->isBundledWithSucc()) + ++I; + do { + if (!I->isDebugValue()) + toggleKills(MRI, LiveRegs, *I, true); + --I; + } while(I != First); } } } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d450e7e078c..23a302f3e561 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12349,9 +12349,9 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast(Val)) { - StoreInt |= C->getAPIntValue().zext(SizeInBits); + StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast(Val)) { - StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits); + StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); } else { llvm_unreachable("Invalid constant element type"); } @@ -12617,16 +12617,19 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); bool IsFast = false; if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i + 1; // Or check whether a truncstore is legal. - } else if (TLI.getTypeAction(Context, StoreTy) == + } else if (!LegalTypes && + TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { @@ -12642,7 +12645,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { !NoVectors) { // Find a legal type for the vector store. 
EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); - if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) && + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) @@ -12700,7 +12703,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); bool IsFast; - if (TLI.isTypeLegal(Ty) && + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) @@ -12810,6 +12813,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && @@ -12823,6 +12827,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && @@ -12834,7 +12839,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + if (!LegalTypes && + TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, @@ -14455,6 +14462,145 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +/// If we are extracting a subvector produced by a wide binary operator with at +/// at least one operand that was the result of a vector concatenation, then try +/// to use the narrow vector operands directly to avoid the concatenation and +/// extraction. +static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share + // some of these bailouts with other transforms. + + // The extract index must be a constant, so we can map it to a concat operand. + auto *ExtractIndex = dyn_cast(Extract->getOperand(1)); + if (!ExtractIndex) + return SDValue(); + + // Only handle the case where we are doubling and then halving. A larger ratio + // may require more than two narrow binops to replace the wide binop. + EVT VT = Extract->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + assert((ExtractIndex->getZExtValue() % NumElems) == 0 && + "Extract index is not a multiple of the vector length."); + if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2) + return SDValue(); + + // We are looking for an optionally bitcasted wide vector binary operator + // feeding an extract subvector. + SDValue BinOp = Extract->getOperand(0); + if (BinOp.getOpcode() == ISD::BITCAST) + BinOp = BinOp.getOperand(0); + + // TODO: The motivating case for this transform is an x86 AVX1 target. 
That + // target has temptingly almost legal versions of bitwise logic ops in 256-bit + // flavors, but no other 256-bit integer support. This could be extended to + // handle any binop, but that may require fixing/adding other folds to avoid + // codegen regressions. + unsigned BOpcode = BinOp.getOpcode(); + if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) + return SDValue(); + + // The binop must be a vector type, so we can chop it in half. + EVT WideBVT = BinOp.getValueType(); + if (!WideBVT.isVector()) + return SDValue(); + + // Bail out if the target does not support a narrower version of the binop. + EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), + WideBVT.getVectorNumElements() / 2); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) + return SDValue(); + + // Peek through bitcasts of the binary operator operands if needed. + SDValue LHS = BinOp.getOperand(0); + if (LHS.getOpcode() == ISD::BITCAST) + LHS = LHS.getOperand(0); + + SDValue RHS = BinOp.getOperand(1); + if (RHS.getOpcode() == ISD::BITCAST) + RHS = RHS.getOperand(0); + + // We need at least one concatenation operation of a binop operand to make + // this transform worthwhile. The concat must double the input vector sizes. + // TODO: Should we also handle INSERT_SUBVECTOR patterns? + bool ConcatL = + LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; + bool ConcatR = + RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; + if (!ConcatL && !ConcatR) + return SDValue(); + + // If one of the binop operands was not the result of a concat, we must + // extract a half-sized operand for our new narrow binop. We can't just reuse + // the original extract index operand because we may have bitcasted. + unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; + unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); + EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + SDLoc DL(Extract); + + // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN + // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) + // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN + SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(0), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(1), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); + return DAG.getBitcast(VT, NarrowBinOp); +} + +/// If we are extracting a subvector from a wide vector load, convert to a +/// narrow load to eliminate the extraction: +/// (extract_subvector (load wide vector)) --> (load narrow vector) +static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Add support for big-endian. The offset calculation must be adjusted. + if (DAG.getDataLayout().isBigEndian()) + return SDValue(); + + // TODO: The one-use check is overly conservative. Check the cost of the + // extract instead or remove that condition entirely. 
+ auto *Ld = dyn_cast(Extract->getOperand(0)); + auto *ExtIdx = dyn_cast(Extract->getOperand(1)); + if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx) + return SDValue(); + + // The narrow load will be offset from the base address of the old load if + // we are extracting from something besides index 0 (little-endian). + EVT VT = Extract->getValueType(0); + SDLoc DL(Extract); + SDValue BaseAddr = Ld->getOperand(1); + unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + + // TODO: Use "BaseIndexOffset" to make this more effective. + SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, + VT.getStoreSize()); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); + + // The new load must have the same position as the old load in terms of memory + // dependency. Create a TokenFactor for Ld and NewLd and update uses of Ld's + // output chain to use that TokenFactor. + // TODO: This code is based on a similar sequence in x86 lowering. It should + // be moved to a helper function, so it can be shared and reused. + if (Ld->hasAnyUseOfValue(1)) { + SDValue OldChain = SDValue(Ld, 1); + SDValue NewChain = SDValue(NewLd.getNode(), 1); + SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + OldChain, NewChain); + DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor); + DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain); + } + + return NewLd; +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); @@ -14463,6 +14609,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { if (V.isUndef()) return DAG.getUNDEF(NVT); + if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) + if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) + return NarrowLoad; + // Combine: // (extract_subvec (concat V1, V2, ...), i) // Into: @@ -14510,6 +14660,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { } } + if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) + return NarrowBOp; + return SDValue(); } @@ -14745,10 +14898,10 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, // This is often generated during legalization. // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. -SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, - SelectionDAG &DAG, - const TargetLowering &TLI, - bool LegalOperations) { +static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); @@ -14795,7 +14948,8 @@ SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, // destination type. This is often generated during legalization. // If the source node itself was a '*_extend_vector_inreg' node then we should // then be able to remove it. 
-SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { +static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 9a47a914df91..d0a8b34c69c6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -899,6 +899,39 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { } } +static TargetLowering::LegalizeAction +getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) { + unsigned EqOpc; + switch (Opcode) { + default: llvm_unreachable("Unexpected FP pseudo-opcode"); + case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; + case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; + case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; + case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; + case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break; + case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; + case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; + case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; + case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; + } + + auto Action = TLI.getOperationAction(EqOpc, VT); + + // We don't currently handle Custom or Promote for strict FP pseudo-ops. + // For now, we just expand for those cases. + if (Action != TargetLowering::Legal) + Action = TargetLowering::Expand; + + // ISD::FPOWI returns 'Legal' even though it should be expanded. + if (Opcode == ISD::STRICT_FPOWI && Action == TargetLowering::Legal) + Action = TargetLowering::Expand; + + return Action; +} + /// Return a legal replacement for the given operation, with all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); @@ -1043,6 +1076,25 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + // These pseudo-ops get legalized as if they were their non-strict + // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT + // is also legal, but if ISD::FSQRT requires expansion then so does + // ISD::STRICT_FSQRT. 
+ Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(), + Node->getValueType(0)); + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { @@ -2032,6 +2084,9 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -3907,16 +3962,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMAX_PPCF128)); break; case ISD::FSQRT: + case ISD::STRICT_FSQRT: Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128)); break; case ISD::FSIN: + case ISD::STRICT_FSIN: Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128)); break; case ISD::FCOS: + case ISD::STRICT_FCOS: Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128)); @@ -3926,26 +3984,31 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: + case ISD::STRICT_FLOG: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128)); break; case ISD::FLOG2: + case ISD::STRICT_FLOG2: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128)); break; case ISD::FLOG10: + case ISD::STRICT_FLOG10: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128)); break; case ISD::FEXP: + case ISD::STRICT_FEXP: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128)); break; case ISD::FEXP2: + case ISD::STRICT_FEXP2: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128)); @@ -3966,11 +4029,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::CEIL_PPCF128)); break; case ISD::FRINT: + case ISD::STRICT_FRINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128)); break; case ISD::FNEARBYINT: + case ISD::STRICT_FNEARBYINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, @@ -3985,11 +4050,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::ROUND_PPCF128)); break; case ISD::FPOWI: + case ISD::STRICT_FPOWI: Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128)); break; case ISD::FPOW: + case ISD::STRICT_FPOW: Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 16c1f78f1b35..177898e1e950 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4779,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, DAG.getMachineFunction()); if (VT == MVT::Other) { - if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) || 
- TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) { - VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS); - } else { - switch (DstAlign & 7) { - case 0: VT = MVT::i64; break; - case 4: VT = MVT::i32; break; - case 2: VT = MVT::i16; break; - default: VT = MVT::i8; break; - } - } + // Use the largest integer type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + VT = MVT::i64; + while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && + !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) + VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + assert(VT.isInteger()); + // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); + // If the type we've chosen is larger than the largest legal integer type + // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } @@ -6542,6 +6542,63 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, return N; } +SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { + unsigned OrigOpc = Node->getOpcode(); + unsigned NewOpc; + bool IsUnary = false; + switch (OrigOpc) { + default: + llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); + case ISD::STRICT_FADD: NewOpc = ISD::FADD; break; + case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break; + case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; + case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break; + case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; + case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; + case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break; + case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break; + case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break; + case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break; + case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break; + case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break; + case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break; + case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break; + case ISD::STRICT_FNEARBYINT: + NewOpc = ISD::FNEARBYINT; + IsUnary = true; + break; + } + + // We're taking this node out of the chain, so we need to re-link things. + SDValue InputChain = Node->getOperand(0); + SDValue OutputChain = SDValue(Node, 1); + ReplaceAllUsesOfValueWith(OutputChain, InputChain); + + SDVTList VTs = getVTList(Node->getOperand(1).getValueType()); + SDNode *Res = nullptr; + if (IsUnary) + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); + else + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), + Node->getOperand(2) }); + + // MorphNodeTo can operate in two ways: if an existing node with the + // specified operands exists, it can just return it. Otherwise, it + // updates the node in place to have the requested operands. + if (Res == Node) { + // If we updated the node in place, reset the node ID. To the isel, + // this should be just like a newly allocated machine node. + Res->setNodeId(-1); + } else { + ReplaceAllUsesWith(Node, Res); + RemoveDeadNode(Node); + } + + return Res; +} + /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. 
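The new SelectionDAG::mutateStrictFPToFP above is called from both ExpandFPLibCall and instruction selection: it takes the constrained pseudo-op out of the chain and rewrites it as the ordinary FP node. A minimal standalone sketch of that idea in plain C++ (illustrative names only, not the SelectionDAG API; the real code also redirects users of the node's chain result to the incoming chain and morphs the node in place):

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative opcode set; the real mapping covers every STRICT_* node above.
enum Opcode { FADD, FSQRT, STRICT_FADD, STRICT_FSQRT };

struct Node {
  Opcode Opc;
  std::vector<int> Ops; // Ops[0] stands in for the chain on STRICT_* nodes.
};

// Swap the pseudo-opcode for its normal counterpart and drop the chain operand.
static Node mutateStrictToPlain(const Node &N) {
  Opcode NewOpc = N.Opc;
  switch (N.Opc) {
  case STRICT_FADD:  NewOpc = FADD;  break;
  case STRICT_FSQRT: NewOpc = FSQRT; break;
  default: assert(false && "not a strict FP pseudo-op");
  }
  return Node{NewOpc, std::vector<int>(N.Ops.begin() + 1, N.Ops.end())};
}

int main() {
  Node Strict{STRICT_FSQRT, {/*chain*/ 0, /*value*/ 7}};
  Node Plain = mutateStrictToPlain(Strict);
  std::printf("opcode %d, %zu operand(s)\n", int(Plain.Opc), Plain.Ops.size());
  return 0;
}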
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 57d340c41c39..b895da21a7ff 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4736,24 +4736,15 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, DIExpression *Expr, int64_t Offset, const DebugLoc &dl, unsigned DbgSDNodeOrder) { - SDDbgValue *SDV; - auto *FISDN = dyn_cast(N.getNode()); - if (FISDN && Expr->startsWithDeref()) { + if (auto *FISDN = dyn_cast(N.getNode())) { // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe // stack slot locations as such instead of as indirectly addressed // locations. - ArrayRef TrailingElements(Expr->elements_begin() + 1, - Expr->elements_end()); - DIExpression *DerefedDIExpr = - DIExpression::get(*DAG.getContext(), TrailingElements); - int FI = FISDN->getIndex(); - SDV = DAG.getFrameIndexDbgValue(Variable, DerefedDIExpr, FI, 0, dl, - DbgSDNodeOrder); - } else { - SDV = DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, - Offset, dl, DbgSDNodeOrder); + return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), 0, dl, + DbgSDNodeOrder); } - return SDV; + return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, + Offset, dl, DbgSDNodeOrder); } // VisualStudio defines setjmp as _setjmp @@ -5254,7 +5245,19 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: - visitConstrainedFPIntrinsic(I, Intrinsic); + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + visitConstrainedFPIntrinsic(cast(I)); return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); @@ -5752,11 +5755,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } } -void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, - unsigned Intrinsic) { +void SelectionDAGBuilder::visitConstrainedFPIntrinsic( + const ConstrainedFPIntrinsic &FPI) { SDLoc sdl = getCurSDLoc(); unsigned Opcode; - switch (Intrinsic) { + switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::experimental_constrained_fadd: Opcode = ISD::STRICT_FADD; @@ -5773,23 +5776,64 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, case Intrinsic::experimental_constrained_frem: Opcode = ISD::STRICT_FREM; break; + case Intrinsic::experimental_constrained_sqrt: + Opcode = ISD::STRICT_FSQRT; + break; + case Intrinsic::experimental_constrained_pow: + Opcode = ISD::STRICT_FPOW; + break; + case Intrinsic::experimental_constrained_powi: + Opcode = ISD::STRICT_FPOWI; + break; + case Intrinsic::experimental_constrained_sin: + Opcode = ISD::STRICT_FSIN; + break; + case Intrinsic::experimental_constrained_cos: + Opcode = ISD::STRICT_FCOS; + break; + case Intrinsic::experimental_constrained_exp: + Opcode = ISD::STRICT_FEXP; + break; + case Intrinsic::experimental_constrained_exp2: + Opcode = ISD::STRICT_FEXP2; + break; + case Intrinsic::experimental_constrained_log: + Opcode = ISD::STRICT_FLOG; + break; + case Intrinsic::experimental_constrained_log10: + Opcode = ISD::STRICT_FLOG10; + break; + case Intrinsic::experimental_constrained_log2: + Opcode = ISD::STRICT_FLOG2; + break; + case Intrinsic::experimental_constrained_rint: + Opcode = ISD::STRICT_FRINT; + break; + case Intrinsic::experimental_constrained_nearbyint: + Opcode = ISD::STRICT_FNEARBYINT; + break; } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getRoot(); - SDValue Ops[3] = { Chain, getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)) }; SmallVector ValueVTs; - ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); + ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs); ValueVTs.push_back(MVT::Other); // Out chain SDVTList VTs = DAG.getVTList(ValueVTs); - SDValue Result = DAG.getNode(Opcode, sdl, VTs, Ops); + SDValue Result; + if (FPI.isUnaryOp()) + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)) }); + else + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)), + getValue(FPI.getArgOperand(1)) }); assert(Result.getNode()->getNumValues() == 2); SDValue OutChain = Result.getValue(1); DAG.setRoot(OutChain); SDValue FPResult = Result.getValue(0); - setValue(&I, FPResult); + setValue(&FPI, FPResult); } std::pair diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index bdaee858da61..77e131fa551c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -895,7 +895,7 @@ private: void visitInlineAsm(ImmutableCallSite CS); const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); - void visitConstrainedFPIntrinsic(const CallInst &I, unsigned Intrinsic); + void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVAStart(const CallInst &I); void visitVAArg(const VAArgInst &I); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 5e0feccb6b4c..687b882c5e4d 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -905,50 +905,6 @@ public: } // end anonymous namespace -static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) { - unsigned OrigOpc = Node->getOpcode(); - switch (OrigOpc) { - case ISD::STRICT_FADD: NewOpc = ISD::FADD; return true; - case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; return true; - case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; return true; - case 
ISD::STRICT_FDIV: NewOpc = ISD::FDIV; return true; - case ISD::STRICT_FREM: NewOpc = ISD::FREM; return true; - default: return false; - } -} - -SDNode* SelectionDAGISel::MutateStrictFPToFP(SDNode *Node, unsigned NewOpc) { - assert(((Node->getOpcode() == ISD::STRICT_FADD && NewOpc == ISD::FADD) || - (Node->getOpcode() == ISD::STRICT_FSUB && NewOpc == ISD::FSUB) || - (Node->getOpcode() == ISD::STRICT_FMUL && NewOpc == ISD::FMUL) || - (Node->getOpcode() == ISD::STRICT_FDIV && NewOpc == ISD::FDIV) || - (Node->getOpcode() == ISD::STRICT_FREM && NewOpc == ISD::FREM)) && - "Unexpected StrictFP opcode!"); - - // We're taking this node out of the chain, so we need to re-link things. - SDValue InputChain = Node->getOperand(0); - SDValue OutputChain = SDValue(Node, 1); - CurDAG->ReplaceAllUsesOfValueWith(OutputChain, InputChain); - - SDVTList VTs = CurDAG->getVTList(Node->getOperand(1).getValueType()); - SDValue Ops[2] = { Node->getOperand(1), Node->getOperand(2) }; - SDNode *Res = CurDAG->MorphNodeTo(Node, NewOpc, VTs, Ops); - - // MorphNodeTo can operate in two ways: if an existing node with the - // specified operands exists, it can just return it. Otherwise, it - // updates the node in place to have the requested operands. - if (Res == Node) { - // If we updated the node in place, reset the node ID. To the isel, - // this should be just like a newly allocated machine node. - Res->setNodeId(-1); - } else { - CurDAG->ReplaceAllUsesWith(Node, Res); - CurDAG->RemoveDeadNode(Node); - } - - return Res; -} - void SelectionDAGISel::DoInstructionSelection() { DEBUG(dbgs() << "===== Instruction selection begins: BB#" << FuncInfo->MBB->getNumber() @@ -992,15 +948,12 @@ void SelectionDAGISel::DoInstructionSelection() { // If the current node is a strict FP pseudo-op, the isStrictFPOp() // function will provide the corresponding normal FP opcode to which the // node should be mutated. - unsigned NormalFPOpc = ISD::UNDEF; - bool IsStrictFPOp = isStrictFPOp(Node, NormalFPOpc); - if (IsStrictFPOp) - Node = MutateStrictFPToFP(Node, NormalFPOpc); + // + // FIXME: The backends need a way to handle FP constraints. + if (Node->isStrictFPOpcode()) + Node = CurDAG->mutateStrictFPToFP(Node); Select(Node); - - // FIXME: Add code here to attach an implicit def and use of - // target-specific FP environment registers. } CurDAG->setRoot(Dummy.getValue()); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index befbd80d7965..0dffffee9976 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -603,11 +603,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), ~Known.Zero & NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. 
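The assertion rewrites in the surrounding hunks are a readability change: KnownBits::hasConflict() expresses exactly the invariant the old (Known.Zero & Known.One) == 0 checks spelled out by hand. A small standalone model of that invariant (illustrative, not the llvm::KnownBits class):

#include <cassert>
#include <cstdint>

struct KnownBitsModel {
  uint64_t Zero = 0; // bits proven to be 0
  uint64_t One = 0;  // bits proven to be 1
  // True iff some bit is claimed to be both 0 and 1, i.e. the analysis state
  // is self-contradictory. Equivalent to the old hand-written assert condition.
  bool hasConflict() const { return (Zero & One) != 0; }
};

int main() {
  KnownBitsModel Known;
  Known.Zero = 0xF0; // high nibble known zero
  Known.One = 0x0F;  // low nibble known one
  assert(!Known.hasConflict() && "Bits known to be one AND zero?");
  return 0;
}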
@@ -633,11 +633,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, case ISD::OR: if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), ~Known.One & NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. @@ -660,10 +660,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, case ISD::XOR: { if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. @@ -725,8 +725,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return true; if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(Op, NewMask, TLO)) @@ -741,8 +741,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return true; if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(Op, NewMask, TLO)) @@ -907,7 +907,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // Compute the new bits that are at the top now. 
if (SimplifyDemandedBits(InOp, InDemandedMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); @@ -947,7 +947,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); @@ -1029,7 +1029,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. @@ -1084,7 +1084,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); Known.Zero |= NewBits; break; @@ -1134,7 +1134,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt InMask = NewMask.trunc(OperandBitWidth); if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); break; } @@ -1193,7 +1193,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case ISD::AssertZext: { @@ -1205,7 +1205,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero |= ~InMask; break; diff --git a/lib/CodeGen/ShadowStackGCLowering.cpp b/lib/CodeGen/ShadowStackGCLowering.cpp index ff7d205c1f4c..6750fde57638 100644 --- a/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/lib/CodeGen/ShadowStackGCLowering.cpp @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "shadowstackgclowering" +#define DEBUG_TYPE "shadow-stack-gc-lowering" namespace { @@ -66,10 +66,10 @@ private: }; } -INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(GCModuleInfo) -INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); } diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 2638702da152..aa75f5e2caa2 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -210,13 +210,12 @@ public: char ShrinkWrap::ID = 0; char 
&llvm::ShrinkWrapID = ShrinkWrap::ID; -INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, - false) +INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) +INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const { diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index e9eff4d0acb2..09e9c3bb3354 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -74,7 +74,7 @@ private: } // end anonymous namespace char SjLjEHPrepare::ID = 0; -INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions", +INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions", false, false) // Public Interface To the SjLjEHPrepare pass. diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index bc2a1d09056b..3656832a7f1a 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "slotindexes" char SlotIndexes::ID = 0; -INITIALIZE_PASS(SlotIndexes, "slotindexes", +INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE, "Slot index numbering", false, false) STATISTIC(NumLocalRenum, "Number of local renumberings"); diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 43cbf4add0f8..0abe1c47da55 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -40,14 +40,14 @@ using namespace llvm; -#define DEBUG_TYPE "spillplacement" +#define DEBUG_TYPE "spill-code-placement" char SpillPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) char &llvm::SpillPlacementID = SpillPlacement::ID; diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 86a16187fcb6..acb3676fdd71 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -53,7 +53,7 @@ using namespace llvm; -#define DEBUG_TYPE "stackcoloring" +#define DEBUG_TYPE "stack-coloring" static cl::opt DisableColoring("no-stack-coloring", @@ -371,12 +371,12 @@ private: char StackColoring::ID = 0; char &llvm::StackColoringID = StackColoring::ID; -INITIALIZE_PASS_BEGIN(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 5da77264261b..ca8bde2d114a 100644 --- a/lib/CodeGen/StackProtector.cpp +++ 
b/lib/CodeGen/StackProtector.cpp @@ -58,10 +58,10 @@ static cl::opt EnableSelectionDAGSP("enable-selectiondag-sp", cl::init(true), cl::Hidden); char StackProtector::ID = 0; -INITIALIZE_PASS_BEGIN(StackProtector, "stack-protector", +INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE, "Insert stack protectors", false, true) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(StackProtector, "stack-protector", +INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE, "Insert stack protectors", false, true) FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); } diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 234b2043a6a1..d1758ecbd79f 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -32,7 +32,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "stackslotcoloring" +#define DEBUG_TYPE "stack-slot-coloring" static cl::opt DisableSharing("no-stack-slot-sharing", @@ -116,12 +116,12 @@ namespace { char StackSlotColoring::ID = 0; char &llvm::StackSlotColoringID = StackSlotColoring::ID; -INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) namespace { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index e2377d89497d..ad0b04373656 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -40,8 +40,7 @@ char TailDuplicatePass::ID = 0; char &llvm::TailDuplicateID = TailDuplicatePass::ID; -INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false, - false) +INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false) bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp index d2414200e9d5..d40f7af431a9 100644 --- a/lib/CodeGen/TailDuplicator.cpp +++ b/lib/CodeGen/TailDuplicator.cpp @@ -749,7 +749,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, if (PredBB->succ_size() > 1) return false; - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond)) return false; @@ -832,7 +832,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, appendCopies(PredBB, CopyInfos, Copies); // Simplify - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 7392c8327148..552a89f76ca2 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; -#define DEBUG_TYPE "twoaddrinstr" +#define DEBUG_TYPE "twoaddressinstruction" STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions"); STATISTIC(NumCommuted , "Number of instructions commuted to coalesce"); @@ -171,10 +171,10 @@ public: } // end anonymous namespace char 
TwoAddressInstructionPass::ID = 0; -INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index a632b40c20f5..4e7542bf31e0 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -94,7 +94,7 @@ private: } // end anonymous namespace char WinEHPrepare::ID = 0; -INITIALIZE_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions", +INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions", false, false) FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); } diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 556ebf78622f..90193d07b95d 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(LLVMDebugInfoCodeView TypeDatabaseVisitor.cpp TypeDumpVisitor.cpp TypeIndex.cpp + TypeIndexDiscovery.cpp TypeRecordMapping.cpp TypeSerializer.cpp TypeStreamMerger.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index f95c3e79388e..705b548141b0 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -45,24 +45,9 @@ static Error visitKnownMember(CVMemberRecord &Record, } static Expected deserializeTypeServerRecord(CVType &Record) { - class StealTypeServerVisitor : public TypeVisitorCallbacks { - public: - explicit StealTypeServerVisitor(TypeServer2Record &TR) : TR(TR) {} - - Error visitKnownRecord(CVType &CVR, TypeServer2Record &Record) override { - TR = Record; - return Error::success(); - } - - private: - TypeServer2Record &TR; - }; - TypeServer2Record R(TypeRecordKind::TypeServer2); - StealTypeServerVisitor Thief(R); - if (auto EC = visitTypeRecord(Record, Thief)) + if (auto EC = TypeDeserializer::deserializeAs(Record, R)) return std::move(EC); - return R; } @@ -308,8 +293,9 @@ Error llvm::codeview::visitTypeRecord(CVType &Record, Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source, TypeServerHandler *TS) { - VisitHelper V(Callbacks, VDS_BytesPresent); + VisitHelper V(Callbacks, Source); if (TS) V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); diff --git a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp new file mode 100644 index 000000000000..11e2e215303c --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp @@ -0,0 +1,371 @@ +//===- TypeIndexDiscovery.cpp -----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; +using namespace llvm::codeview; + +static inline MethodKind getMethodKind(uint16_t Attrs) { + Attrs &= uint16_t(MethodOptions::MethodKindMask); + Attrs >>= 2; + return MethodKind(Attrs); +} + +static inline bool isIntroVirtual(uint16_t Attrs) { + MethodKind MK = getMethodKind(Attrs); + return MK == MethodKind::IntroducingVirtual || + MK == MethodKind::PureIntroducingVirtual; +} + +static inline PointerMode getPointerMode(uint32_t Attrs) { + return static_cast((Attrs >> PointerRecord::PointerModeShift) & + PointerRecord::PointerModeMask); +} + +static inline bool isMemberPointer(uint32_t Attrs) { + PointerMode Mode = getPointerMode(Attrs); + return Mode == PointerMode::PointerToDataMember || + Mode == PointerMode::PointerToMemberFunction; +} + +static inline uint32_t getEncodedIntegerLength(ArrayRef Data) { + uint16_t N = support::endian::read16le(Data.data()); + if (N < LF_NUMERIC) + return 2; + + assert(N <= LF_UQUADWORD); + + constexpr uint32_t Sizes[] = { + 1, // LF_CHAR + 2, // LF_SHORT + 2, // LF_USHORT + 4, // LF_LONG + 4, // LF_ULONG + 4, // LF_REAL32 + 8, // LF_REAL64 + 10, // LF_REAL80 + 16, // LF_REAL128 + 8, // LF_QUADWORD + 8, // LF_UQUADWORD + }; + + return Sizes[N - LF_NUMERIC]; +} + +static inline uint32_t getCStringLength(ArrayRef Data) { + const char *S = reinterpret_cast(Data.data()); + return strlen(S) + 1; +} + +static void handleMethodOverloadList(ArrayRef Content, + SmallVectorImpl &Refs) { + uint32_t Offset = 0; + + while (!Content.empty()) { + // Array of: + // 0: Attrs + // 2: Padding + // 4: TypeIndex + // if (isIntroVirtual()) + // 8: VFTableOffset + + // At least 8 bytes are guaranteed. 4 extra bytes come iff function is an + // intro virtual.
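The encoded-integer helper above relies on CodeView's numeric-leaf convention: a little-endian 16-bit leaf below LF_NUMERIC (0x8000) is itself the value, while larger leaves name the type of the payload that follows. A standalone decoder for the two simplest cases (illustrative only; leaf kinds beyond LF_USHORT are omitted):

#include <cstdint>
#include <cstdio>

constexpr uint16_t LF_NUMERIC = 0x8000; // first extended numeric leaf
constexpr uint16_t LF_USHORT = 0x8002;  // 16-bit unsigned payload follows

static uint16_t read16le(const uint8_t *P) {
  return uint16_t(P[0]) | uint16_t(uint16_t(P[1]) << 8);
}

// Decode one encoded integer, reporting how many bytes it occupied.
static uint32_t decodeNumeric(const uint8_t *P, uint32_t &Len) {
  uint16_t Leaf = read16le(P);
  if (Leaf < LF_NUMERIC) {
    Len = 2; // the leaf itself is the value
    return Leaf;
  }
  if (Leaf == LF_USHORT) {
    Len = 4; // 2-byte leaf plus 2-byte payload
    return read16le(P + 2);
  }
  Len = 2; // other leaf kinds elided in this sketch
  return 0;
}

int main() {
  const uint8_t Direct[] = {0x34, 0x12};               // value 0x1234
  const uint8_t Extended[] = {0x02, 0x80, 0xCD, 0xAB}; // LF_USHORT, 0xABCD
  uint32_t Len = 0;
  uint32_t V = decodeNumeric(Direct, Len);
  std::printf("0x%x in %u bytes\n", V, Len);
  V = decodeNumeric(Extended, Len);
  std::printf("0x%x in %u bytes\n", V, Len);
  return 0;
}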
+ uint32_t Len = 8; + + uint16_t Attrs = support::endian::read16le(Content.data()); + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + + if (LLVM_UNLIKELY(isIntroVirtual(Attrs))) + Len += 4; + Offset += Len; + Content = Content.drop_front(Len); + } +} + +static uint32_t handleBaseClass(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Encoded Integer + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getEncodedIntegerLength(Data.drop_front(8)); +} + +static uint32_t handleEnumerator(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: Encoded Integer + // : Name + uint32_t Size = 4 + getEncodedIntegerLength(Data.drop_front(4)); + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleDataMember(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Encoded Integer + // : Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + uint32_t Size = 8 + getEncodedIntegerLength(Data.drop_front(8)); + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleOverloadedMethod(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleOneMethod(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Attributes + // 4: Type + // if (isIntroVirtual) + // 8: VFTableOffset + // : Name + uint32_t Size = 8; + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + + uint16_t Attrs = support::endian::read16le(Data.drop_front(2).data()); + if (LLVM_UNLIKELY(isIntroVirtual(Attrs))) + Size += 4; + + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleNestedType(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleStaticDataMember(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleVirtualBaseClass(ArrayRef Data, uint32_t Offset, + bool IsIndirect, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Attrs + // 4: TypeIndex + // 8: TypeIndex + // 12: Encoded Integer + // : Encoded Integer + uint32_t Size = 12; + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 2}); + Size += getEncodedIntegerLength(Data.drop_front(Size)); + Size += getEncodedIntegerLength(Data.drop_front(Size)); + return Size; +} + +static uint32_t handleVFPtr(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8; +} + +static uint32_t handleListContinuation(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8; +} + +static void handleFieldList(ArrayRef Content, + SmallVectorImpl &Refs) { + uint32_t Offset = 0; + uint32_t ThisLen = 0; + while (!Content.empty()) { + TypeLeafKind Kind = + 
static_cast(support::endian::read16le(Content.data())); + switch (Kind) { + case LF_BCLASS: + ThisLen = handleBaseClass(Content, Offset, Refs); + break; + case LF_ENUMERATE: + ThisLen = handleEnumerator(Content, Offset, Refs); + break; + case LF_MEMBER: + ThisLen = handleDataMember(Content, Offset, Refs); + break; + case LF_METHOD: + ThisLen = handleOverloadedMethod(Content, Offset, Refs); + break; + case LF_ONEMETHOD: + ThisLen = handleOneMethod(Content, Offset, Refs); + break; + case LF_NESTTYPE: + ThisLen = handleNestedType(Content, Offset, Refs); + break; + case LF_STMEMBER: + ThisLen = handleStaticDataMember(Content, Offset, Refs); + break; + case LF_VBCLASS: + case LF_IVBCLASS: + ThisLen = + handleVirtualBaseClass(Content, Offset, Kind == LF_VBCLASS, Refs); + break; + case LF_VFUNCTAB: + ThisLen = handleVFPtr(Content, Offset, Refs); + break; + case LF_INDEX: + ThisLen = handleListContinuation(Content, Offset, Refs); + break; + default: + return; + } + Content = Content.drop_front(ThisLen); + Offset += ThisLen; + if (!Content.empty()) { + uint8_t Pad = Content.front(); + if (Pad >= LF_PAD0) { + uint32_t Skip = Pad & 0x0F; + Content = Content.drop_front(Skip); + Offset += Skip; + } + } + } +} + +static void handlePointer(ArrayRef Content, + SmallVectorImpl &Refs) { + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + + uint32_t Attrs = support::endian::read32le(Content.drop_front(4).data()); + if (isMemberPointer(Attrs)) + Refs.push_back({TiRefKind::TypeRef, 8, 1}); +} + +static void discoverTypeIndices(ArrayRef Content, TypeLeafKind Kind, + SmallVectorImpl &Refs) { + uint32_t Count; + // FIXME: In the future it would be nice if we could avoid hardcoding these + // values. One idea is to define some structures representing these types + // that would allow the use of offsetof(). 
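One way to picture the FIXME above: if the fixed-layout prefix of a record were described as a struct, offsetof() could supply the offsets that are hard-coded in the switch below. A standalone sketch under that assumption (the struct is purely illustrative, not an existing LLVM type):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical fixed-layout prefix of an LF_FUNC_ID record: an item index,
// then a type index, then a NUL-terminated name.
struct FuncIdPrefix {
  uint32_t ParentScope;  // item (IPI) index
  uint32_t FunctionType; // type (TPI) index
};

int main() {
  std::printf("index references at offsets %zu and %zu\n",
              offsetof(FuncIdPrefix, ParentScope),
              offsetof(FuncIdPrefix, FunctionType));
  return 0;
}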
+ switch (Kind) { + case TypeLeafKind::LF_FUNC_ID: + Refs.push_back({TiRefKind::IndexRef, 0, 1}); + Refs.push_back({TiRefKind::TypeRef, 4, 1}); + break; + case TypeLeafKind::LF_MFUNC_ID: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_STRING_ID: + Refs.push_back({TiRefKind::IndexRef, 0, 1}); + break; + case TypeLeafKind::LF_SUBSTR_LIST: + Count = support::endian::read32le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::IndexRef, 4, Count}); + break; + case TypeLeafKind::LF_BUILDINFO: + Count = support::endian::read16le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::IndexRef, 2, Count}); + break; + case TypeLeafKind::LF_UDT_SRC_LINE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + Refs.push_back({TiRefKind::IndexRef, 4, 1}); + break; + case TypeLeafKind::LF_UDT_MOD_SRC_LINE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_MODIFIER: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_PROCEDURE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + Refs.push_back({TiRefKind::TypeRef, 8, 1}); + break; + case TypeLeafKind::LF_MFUNCTION: + Refs.push_back({TiRefKind::TypeRef, 0, 3}); + Refs.push_back({TiRefKind::TypeRef, 16, 1}); + break; + case TypeLeafKind::LF_ARGLIST: + Count = support::endian::read32le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::TypeRef, 4, Count}); + break; + case TypeLeafKind::LF_ARRAY: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_CLASS: + case TypeLeafKind::LF_STRUCTURE: + case TypeLeafKind::LF_INTERFACE: + Refs.push_back({TiRefKind::TypeRef, 4, 3}); + break; + case TypeLeafKind::LF_UNION: + Refs.push_back({TiRefKind::TypeRef, 4, 1}); + break; + case TypeLeafKind::LF_ENUM: + Refs.push_back({TiRefKind::TypeRef, 4, 2}); + break; + case TypeLeafKind::LF_BITFIELD: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_VFTABLE: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_VTSHAPE: + break; + case TypeLeafKind::LF_METHODLIST: + handleMethodOverloadList(Content, Refs); + break; + case TypeLeafKind::LF_FIELDLIST: + handleFieldList(Content, Refs); + break; + case TypeLeafKind::LF_POINTER: + handlePointer(Content, Refs); + break; + default: + break; + } +} + +void llvm::codeview::discoverTypeIndices(const CVType &Type, + SmallVectorImpl &Refs) { + ::discoverTypeIndices(Type.content(), Type.kind(), Refs); +} + +void llvm::codeview::discoverTypeIndices(ArrayRef RecordData, + SmallVectorImpl &Refs) { + const RecordPrefix *P = + reinterpret_cast(RecordData.data()); + TypeLeafKind K = static_cast(uint16_t(P->RecordKind)); + ::discoverTypeIndices(RecordData.drop_front(sizeof(RecordPrefix)), K, Refs); +} diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp index 3b061e67e05e..93c1198e36ce 100644 --- a/lib/DebugInfo/CodeView/TypeSerializer.cpp +++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/TypeSerializer.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/BinaryStreamWriter.h" #include @@ -16,23 +17,111 @@ using namespace llvm; using namespace llvm::codeview; +namespace { +struct HashedType { + uint64_t Hash; + const uint8_t *Data; + unsigned Size; // FIXME: Go to uint16_t? + TypeIndex Index; +}; + +/// Wrapper around a poitner to a HashedType. Hash and equality operations are +/// based on data in the pointee. 
+struct HashedTypePtr { + HashedTypePtr() = default; + HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {} + HashedType *Ptr = nullptr; +}; +} // namespace + +namespace llvm { +template <> struct DenseMapInfo { + static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); } + static inline HashedTypePtr getTombstoneKey() { + return HashedTypePtr(reinterpret_cast(1)); + } + static unsigned getHashValue(HashedTypePtr Val) { + assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr); + return Val.Ptr->Hash; + } + static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) { + HashedType *LHS = LHSP.Ptr; + HashedType *RHS = RHSP.Ptr; + if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr) + return LHS == RHS; + if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size) + return false; + return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0; + } +}; +} + +/// Private implementation so that we don't leak our DenseMap instantiations to +/// users. +class llvm::codeview::TypeHasher { +private: + /// Storage for type record provided by the caller. Records will outlive the + /// hasher object, so they should be allocated here. + BumpPtrAllocator &RecordStorage; + + /// Storage for hash keys. These only need to live as long as the hashing + /// operation. + BumpPtrAllocator KeyStorage; + + /// Hash table. We really want a DenseMap, TypeIndex> here, + /// but DenseMap is inefficient when the keys are long (like type records) + /// because it recomputes the hash value of every key when it grows. This + /// value type stores the hash out of line in KeyStorage, so that table + /// entries are small and easy to rehash. + DenseSet HashedRecords; + +public: + TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {} + + void reset() { HashedRecords.clear(); } + + /// Takes the bytes of type record, inserts them into the hash table, saves + /// them, and returns a pointer to an identical stable type record along with + /// its type index in the destination stream. + TypeIndex getOrCreateRecord(ArrayRef &Record, TypeIndex TI); +}; + +TypeIndex TypeHasher::getOrCreateRecord(ArrayRef &Record, + TypeIndex TI) { + assert(Record.size() < UINT32_MAX && "Record too big"); + assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); + + // Compute the hash up front so we can store it in the key. + HashedType TempHashedType = {hash_value(Record), Record.data(), + unsigned(Record.size()), TI}; + auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType)); + HashedType *&Hashed = Result.first->Ptr; + + if (Result.second) { + // This was a new type record. We need stable storage for both the key and + // the record. The record should outlive the hashing operation. + Hashed = KeyStorage.Allocate(); + *Hashed = TempHashedType; + + uint8_t *Stable = RecordStorage.Allocate(Record.size()); + memcpy(Stable, Record.data(), Record.size()); + Hashed->Data = Stable; + assert(Hashed->Size == Record.size()); + } + + // Update the caller's copy of Record to point a stable copy. 
+ Record = ArrayRef(Hashed->Data, Hashed->Size); + return Hashed->Index; +} + +TypeIndex TypeSerializer::nextTypeIndex() const { + return TypeIndex::fromArrayIndex(SeenRecords.size()); +} + bool TypeSerializer::isInFieldList() const { return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST; } -TypeIndex TypeSerializer::calcNextTypeIndex() const { - if (LastTypeIndex.isNoneType()) - return TypeIndex(TypeIndex::FirstNonSimpleIndex); - else - return TypeIndex(LastTypeIndex.getIndex() + 1); -} - -TypeIndex TypeSerializer::incrementTypeIndex() { - TypeIndex Previous = LastTypeIndex; - LastTypeIndex = calcNextTypeIndex(); - return Previous; -} - MutableArrayRef TypeSerializer::getCurrentSubRecordData() { assert(isInFieldList()); return getCurrentRecordData().drop_front(CurrentSegment.length()); @@ -51,46 +140,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) { return Error::success(); } -TypeIndex -TypeSerializer::insertRecordBytesPrivate(MutableArrayRef Record) { - assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); - - StringRef S(reinterpret_cast(Record.data()), Record.size()); - - TypeIndex NextTypeIndex = calcNextTypeIndex(); - auto Result = HashedRecords.try_emplace(S, NextTypeIndex); - if (Result.second) { - LastTypeIndex = NextTypeIndex; - SeenRecords.push_back(Record); - } - return Result.first->getValue(); -} - -TypeIndex -TypeSerializer::insertRecordBytesWithCopy(CVType &Record, - MutableArrayRef Data) { - assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); - - StringRef S(reinterpret_cast(Data.data()), Data.size()); - - // Do a two state lookup / insert so that we don't have to allocate unless - // we're going - // to do an insert. This is a big memory savings. - auto Iter = HashedRecords.find(S); - if (Iter != HashedRecords.end()) - return Iter->second; - - LastTypeIndex = calcNextTypeIndex(); - uint8_t *Copy = RecordStorage.Allocate(Data.size()); - ::memcpy(Copy, Data.data(), Data.size()); - Data = MutableArrayRef(Copy, Data.size()); - S = StringRef(reinterpret_cast(Data.data()), Data.size()); - HashedRecords.insert(std::make_pair(S, LastTypeIndex)); - SeenRecords.push_back(Data); - Record.RecordData = Data; - return LastTypeIndex; -} - Expected> TypeSerializer::addPadding(MutableArrayRef Record) { uint32_t Align = Record.size() % 4; @@ -108,27 +157,79 @@ TypeSerializer::addPadding(MutableArrayRef Record) { return MutableArrayRef(Record.data(), Record.size() + N); } -TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage) - : RecordStorage(Storage), LastTypeIndex(), - RecordBuffer(MaxRecordLength * 2), +TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage, bool Hash) + : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2), Stream(RecordBuffer, llvm::support::little), Writer(Stream), Mapping(Writer) { // RecordBuffer needs to be able to hold enough data so that if we are 1 // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes, // we won't overflow. 
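The hashing scheme above amounts to content-addressed deduplication with the hash stored out of line, so growing the table never re-reads the record bytes. A simplified standalone analogue using only the standard library (container choices and names are illustrative; the code above uses a pointer-keyed DenseSet plus a BumpPtrAllocator):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <deque>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>

struct HashedRec {
  size_t Hash;         // precomputed; table growth never re-hashes the bytes
  const uint8_t *Data; // canonical, stable copy of the record
  size_t Size;
  unsigned Index;      // index assigned to the canonical copy
};

struct RecHash {
  size_t operator()(const HashedRec *R) const { return R->Hash; }
};
struct RecEq {
  bool operator()(const HashedRec *A, const HashedRec *B) const {
    return A->Size == B->Size && std::memcmp(A->Data, B->Data, A->Size) == 0;
  }
};

class RecordDeduper {
  std::deque<std::string> Storage; // stable byte storage (deque never moves)
  std::deque<HashedRec> Keys;
  std::unordered_set<const HashedRec *, RecHash, RecEq> Seen;

public:
  // Returns the index of the first record with identical bytes, inserting a
  // stable copy if this content has not been seen before.
  unsigned getOrCreate(const uint8_t *Data, size_t Size) {
    size_t H = std::hash<std::string_view>{}(
        std::string_view(reinterpret_cast<const char *>(Data), Size));
    HashedRec Probe{H, Data, Size, unsigned(Storage.size())};
    auto It = Seen.find(&Probe);
    if (It != Seen.end())
      return (*It)->Index; // duplicate: reuse the earlier index
    Storage.emplace_back(reinterpret_cast<const char *>(Data), Size);
    Keys.push_back({H, reinterpret_cast<const uint8_t *>(Storage.back().data()),
                    Size, Probe.Index});
    Seen.insert(&Keys.back());
    return Probe.Index;
  }
};

int main() {
  RecordDeduper D;
  const uint8_t A[] = {1, 2, 3, 4}, B[] = {5, 6, 7, 8};
  unsigned I0 = D.getOrCreate(A, 4); // 0: first time this content is seen
  unsigned I1 = D.getOrCreate(B, 4); // 1: different content, new index
  unsigned I2 = D.getOrCreate(A, 4); // 0: identical bytes, deduplicated
  std::printf("%u %u %u\n", I0, I1, I2);
  return 0;
}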
+ if (Hash) + Hasher = make_unique(Storage); } -ArrayRef> TypeSerializer::records() const { +TypeSerializer::~TypeSerializer() = default; + +ArrayRef> TypeSerializer::records() const { return SeenRecords; } -TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; } +void TypeSerializer::reset() { + if (Hasher) + Hasher->reset(); + Writer.setOffset(0); + CurrentSegment = RecordSegment(); + FieldListSegments.clear(); + TypeKind.reset(); + MemberKind.reset(); + SeenRecords.clear(); +} -TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef Record) { +TypeIndex TypeSerializer::insertRecordBytes(ArrayRef &Record) { assert(!TypeKind.hasValue() && "Already in a type mapping!"); assert(Writer.getOffset() == 0 && "Stream has data already!"); - return insertRecordBytesPrivate(Record); + if (Hasher) { + TypeIndex ActualTI = Hasher->getOrCreateRecord(Record, nextTypeIndex()); + if (nextTypeIndex() == ActualTI) + SeenRecords.push_back(Record); + return ActualTI; + } + + TypeIndex NewTI = nextTypeIndex(); + uint8_t *Stable = RecordStorage.Allocate(Record.size()); + memcpy(Stable, Record.data(), Record.size()); + Record = ArrayRef(Stable, Record.size()); + SeenRecords.push_back(Record); + return NewTI; +} + +TypeIndex TypeSerializer::insertRecord(const RemappedType &Record) { + assert(!TypeKind.hasValue() && "Already in a type mapping!"); + assert(Writer.getOffset() == 0 && "Stream has data already!"); + + TypeIndex TI; + ArrayRef OriginalData = Record.OriginalRecord.RecordData; + if (Record.Mappings.empty()) { + // This record did not remap any type indices. Just write it. + return insertRecordBytes(OriginalData); + } + + // At least one type index was remapped. Before we can hash it we have to + // copy the full record bytes, re-write each type index, then hash the copy. + // We do this in temporary storage since only the DenseMap can decide whether + // this record already exists, and if it does we don't want the memory to + // stick around. + RemapStorage.resize(OriginalData.size()); + ::memcpy(&RemapStorage[0], OriginalData.data(), OriginalData.size()); + uint8_t *ContentBegin = RemapStorage.data() + sizeof(RecordPrefix); + for (const auto &M : Record.Mappings) { + // First 4 bytes of every record are the record prefix, but the mapping + // offset is relative to the content which starts after. + *(TypeIndex *)(ContentBegin + M.first) = M.second; + } + auto RemapRef = makeArrayRef(RemapStorage); + return insertRecordBytes(RemapRef); } Error TypeSerializer::visitTypeBegin(CVType &Record) { @@ -163,8 +264,13 @@ Expected TypeSerializer::visitTypeEndGetIndex(CVType &Record) { Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t); Record.Type = *TypeKind; - TypeIndex InsertedTypeIndex = - insertRecordBytesWithCopy(Record, ThisRecordData); + Record.RecordData = ThisRecordData; + + // insertRecordBytes assumes we're not in a mapping, so do this first. + TypeKind.reset(); + Writer.setOffset(0); + + TypeIndex InsertedTypeIndex = insertRecordBytes(Record.RecordData); // Write out each additional segment in reverse order, and update each // record's continuation index to point to the previous one. 
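The reverse-order loop described by the comment above exists because an overlong field list is emitted as several segments, and each segment's continuation placeholder (asserted as 0xB0C0B0C0 just below) must hold the type index of the segment that logically follows it, so that segment has to be inserted first. A standalone sketch of the back-to-front linking (indices and the placeholder handling are simplified):

#include <cstdio>
#include <string>
#include <vector>

struct Segment {
  std::string Payload;
  int Continuation = -1; // index of the segment that logically follows, if any
};

// Insert segments last-to-first so each one can point at the record that was
// just written, mirroring how the serializer patches its placeholder with the
// previously inserted type index.
static int emitFieldList(std::vector<Segment> Segments,
                         std::vector<Segment> &Stream) {
  int PrevIndex = -1;
  for (auto It = Segments.rbegin(); It != Segments.rend(); ++It) {
    It->Continuation = PrevIndex;
    Stream.push_back(*It);
    PrevIndex = int(Stream.size()) - 1;
  }
  return PrevIndex; // index of the first segment, i.e. the record's real index
}

int main() {
  std::vector<Segment> Stream;
  int Head = emitFieldList({{"members 0-99"}, {"members 100-150"}}, Stream);
  std::printf("head record at %d, continues at %d\n", Head,
              Stream[Head].Continuation);
  return 0;
}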
@@ -174,11 +280,9 @@ Expected TypeSerializer::visitTypeEndGetIndex(CVType &Record) { reinterpret_cast(CIBytes.data()); assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder"); *CI = InsertedTypeIndex.getIndex(); - InsertedTypeIndex = insertRecordBytesPrivate(X); + InsertedTypeIndex = insertRecordBytes(X); } - TypeKind.reset(); - Writer.setOffset(0); FieldListSegments.clear(); CurrentSegment.SubRecords.clear(); diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 46747f8eab99..71a0966df036 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -11,7 +11,9 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -57,37 +59,56 @@ namespace { /// looking at the record kind. class TypeStreamMerger : public TypeVisitorCallbacks { public: - TypeStreamMerger(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, - SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler) - : DestIdStream(DestIdStream), DestTypeStream(DestTypeStream), - FieldListBuilder(DestTypeStream), Handler(Handler), - IndexMap(SourceToDest) {} + explicit TypeStreamMerger(SmallVectorImpl &SourceToDest, + TypeServerHandler *Handler) + : Handler(Handler), IndexMap(SourceToDest) { + SourceToDest.clear(); + } static const TypeIndex Untranslated; -/// TypeVisitorCallbacks overrides. -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownRecord(CVType &CVR, Name##Record &Record) override; -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" - - Error visitUnknownType(CVType &Record) override; - Error visitTypeBegin(CVType &Record) override; Error visitTypeEnd(CVType &Record) override; - Error visitMemberEnd(CVMemberRecord &Record) override; - Error mergeStream(const CVTypeArray &Types); + Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, + const CVTypeArray &IdsAndTypes); + Error mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + const CVTypeArray &Ids); + Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types); private: + Error doit(const CVTypeArray &Types); + void addMapping(TypeIndex Idx); - bool remapIndex(TypeIndex &Idx); + bool remapTypeIndex(TypeIndex &Idx); + bool remapItemIndex(TypeIndex &Idx); + + bool remapIndices(RemappedType &Record, ArrayRef Refs) { + auto OriginalData = Record.OriginalRecord.content(); + bool Success = true; + for (auto &Ref : Refs) { + uint32_t Offset = Ref.Offset; + ArrayRef Bytes = + OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); + ArrayRef TIs(reinterpret_cast(Bytes.data()), + Ref.Count); + for (auto TI : TIs) { + TypeIndex NewTI = TI; + bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) + ? 
remapItemIndex(NewTI) + : remapTypeIndex(NewTI); + if (ThisSuccess && NewTI != TI) + Record.Mappings.emplace_back(Offset, NewTI); + Offset += sizeof(TypeIndex); + Success &= ThisSuccess; + } + } + return Success; + } + + bool remapIndex(TypeIndex &Idx, ArrayRef Map); size_t slotForIndex(TypeIndex Idx) const { assert(!Idx.isSimple() && "simple type indices have no slots"); @@ -98,50 +119,46 @@ private: return llvm::make_error(cv_error_code::corrupt_record); } - template - Error writeRecord(RecordType &R, bool RemapSuccess) { + Error writeRecord(TypeTableBuilder &Dest, const RemappedType &Record, + bool RemapSuccess) { TypeIndex DestIdx = Untranslated; if (RemapSuccess) - DestIdx = DestTypeStream.writeKnownType(R); + DestIdx = Dest.writeSerializedRecord(Record); addMapping(DestIdx); return Error::success(); } - template - Error writeIdRecord(RecordType &R, bool RemapSuccess) { - TypeIndex DestIdx = Untranslated; - if (RemapSuccess) - DestIdx = DestIdStream.writeKnownType(R); + Error writeTypeRecord(const CVType &Record) { + TypeIndex DestIdx = + DestTypeStream->writeSerializedRecord(Record.RecordData); addMapping(DestIdx); return Error::success(); } - template - Error writeMember(RecordType &R, bool RemapSuccess) { - if (RemapSuccess) - FieldListBuilder.writeMemberType(R); - else - HadUntranslatedMember = true; - return Error::success(); + Error writeTypeRecord(const RemappedType &Record, bool RemapSuccess) { + return writeRecord(*DestTypeStream, Record, RemapSuccess); + } + + Error writeIdRecord(const RemappedType &Record, bool RemapSuccess) { + return writeRecord(*DestIdStream, Record, RemapSuccess); } Optional LastError; bool IsSecondPass = false; - bool HadUntranslatedMember = false; - unsigned NumBadIndices = 0; - BumpPtrAllocator Allocator; - - TypeTableBuilder &DestIdStream; - TypeTableBuilder &DestTypeStream; - FieldListRecordBuilder FieldListBuilder; - TypeServerHandler *Handler; - TypeIndex CurIndex{TypeIndex::FirstNonSimpleIndex}; + TypeTableBuilder *DestIdStream = nullptr; + TypeTableBuilder *DestTypeStream = nullptr; + TypeServerHandler *Handler = nullptr; + + // If we're only mapping id records, this array contains the mapping for + // type records. + ArrayRef TypeLookup; + /// Map from source type index to destination type index. Indexed by source /// type index minus 0x1000. 
SmallVectorImpl &IndexMap; @@ -151,22 +168,34 @@ private: const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated); -Error TypeStreamMerger::visitTypeBegin(CVRecord &Rec) { +Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { + RemappedType R(Rec); + SmallVector Refs; + discoverTypeIndices(Rec.RecordData, Refs); + bool Success = remapIndices(R, Refs); + switch (Rec.kind()) { + case TypeLeafKind::LF_FUNC_ID: + case TypeLeafKind::LF_MFUNC_ID: + case TypeLeafKind::LF_STRING_ID: + case TypeLeafKind::LF_SUBSTR_LIST: + case TypeLeafKind::LF_BUILDINFO: + case TypeLeafKind::LF_UDT_SRC_LINE: + case TypeLeafKind::LF_UDT_MOD_SRC_LINE: + return writeIdRecord(R, Success); + default: + return writeTypeRecord(R, Success); + } return Error::success(); } -Error TypeStreamMerger::visitTypeEnd(CVRecord &Rec) { - CurIndex = TypeIndex(CurIndex.getIndex() + 1); +Error TypeStreamMerger::visitTypeEnd(CVType &Rec) { + ++CurIndex; if (!IsSecondPass) assert(IndexMap.size() == slotForIndex(CurIndex) && "visitKnownRecord should add one index map entry"); return Error::success(); } -Error TypeStreamMerger::visitMemberEnd(CVMemberRecord &Rec) { - return Error::success(); -} - void TypeStreamMerger::addMapping(TypeIndex Idx) { if (!IsSecondPass) { assert(IndexMap.size() == slotForIndex(CurIndex) && @@ -178,7 +207,7 @@ void TypeStreamMerger::addMapping(TypeIndex Idx) { } } -bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { +bool TypeStreamMerger::remapIndex(TypeIndex &Idx, ArrayRef Map) { // Simple types are unchanged. if (Idx.isSimple()) return true; @@ -187,14 +216,14 @@ bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { // successfully. If it refers to a type later in the stream or a record we // had to defer, defer it until later pass. unsigned MapPos = slotForIndex(Idx); - if (MapPos < IndexMap.size() && IndexMap[MapPos] != Untranslated) { - Idx = IndexMap[MapPos]; + if (MapPos < Map.size() && Map[MapPos] != Untranslated) { + Idx = Map[MapPos]; return true; } // If this is the second pass and this index isn't in the map, then it points // outside the current type stream, and this is a corrupt record. - if (IsSecondPass && MapPos >= IndexMap.size()) { + if (IsSecondPass && MapPos >= Map.size()) { // FIXME: Print a more useful error. We can give the current record and the // index that we think its pointing to. LastError = joinErrors(std::move(*LastError), errorCorruptRecord()); @@ -208,241 +237,61 @@ bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { return false; } -//----------------------------------------------------------------------------// -// Item records -//----------------------------------------------------------------------------// +bool TypeStreamMerger::remapTypeIndex(TypeIndex &Idx) { + // If we're mapping a pure index stream, then IndexMap only contains mappings + // from OldIdStream -> NewIdStream, in which case we will need to use the + // special mapping from OldTypeStream -> NewTypeStream which was computed + // externally. Regardless, we use this special map if and only if we are + // doing an id-only mapping. 
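A compact standalone model of the remapping rule these helpers share (constants are illustrative; the real code uses TypeIndex and records an error rather than returning a plain bool): indices below the first non-simple index pass through unchanged, everything else is looked up in a source-to-destination table indexed by (index - 0x1000), and a miss marks a forward reference to be resolved on the second pass.

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t FirstNonSimpleIndex = 0x1000;
constexpr uint32_t Untranslated = 0xFFFFFFFF; // stand-in for "not yet mapped"

static bool remapIndex(uint32_t &Idx, const std::vector<uint32_t> &Map) {
  if (Idx < FirstNonSimpleIndex)
    return true; // simple (built-in) types keep their index
  uint32_t Slot = Idx - FirstNonSimpleIndex;
  if (Slot < Map.size() && Map[Slot] != Untranslated) {
    Idx = Map[Slot];
    return true;
  }
  return false; // forward reference or bad index: retry on the second pass
}

int main() {
  // Source records 0x1000 and 0x1001 already landed at 0x1000 and 0x1003;
  // source record 0x1002 has not been translated yet.
  std::vector<uint32_t> Map = {0x1000, 0x1003, Untranslated};
  uint32_t Simple = 0x74, Known = 0x1001, Forward = 0x1002;
  bool OkSimple = remapIndex(Simple, Map);   // true, index unchanged
  bool OkKnown = remapIndex(Known, Map);     // true, rewritten to 0x1003
  bool OkForward = remapIndex(Forward, Map); // false, deferred to pass two
  std::printf("%d %d %d -> 0x%x 0x%x 0x%x\n", OkSimple, OkKnown, OkForward,
              Simple, Known, Forward);
  return 0;
}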
+ if (DestTypeStream == nullptr) + return remapIndex(Idx, TypeLookup); -Error TypeStreamMerger::visitKnownRecord(CVType &, FuncIdRecord &R) { - bool Success = true; - Success &= remapIndex(R.ParentScope); - Success &= remapIndex(R.FunctionType); - return writeIdRecord(R, Success); + assert(TypeLookup.empty()); + return remapIndex(Idx, IndexMap); } -Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFuncIdRecord &R) { - bool Success = true; - Success &= remapIndex(R.ClassType); - Success &= remapIndex(R.FunctionType); - return writeIdRecord(R, Success); +bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) { + assert(DestIdStream); + return remapIndex(Idx, IndexMap); } -Error TypeStreamMerger::visitKnownRecord(CVType &, StringIdRecord &R) { - return writeIdRecord(R, remapIndex(R.Id)); +Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, + const CVTypeArray &Types) { + DestTypeStream = &Dest; + + return doit(Types); } -Error TypeStreamMerger::visitKnownRecord(CVType &, StringListRecord &R) { - bool Success = true; - for (TypeIndex &Str : R.StringIndices) - Success &= remapIndex(Str); - return writeIdRecord(R, Success); +Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + const CVTypeArray &Ids) { + DestIdStream = &Dest; + TypeLookup = TypeSourceToDest; + + return doit(Ids); } -Error TypeStreamMerger::visitKnownRecord(CVType &, BuildInfoRecord &R) { - bool Success = true; - for (TypeIndex &Arg : R.ArgIndices) - Success &= remapIndex(Arg); - return writeIdRecord(R, Success); +Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds, + TypeTableBuilder &DestTypes, + const CVTypeArray &IdsAndTypes) { + DestIdStream = &DestIds; + DestTypeStream = &DestTypes; + + return doit(IdsAndTypes); } -Error TypeStreamMerger::visitKnownRecord(CVType &, UdtSourceLineRecord &R) { - bool Success = true; - Success &= remapIndex(R.UDT); - Success &= remapIndex(R.SourceFile); - // FIXME: Translate UdtSourceLineRecord into UdtModSourceLineRecords in the - // IPI stream. 
- return writeIdRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, UdtModSourceLineRecord &R) { - bool Success = true; - Success &= remapIndex(R.UDT); - Success &= remapIndex(R.SourceFile); - return writeIdRecord(R, Success); -} - -//----------------------------------------------------------------------------// -// Type records -//----------------------------------------------------------------------------// - -Error TypeStreamMerger::visitKnownRecord(CVType &, ModifierRecord &R) { - return writeRecord(R, remapIndex(R.ModifiedType)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ProcedureRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReturnType); - Success &= remapIndex(R.ArgumentList); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFunctionRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReturnType); - Success &= remapIndex(R.ClassType); - Success &= remapIndex(R.ThisType); - Success &= remapIndex(R.ArgumentList); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &Type, ArgListRecord &R) { - bool Success = true; - for (TypeIndex &Arg : R.ArgIndices) - Success &= remapIndex(Arg); - if (auto EC = writeRecord(R, Success)) - return EC; - return Error::success(); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, PointerRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReferentType); - if (R.isPointerToMember()) - Success &= remapIndex(R.MemberInfo->ContainingType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ArrayRecord &R) { - bool Success = true; - Success &= remapIndex(R.ElementType); - Success &= remapIndex(R.IndexType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ClassRecord &R) { - bool Success = true; - Success &= remapIndex(R.FieldList); - Success &= remapIndex(R.DerivationList); - Success &= remapIndex(R.VTableShape); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, UnionRecord &R) { - return writeRecord(R, remapIndex(R.FieldList)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, EnumRecord &R) { - bool Success = true; - Success &= remapIndex(R.FieldList); - Success &= remapIndex(R.UnderlyingType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, BitFieldRecord &R) { - return writeRecord(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableShapeRecord &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, TypeServer2Record &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, LabelRecord &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableRecord &R) { - bool Success = true; - Success &= remapIndex(R.CompleteClass); - Success &= remapIndex(R.OverriddenVFTable); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, - MethodOverloadListRecord &R) { - bool Success = true; - for (OneMethodRecord &Meth : R.Methods) - Success &= remapIndex(Meth.Type); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) { - // Visit the members inside the field list. 
- HadUntranslatedMember = false; - FieldListBuilder.begin(); - if (auto EC = codeview::visitMemberRecordStream(R.Data, *this)) - return EC; - - // Write the record if we translated all field list members. - TypeIndex DestIdx = Untranslated; - if (!HadUntranslatedMember) - DestIdx = FieldListBuilder.end(); - else - FieldListBuilder.reset(); - addMapping(DestIdx); - - return Error::success(); -} - -//----------------------------------------------------------------------------// -// Member records -//----------------------------------------------------------------------------// - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - NestedTypeRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, OneMethodRecord &R) { - bool Success = true; - Success &= remapIndex(R.Type); - return writeMember(R, Success); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - OverloadedMethodRecord &R) { - return writeMember(R, remapIndex(R.MethodList)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - DataMemberRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - StaticDataMemberRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - EnumeratorRecord &R) { - return writeMember(R, true); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, VFPtrRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, BaseClassRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - VirtualBaseClassRecord &R) { - bool Success = true; - Success &= remapIndex(R.BaseType); - Success &= remapIndex(R.VBPtrType); - return writeMember(R, Success); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - ListContinuationRecord &R) { - return writeMember(R, remapIndex(R.ContinuationIndex)); -} - -Error TypeStreamMerger::visitUnknownType(CVType &Rec) { - // We failed to translate a type. Translate this index as "not translated". - addMapping(TypeIndex(SimpleTypeKind::NotTranslated)); - return errorCorruptRecord(); -} - -Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { - assert(IndexMap.empty()); +Error TypeStreamMerger::doit(const CVTypeArray &Types) { LastError = Error::success(); - if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) + // We don't want to deserialize records. I guess this flag is poorly named, + // but it really means "Don't deserialize records before switching on the + // concrete type. + // FIXME: We can probably get even more speed here if we don't use the visitor + // pipeline here, but instead write the switch ourselves. I don't think it + // would buy us much since it's already pretty fast, but it's probably worth + // a few cycles. 
+ if (auto EC = + codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) return EC; // If we found bad indices but no other errors, try doing another pass and see @@ -458,7 +307,8 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) + if (auto EC = + codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) return EC; assert(NumBadIndices <= BadIndicesRemaining && @@ -469,18 +319,32 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { } } - IndexMap.clear(); - Error Ret = std::move(*LastError); LastError.reset(); return Ret; } -Error llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, +Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, - const CVTypeArray &Types) { - return TypeStreamMerger(DestIdStream, DestTypeStream, SourceToDest, Handler) - .mergeStream(Types); + const CVTypeArray &Types) { + TypeStreamMerger M(SourceToDest, Handler); + return M.mergeTypeRecords(Dest, Types); +} + +Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + SmallVectorImpl &SourceToDest, + const CVTypeArray &Ids) { + TypeStreamMerger M(SourceToDest, nullptr); + return M.mergeIdRecords(Dest, TypeSourceToDest, Ids); +} + +Error llvm::codeview::mergeTypeAndIdRecords( + TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, + SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, + const CVTypeArray &IdsAndTypes) { + + TypeStreamMerger M(SourceToDest, Handler); + return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes); } diff --git a/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/lib/DebugInfo/CodeView/TypeTableCollection.cpp index a18710d6ab52..699694fde928 100644 --- a/lib/DebugInfo/CodeView/TypeTableCollection.cpp +++ b/lib/DebugInfo/CodeView/TypeTableCollection.cpp @@ -24,8 +24,7 @@ static void error(Error &&EC) { consumeError(std::move(EC)); } -TypeTableCollection::TypeTableCollection( - ArrayRef> Records) +TypeTableCollection::TypeTableCollection(ArrayRef> Records) : Records(Records), Database(Records.size()) {} Optional TypeTableCollection::getFirst() { diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 8e7c6c43d1a2..5ed55ce4c0dc 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -60,12 +60,15 @@ typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind; uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs) { + uint32_t *Off, const RelocAddrMap *Relocs, + uint64_t *SectionIndex) { if (!Relocs) return Data.getUnsigned(Off, Size); RelocAddrMap::const_iterator AI = Relocs->find(*Off); if (AI == Relocs->end()) return Data.getUnsigned(Off, Size); + if (SectionIndex) + *SectionIndex = AI->second.SectionIndex; return Data.getUnsigned(Off, Size) + AI->second.Value; } @@ -287,6 +290,15 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, getStringSection(), isLittleEndian()); } +DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { + // FIXME: Improve this for the case where this DWO file is really a DWP file + // with an index - use the index for lookup instead of a linear search. 
+ for (const auto &DWOCU : dwo_compile_units()) + if (DWOCU->getDWOId() == Hash) + return DWOCU.get(); + return nullptr; +} + DWARFDie DWARFContext::getDIEForOffset(uint32_t Offset) { parseCompileUnits(); if (auto *CU = CUs.getUnitForOffset(Offset)) @@ -897,28 +909,81 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address, return InliningInfo; } +std::shared_ptr +DWARFContext::getDWOContext(StringRef AbsolutePath) { + if (auto S = DWP.lock()) { + DWARFContext *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); + } + + std::weak_ptr *Entry = &DWOFiles[AbsolutePath]; + + if (auto S = Entry->lock()) { + DWARFContext *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); + } + + SmallString<128> DWPName; + Expected> Obj = [&] { + if (!CheckedForDWP) { + (getFileName() + ".dwp").toVector(DWPName); + auto Obj = object::ObjectFile::createObjectFile(DWPName); + if (Obj) { + Entry = &DWP; + return Obj; + } else { + CheckedForDWP = true; + // TODO: Should this error be handled (maybe in a high verbosity mode) + // before falling back to .dwo files? + consumeError(Obj.takeError()); + } + } + + return object::ObjectFile::createObjectFile(AbsolutePath); + }(); + + if (!Obj) { + // TODO: Actually report errors helpfully. + consumeError(Obj.takeError()); + return nullptr; + } + + auto S = std::make_shared(); + S->File = std::move(Obj.get()); + S->Context = llvm::make_unique(*S->File.getBinary()); + *Entry = S; + auto *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); +} + static Error createError(const Twine &Reason, llvm::Error E) { return make_error(Reason + toString(std::move(E)), inconvertibleErrorCode()); } -/// Returns the address of symbol relocation used against. Used for futher -/// relocations computation. Symbol's section load address is taken in account if -/// LoadedObjectInfo interface is provided. -static Expected -getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, - const LoadedObjectInfo *L, - std::map &Cache) { - uint64_t Ret = 0; +/// SymInfo contains information about symbol: it's address +/// and section index which is -1LL for absolute symbols. +struct SymInfo { + uint64_t Address; + uint64_t SectionIndex; +}; + +/// Returns the address of symbol relocation used against and a section index. +/// Used for futher relocations computation. 
Symbol's section load address is +static Expected getSymbolInfo(const object::ObjectFile &Obj, + const RelocationRef &Reloc, + const LoadedObjectInfo *L, + std::map &Cache) { + SymInfo Ret = {0, (uint64_t)-1LL}; object::section_iterator RSec = Obj.section_end(); object::symbol_iterator Sym = Reloc.getSymbol(); - std::map::iterator CacheIt = Cache.end(); + std::map::iterator CacheIt = Cache.end(); // First calculate the address of the symbol or section as it appears // in the object file if (Sym != Obj.symbol_end()) { bool New; - std::tie(CacheIt, New) = Cache.insert({*Sym, 0}); + std::tie(CacheIt, New) = Cache.insert({*Sym, {0, 0}}); if (!New) return CacheIt->second; @@ -934,12 +999,15 @@ getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, SectOrErr.takeError()); RSec = *SectOrErr; - Ret = *SymAddrOrErr; + Ret.Address = *SymAddrOrErr; } else if (auto *MObj = dyn_cast(&Obj)) { RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); - Ret = RSec->getAddress(); + Ret.Address = RSec->getAddress(); } + if (RSec != Obj.section_end()) + Ret.SectionIndex = RSec->getIndex(); + // If we are given load addresses for the sections, we need to adjust: // SymAddr = (Address of Symbol Or Section in File) - // (Address of Section in File) + @@ -949,7 +1017,7 @@ getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, // we need to perform the same computation. if (L && RSec != Obj.section_end()) if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec)) - Ret += SectionLoadAddress - RSec->getAddress(); + Ret.Address += SectionLoadAddress - RSec->getAddress(); if (CacheIt != Cache.end()) CacheIt->second = Ret; @@ -989,8 +1057,8 @@ Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec, } DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, - const LoadedObjectInfo *L) - : IsLittleEndian(Obj.isLittleEndian()), + const LoadedObjectInfo *L) + : FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()), AddressSize(Obj.getBytesInAddress()) { for (const SectionRef &Section : Obj.sections()) { StringRef name; @@ -1008,7 +1076,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // Try to obtain an already relocated version of this section. // Else use the unrelocated section from the object file. We'll have to // apply relocations ourselves later. - if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) + if (!L || !L->getLoadedSectionContents(*RelocatedSection, data)) Section.getContents(data); if (auto Err = maybeDecompress(Section, name, data)) { @@ -1047,7 +1115,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // If the section we're relocating was relocated already by the JIT, // then we used the relocated version above, so we do not need to process // relocations for it now. - if (L && L->getLoadedSectionContents(*RelocatedSection,RelSecData)) + if (L && L->getLoadedSectionContents(*RelocatedSection, RelSecData)) continue; // In Mach-o files, the relocations do not need to be applied if @@ -1091,29 +1159,30 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (Section.relocation_begin() == Section.relocation_end()) continue; - std::map AddrCache; + // Symbol to [address, section index] cache mapping. + std::map AddrCache; for (const RelocationRef &Reloc : Section.relocations()) { // FIXME: it's not clear how to correctly handle scattered // relocations. 
if (isRelocScattered(Obj, Reloc)) continue; - Expected SymAddrOrErr = - getSymbolAddress(Obj, Reloc, L, AddrCache); - if (!SymAddrOrErr) { - errs() << toString(SymAddrOrErr.takeError()) << '\n'; + Expected SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache); + if (!SymInfoOrErr) { + errs() << toString(SymInfoOrErr.takeError()) << '\n'; continue; } object::RelocVisitor V(Obj); - uint64_t Val = V.visit(Reloc.getType(), Reloc, *SymAddrOrErr); + uint64_t Val = V.visit(Reloc.getType(), Reloc, SymInfoOrErr->Address); if (V.error()) { SmallString<32> Name; Reloc.getTypeName(Name); errs() << "error: failed to compute relocation: " << Name << "\n"; continue; } - Map->insert({Reloc.getOffset(), {Val}}); + llvm::RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; + Map->insert({Reloc.getOffset(), Rel}); } } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 8da797750abd..6b5e1d3c931b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -35,8 +35,8 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, while (true) { RangeListEntry entry; uint32_t prev_offset = *offset_ptr; - entry.StartAddress = - getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + entry.StartAddress = getRelocatedValue(data, AddressSize, offset_ptr, + &Relocs, &entry.SectionIndex); entry.EndAddress = getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); @@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const { if (RLE.isBaseAddressSelectionEntry(AddressSize)) { BaseAddress = RLE.EndAddress; } else { - Res.push_back( - {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress}); + Res.push_back({BaseAddress + RLE.StartAddress, + BaseAddress + RLE.EndAddress, RLE.SectionIndex}); } } return Res; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index e3bd759ba94b..fd45c77d3745 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -211,13 +211,16 @@ Optional DWARFDie::getHighPC(uint64_t LowPC) const { return None; } -bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const { - auto LowPcAddr = toAddress(find(DW_AT_low_pc)); +bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC, + uint64_t &SectionIndex) const { + auto F = find(DW_AT_low_pc); + auto LowPcAddr = toAddress(F); if (!LowPcAddr) return false; if (auto HighPcAddr = getHighPC(*LowPcAddr)) { LowPC = *LowPcAddr; HighPC = *HighPcAddr; + SectionIndex = F->getSectionIndex(); return true; } return false; @@ -228,9 +231,9 @@ DWARFDie::getAddressRanges() const { if (isNULL()) return DWARFAddressRangesVector(); // Single range specified by low/high PC. - uint64_t LowPC, HighPC; - if (getLowAndHighPC(LowPC, HighPC)) - return {{LowPC, HighPC}}; + uint64_t LowPC, HighPC, Index; + if (getLowAndHighPC(LowPC, HighPC, Index)) + return {{LowPC, HighPC, Index}}; // Multiple ranges from .debug_ranges section. auto RangesOffset = toSectionOffset(find(DW_AT_ranges)); diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 1cbd3ea2c869..0963d7bfd713 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -333,8 +333,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, return false; uint16_t AddrSize = (Form == DW_FORM_addr) ? 
U->getAddressByteSize() : U->getRefAddrByteSize(); - Value.uval = - getRelocatedValue(Data, AddrSize, OffsetPtr, U->getRelocMap()); + Value.uval = getRelocatedValue(Data, AddrSize, OffsetPtr, + U->getRelocMap(), &Value.SectionIndex); break; } case DW_FORM_exprloc: diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index c268afc222c3..c5add6a478b3 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -249,23 +249,6 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { return DieArray.size(); } -DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath, uint64_t DWOId) { - auto Obj = object::ObjectFile::createObjectFile(DWOPath); - if (!Obj) { - // TODO: Actually report errors helpfully. - consumeError(Obj.takeError()); - return; - } - DWOFile = std::move(Obj.get()); - DWOContext.reset( - cast(new DWARFContextInMemory(*DWOFile.getBinary()))); - for (const auto &DWOCU : DWOContext->dwo_compile_units()) - if (DWOCU->getDWOId() == DWOId) { - DWOU = DWOCU.get(); - return; - } -} - bool DWARFUnit::parseDWO() { if (isDWO) return false; @@ -287,16 +270,18 @@ bool DWARFUnit::parseDWO() { auto DWOId = getDWOId(); if (!DWOId) return false; - DWO = llvm::make_unique(AbsolutePath, *DWOId); - DWARFUnit *DWOCU = DWO->getUnit(); - if (!DWOCU) { - DWO.reset(); + auto DWOContext = Context.getDWOContext(AbsolutePath); + if (!DWOContext) return false; - } + + DWARFCompileUnit *DWOCU = DWOContext->getDWOCompileUnitForHash(*DWOId); + if (!DWOCU) + return false; + DWO = std::shared_ptr(std::move(DWOContext), DWOCU); // Share .debug_addr and .debug_ranges section with compile unit in .dwo - DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase); + DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase); auto DWORangesBase = UnitDie.getRangesBaseAttribute(); - DWOCU->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); + DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); return true; } @@ -339,8 +324,8 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) { // Collect address ranges from DIEs in .dwo if necessary. bool DWOCreated = parseDWO(); - if (DWO.get()) - DWO->getUnit()->collectAddressRanges(CURanges); + if (DWO) + DWO->collectAddressRanges(CURanges); if (DWOCreated) DWO.reset(); @@ -400,7 +385,7 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address, // First, find the subroutine that contains the given address (the leaf // of inlined chain). DWARFDie SubroutineDIE = - (DWO ? DWO->getUnit() : this)->getSubroutineForAddress(Address); + (DWO ? 
DWO.get() : this)->getSubroutineForAddress(Address); while (SubroutineDIE) { if (SubroutineDIE.isSubroutineDIE()) diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp index 57953cfa338e..dfdeb8414212 100644 --- a/lib/DebugInfo/MSF/MappedBlockStream.cpp +++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp @@ -45,18 +45,17 @@ static Interval intersect(const Interval &I1, const Interval &I2) { std::min(I1.second, I2.second)); } -MappedBlockStream::MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, +MappedBlockStream::MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData) - : BlockSize(BlockSize), NumBlocks(NumBlocks), StreamLayout(Layout), - MsfData(MsfData) {} + : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData) {} std::unique_ptr -MappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks, +MappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData) { return llvm::make_unique>( - BlockSize, NumBlocks, Layout, MsfData); + BlockSize, Layout, MsfData); } std::unique_ptr MappedBlockStream::createIndexedStream( @@ -66,7 +65,7 @@ std::unique_ptr MappedBlockStream::createIndexedStream( SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; return llvm::make_unique>( - Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -75,7 +74,7 @@ MappedBlockStream::createDirectoryStream(const MSFLayout &Layout, MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -83,7 +82,7 @@ MappedBlockStream::createFpmStream(const MSFLayout &Layout, BinaryStreamRef MsfData) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, @@ -173,7 +172,7 @@ Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset, uint32_t First = Offset / BlockSize; uint32_t Last = First; - while (Last < NumBlocks - 1) { + while (Last < getNumBlocks() - 1) { if (StreamLayout.Blocks[Last] != StreamLayout.Blocks[Last + 1] - 1) break; ++Last; @@ -313,17 +312,16 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset, } WritableMappedBlockStream::WritableMappedBlockStream( - uint32_t BlockSize, uint32_t NumBlocks, const MSFStreamLayout &Layout, + uint32_t BlockSize, const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData) - : ReadInterface(BlockSize, NumBlocks, Layout, MsfData), - WriteInterface(MsfData) {} + : ReadInterface(BlockSize, Layout, MsfData), WriteInterface(MsfData) {} std::unique_ptr -WritableMappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks, +WritableMappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData) { return llvm::make_unique>( - BlockSize, NumBlocks, Layout, MsfData); + BlockSize, Layout, MsfData); } std::unique_ptr @@ -334,7 +332,7 @@ WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout, MSFStreamLayout SL; SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; - return createStream(Layout.SB->BlockSize, 
Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -343,7 +341,7 @@ WritableMappedBlockStream::createDirectoryStream( MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -351,7 +349,7 @@ WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index c19a2f0d3110..23c7456d7772 100644 --- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -129,16 +129,21 @@ uint32_t DbiStreamBuilder::calculateSectionMapStreamSize() const { return sizeof(SecMapHeader) + sizeof(SecMapEntry) * SectionMap.size(); } -uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const { - uint32_t Size = 0; - Size += sizeof(ulittle16_t); // NumModules - Size += sizeof(ulittle16_t); // NumSourceFiles - Size += ModiList.size() * sizeof(ulittle16_t); // ModIndices - Size += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts +uint32_t DbiStreamBuilder::calculateNamesOffset() const { + uint32_t Offset = 0; + Offset += sizeof(ulittle16_t); // NumModules + Offset += sizeof(ulittle16_t); // NumSourceFiles + Offset += ModiList.size() * sizeof(ulittle16_t); // ModIndices + Offset += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts uint32_t NumFileInfos = 0; for (const auto &M : ModiList) NumFileInfos += M->source_files().size(); - Size += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets + Offset += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets + return Offset; +} + +uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const { + uint32_t Size = calculateNamesOffset(); Size += calculateNamesBufferSize(); return alignTo(Size, sizeof(uint32_t)); } @@ -157,9 +162,8 @@ uint32_t DbiStreamBuilder::calculateDbgStreamsSize() const { Error DbiStreamBuilder::generateFileInfoSubstream() { uint32_t Size = calculateFileInfoSubstreamSize(); - uint32_t NameSize = calculateNamesBufferSize(); auto Data = Allocator.Allocate(Size); - uint32_t NamesOffset = Size - NameSize; + uint32_t NamesOffset = calculateNamesOffset(); FileInfoBuffer = MutableBinaryByteStream(MutableArrayRef(Data, Size), llvm::support::little); @@ -207,6 +211,9 @@ Error DbiStreamBuilder::generateFileInfoSubstream() { } } + if (auto EC = NameBufferWriter.padToAlignment(sizeof(uint32_t))) + return EC; + if (NameBufferWriter.bytesRemaining() > 0) return make_error(raw_error_code::invalid_format, "The names buffer contained unexpected data."); diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp index f00567db743e..9fd90102f72c 100644 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp @@ -47,7 +47,7 @@ void PDBTypeServerHandler::addSearchPath(StringRef Path) { if (Path.empty() || !sys::fs::is_directory(Path)) return; - SearchPaths.push_back(Path); + SearchPaths.insert(Path); } Expected @@ -57,7 
+57,13 @@ PDBTypeServerHandler::handleInternal(PDBFile &File, if (!ExpectedTpi) return ExpectedTpi.takeError(); - if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks)) + // For handling a type server, we should be using whatever the callback array + // was + // that is being used for the original file. We shouldn't allow the visitor + // to + // arbitrarily stick a deserializer in there. + if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks, + VDS_BytesExternal)) return std::move(EC); return true; @@ -80,13 +86,14 @@ Expected PDBTypeServerHandler::handle(TypeServer2Record &TS, cv_error_code::corrupt_record, "TypeServer2Record does not contain filename!"); - for (auto Path : SearchPaths) { - sys::path::append(Path, File); - if (!sys::fs::exists(Path)) + for (auto &Path : SearchPaths) { + SmallString<64> PathStr = Path.getKey(); + sys::path::append(PathStr, File); + if (!sys::fs::exists(PathStr)) continue; std::unique_ptr ThisSession; - if (auto EC = loadDataForPDB(PDB_ReaderType::Native, Path, ThisSession)) { + if (auto EC = loadDataForPDB(PDB_ReaderType::Native, PathStr, ThisSession)) { // It is not an error if this PDB fails to load, it just means that it // doesn't match and we should continue searching. ignoreErrors(std::move(EC)); diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index 8e0065873892..623afb371b50 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -8,7 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/TpiStream.h" + #include "llvm/ADT/iterator_range.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" @@ -104,6 +106,8 @@ Error TpiStream::reload() { HashStream = std::move(HS); } + Types = llvm::make_unique( + TypeRecords, getNumTypeRecords(), getTypeIndexOffsets()); return Error::success(); } diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index f454ae61d965..34f4017d9828 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -2525,6 +2525,9 @@ static std::string base_name(std::string &s) { ++p0; break; } + if (!isalpha(*p0) && !isdigit(*p0) && *p0 != '_') { + return std::string(); + } } return std::string(p0, pe); } @@ -2612,39 +2615,45 @@ static const char *parse_unnamed_type_name(const char *first, const char *last, first = t0 + 1; } break; case 'l': { + size_t lambda_pos = db.names.size(); db.names.push_back(std::string("'lambda'(")); const char *t0 = first + 2; if (first[2] == 'v') { db.names.back().first += ')'; ++t0; } else { - const char *t1 = parse_type(t0, last, db); - if (t1 == t0) { + bool is_first_it = true; + while (true) { + long k0 = static_cast(db.names.size()); + const char *t1 = parse_type(t0, last, db); + long k1 = static_cast(db.names.size()); + if (t1 == t0) + break; + if (k0 >= k1) + return first; + // If the call to parse_type above found a pack expansion + // substitution, then multiple names could have been + // inserted into the name table. Walk through the names, + // appending each onto the lambda's parameter list. 
+ std::for_each(db.names.begin() + k0, db.names.begin() + k1, + [&](typename C::sub_type::value_type &pair) { + if (pair.empty()) + return; + auto &lambda = db.names[lambda_pos].first; + if (!is_first_it) + lambda.append(", "); + is_first_it = false; + lambda.append(pair.move_full()); + }); + db.names.erase(db.names.begin() + k0, db.names.end()); + t0 = t1; + } + if (is_first_it) { if (!db.names.empty()) db.names.pop_back(); return first; } - if (db.names.size() < 2) - return first; - auto tmp = db.names.back().move_full(); - db.names.pop_back(); - db.names.back().first.append(tmp); - t0 = t1; - while (true) { - t1 = parse_type(t0, last, db); - if (t1 == t0) - break; - if (db.names.size() < 2) - return first; - tmp = db.names.back().move_full(); - db.names.pop_back(); - if (!tmp.empty()) { - db.names.back().first.append(", "); - db.names.back().first.append(tmp); - } - t0 = t1; - } - if (db.names.empty()) + if (db.names.empty() || db.names.size() - 1 != lambda_pos) return first; db.names.back().first.append(")"); } @@ -4030,6 +4039,8 @@ static const char *parse_encoding(const char *first, const char *last, C &db) { save_value sb(db.tag_templates); if (db.encoding_depth > 1) db.tag_templates = true; + save_value sp(db.parsed_ctor_dtor_cv); + db.parsed_ctor_dtor_cv = false; switch (*first) { case 'G': case 'T': @@ -4229,6 +4240,7 @@ template struct string_pair { template string_pair(const char (&s)[N]) : first(s, N - 1) {} size_t size() const { return first.size() + second.size(); } + bool empty() const { return first.empty() && second.empty(); } StrT full() const { return first + second; } StrT move_full() { return std::move(first) + std::move(second); } }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 660843765b3f..9ce3974529bb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -737,23 +737,23 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt16BE(LocalAddress, applyPPCha(Delta)); } break; case ELF::R_PPC64_ADDR32: { - int32_t Result = static_cast(Value + Addend); - if (SignExtend32<32>(Result) != Result) + int64_t Result = static_cast(Value + Addend); + if (SignExtend64<32>(Result) != Result) llvm_unreachable("Relocation R_PPC64_ADDR32 overflow"); writeInt32BE(LocalAddress, Result); } break; case ELF::R_PPC64_REL24: { uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); - int32_t delta = static_cast(Value - FinalAddress + Addend); - if (SignExtend32<26>(delta) != delta) + int64_t delta = static_cast(Value - FinalAddress + Addend); + if (SignExtend64<26>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL24 overflow"); // Generates a 'bl
' instruction writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); } break; case ELF::R_PPC64_REL32: { uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); - int32_t delta = static_cast(Value - FinalAddress + Addend); - if (SignExtend32<32>(delta) != delta) + int64_t delta = static_cast(Value - FinalAddress + Addend); + if (SignExtend64<32>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL32 overflow"); writeInt32BE(LocalAddress, delta); } break; @@ -1324,12 +1324,13 @@ RuntimeDyldELF::processRelocationRef( Obj.getPlatformFlags(AbiVariant); AbiVariant &= ELF::EF_PPC64_ABI; // A PPC branch relocation will need a stub function if the target is - // an external symbol (Symbol::ST_Unknown) or if the target address - // is not within the signed 24-bits branch address. + // an external symbol (either Value.SymbolName is set, or SymType is + // Symbol::ST_Unknown) or if the target address is not within the + // signed 24-bits branch address. SectionEntry &Section = Sections[SectionID]; uint8_t *Target = Section.getAddressWithOffset(Offset); bool RangeOverflow = false; - if (SymType != SymbolRef::ST_Unknown) { + if (!Value.SymbolName && SymType != SymbolRef::ST_Unknown) { if (AbiVariant != 2) { // In the ELFv1 ABI, a function call may point to the .opd entry, // so the final symbol value is calculated based on the relocation @@ -1344,21 +1345,19 @@ RuntimeDyldELF::processRelocationRef( } uint8_t *RelocTarget = Sections[Value.SectionID].getAddressWithOffset(Value.Addend); - int32_t delta = static_cast(Target - RelocTarget); + int64_t delta = static_cast(Target - RelocTarget); // If it is within 26-bits branch range, just set the branch target - if (SignExtend32<26>(delta) == delta) { + if (SignExtend64<26>(delta) == delta) { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); - if (Value.SymbolName) - addRelocationForSymbol(RE, Value.SymbolName); - else - addRelocationForSection(RE, Value.SectionID); + addRelocationForSection(RE, Value.SectionID); } else { RangeOverflow = true; } } - if (SymType == SymbolRef::ST_Unknown || RangeOverflow) { - // It is an external symbol (SymbolRef::ST_Unknown) or within a range - // larger than 24-bits. + if (Value.SymbolName || SymType == SymbolRef::ST_Unknown || + RangeOverflow) { + // It is an external symbol (either Value.SymbolName is set, or + // SymType is SymbolRef::ST_Unknown) or out of range. 
StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it @@ -1412,7 +1411,7 @@ RuntimeDyldELF::processRelocationRef( RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } - if (SymType == SymbolRef::ST_Unknown) { + if (Value.SymbolName || SymType == SymbolRef::ST_Unknown) { // Restore the TOC for external calls if (AbiVariant == 2) writeInt32BE(Target + 4, 0xE8410018); // ld r2,28(r1) diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp index 0161309fbf86..bc85264ac187 100644 --- a/lib/Fuzzer/FuzzerUtilPosix.cpp +++ b/lib/Fuzzer/FuzzerUtilPosix.cpp @@ -47,8 +47,21 @@ static void FileSizeExceedHandler(int, siginfo_t *, void *) { static void SetSigaction(int signum, void (*callback)(int, siginfo_t *, void *)) { - struct sigaction sigact; - memset(&sigact, 0, sizeof(sigact)); + struct sigaction sigact = {}; + if (sigaction(signum, nullptr, &sigact)) { + Printf("libFuzzer: sigaction failed with %d\n", errno); + exit(1); + } + if (sigact.sa_flags & SA_SIGINFO) { + if (sigact.sa_sigaction) + return; + } else { + if (sigact.sa_handler != SIG_DFL && sigact.sa_handler != SIG_IGN && + sigact.sa_handler != SIG_ERR) + return; + } + + sigact = {}; sigact.sa_sigaction = callback; if (sigaction(signum, &sigact, 0)) { Printf("libFuzzer: sigaction failed with %d\n", errno); diff --git a/lib/Fuzzer/test/fuzzer-segv.test b/lib/Fuzzer/test/fuzzer-segv.test index b9a6a5ce44ca..90f01932f652 100644 --- a/lib/Fuzzer/test/fuzzer-segv.test +++ b/lib/Fuzzer/test/fuzzer-segv.test @@ -3,3 +3,5 @@ LIBFUZZER_OWN_SEGV_HANDLER: == ERROR: libFuzzer: deadly signal LIBFUZZER_OWN_SEGV_HANDLER: SUMMARY: libFuzzer: deadly signal LIBFUZZER_OWN_SEGV_HANDLER: Test unit written to ./crash- +RUN: env ASAN_OPTIONS=handle_segv=1 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_ASAN_SEGV_HANDLER +LIBFUZZER_ASAN_SEGV_HANDLER: ERROR: AddressSanitizer: {{SEGV|access-violation}} on unknown address diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index acfac316e91e..4ed7b021883d 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -212,27 +212,21 @@ using IndexAttrPair = std::pair; /// return type, and parameters. class AttributeListImpl final : public FoldingSetNode, - private TrailingObjects { + private TrailingObjects { friend class AttributeList; friend TrailingObjects; private: - LLVMContext &Context; - unsigned NumSlots; ///< Number of entries in this set. /// Bitset with a bit for each available attribute Attribute::AttrKind. uint64_t AvailableFunctionAttrs; + LLVMContext &Context; + unsigned NumAttrSets; ///< Number of entries in this set. // Helper fn for TrailingObjects class. - size_t numTrailingObjects(OverloadToken) { return NumSlots; } - - /// \brief Return a pointer to the IndexAttrPair for the specified slot. - const IndexAttrPair *getSlotPair(unsigned Slot) const { - return getTrailingObjects() + Slot; - } + size_t numTrailingObjects(OverloadToken) { return NumAttrSets; } public: - AttributeListImpl(LLVMContext &C, - ArrayRef> Slots); + AttributeListImpl(LLVMContext &C, ArrayRef Sets); // AttributesSetImpt is uniqued, these should not be available. AttributeListImpl(const AttributeListImpl &) = delete; @@ -243,41 +237,18 @@ public: /// \brief Get the context that created this AttributeListImpl. LLVMContext &getContext() { return Context; } - /// \brief Return the number of slots used in this attribute list. 
This is - /// the number of arguments that have an attribute set on them (including the - /// function itself). - unsigned getNumSlots() const { return NumSlots; } - - /// \brief Get the index of the given "slot" in the AttrNodes list. This index - /// is the index of the return, parameter, or function object that the - /// attributes are applied to, not the index into the AttrNodes list where the - /// attributes reside. - unsigned getSlotIndex(unsigned Slot) const { - return getSlotPair(Slot)->first; - } - - /// \brief Retrieve the attribute set node for the given "slot" in the - /// AttrNode list. - AttributeSet getSlotAttributes(unsigned Slot) const { - return getSlotPair(Slot)->second; - } - /// \brief Return true if the AttributeSet or the FunctionIndex has an /// enum attribute of the given kind. bool hasFnAttribute(Attribute::AttrKind Kind) const { return AvailableFunctionAttrs & ((uint64_t)1) << Kind; } - using iterator = AttributeSet::iterator; - - iterator begin(unsigned Slot) const { - return getSlotAttributes(Slot).begin(); - } - iterator end(unsigned Slot) const { return getSlotAttributes(Slot).end(); } + typedef const AttributeSet *iterator; + iterator begin() const { return getTrailingObjects(); } + iterator end() const { return begin() + NumAttrSets; } void Profile(FoldingSetNodeID &ID) const; - static void Profile(FoldingSetNodeID &ID, - ArrayRef> Nodes); + static void Profile(FoldingSetNodeID &ID, ArrayRef Nodes); void dump() const; }; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index adb31d127a2e..19b7c3027232 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -507,7 +507,7 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef Attrs) { } AttributeSet AttributeSet::addAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const { + Attribute::AttrKind Kind) const { if (hasAttribute(Kind)) return *this; AttrBuilder B; B.addAttribute(Kind); @@ -515,7 +515,7 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, } AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind, - StringRef Value) const { + StringRef Value) const { AttrBuilder B; B.addAttribute(Kind, Value); return addAttributes(C, AttributeSet::get(C, B)); @@ -788,48 +788,44 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const { // AttributeListImpl Definition //===----------------------------------------------------------------------===// -AttributeListImpl::AttributeListImpl( - LLVMContext &C, ArrayRef> Slots) - : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) { -#ifndef NDEBUG - assert(!Slots.empty() && "pointless AttributeListImpl"); - if (Slots.size() >= 2) { - auto &PrevPair = Slots.front(); - for (auto &CurPair : Slots.drop_front()) { - assert(PrevPair.first <= CurPair.first && "Attribute set not ordered!"); - } - } -#endif +/// Map from AttributeList index to the internal array index. Adding one works: +/// FunctionIndex: ~0U -> 0 +/// ReturnIndex: 0 -> 1 +/// FirstArgIndex: 1.. -> 2.. +static constexpr unsigned attrIdxToArrayIdx(unsigned Index) { + // MSVC warns about '~0U + 1' wrapping around when this is called on + // FunctionIndex, so cast to int first. + return static_cast(Index) + 1; +} + +AttributeListImpl::AttributeListImpl(LLVMContext &C, + ArrayRef Sets) + : AvailableFunctionAttrs(0), Context(C), NumAttrSets(Sets.size()) { + assert(!Sets.empty() && "pointless AttributeListImpl"); // There's memory after the node where we can store the entries in. 
- std::copy(Slots.begin(), Slots.end(), getTrailingObjects()); + std::copy(Sets.begin(), Sets.end(), getTrailingObjects()); // Initialize AvailableFunctionAttrs summary bitset. static_assert(Attribute::EndAttrKinds <= sizeof(AvailableFunctionAttrs) * CHAR_BIT, "Too many attributes"); - static_assert(AttributeList::FunctionIndex == ~0u, - "FunctionIndex should be biggest possible index"); - const auto &Last = Slots.back(); - if (Last.first == AttributeList::FunctionIndex) { - AttributeSet Node = Last.second; - for (Attribute I : Node) { - if (!I.isStringAttribute()) - AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } + static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0U, + "function should be stored in slot 0"); + for (Attribute I : Sets[0]) { + if (!I.isStringAttribute()) + AvailableFunctionAttrs |= 1ULL << I.getKindAsEnum(); } } void AttributeListImpl::Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(getSlotPair(0), getNumSlots())); + Profile(ID, makeArrayRef(begin(), end())); } -void AttributeListImpl::Profile( - FoldingSetNodeID &ID, ArrayRef> Nodes) { - for (const auto &Node : Nodes) { - ID.AddInteger(Node.first); - ID.AddPointer(Node.second.SetNode); - } +void AttributeListImpl::Profile(FoldingSetNodeID &ID, + ArrayRef Sets) { + for (const auto &Set : Sets) + ID.AddPointer(Set.SetNode); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -842,24 +838,13 @@ LLVM_DUMP_METHOD void AttributeListImpl::dump() const { // AttributeList Construction and Mutation Methods //===----------------------------------------------------------------------===// -AttributeList AttributeList::getImpl( - LLVMContext &C, ArrayRef> Attrs) { - assert(!Attrs.empty() && "creating pointless AttributeList"); -#ifndef NDEBUG - unsigned LastIndex = 0; - bool IsFirst = true; - for (auto &&AttrPair : Attrs) { - assert((IsFirst || LastIndex < AttrPair.first) && - "unsorted or duplicate AttributeList indices"); - assert(AttrPair.second.hasAttributes() && "pointless AttributeList slot"); - LastIndex = AttrPair.first; - IsFirst = false; - } -#endif +AttributeList AttributeList::getImpl(LLVMContext &C, + ArrayRef AttrSets) { + assert(!AttrSets.empty() && "pointless AttributeListImpl"); LLVMContextImpl *pImpl = C.pImpl; FoldingSetNodeID ID; - AttributeListImpl::Profile(ID, Attrs); + AttributeListImpl::Profile(ID, AttrSets); void *InsertPoint; AttributeListImpl *PA = @@ -870,8 +855,8 @@ AttributeList AttributeList::getImpl( if (!PA) { // Coallocate entries after the AttributeListImpl itself. 
void *Mem = ::operator new( - AttributeListImpl::totalSizeToAlloc(Attrs.size())); - PA = new (Mem) AttributeListImpl(C, Attrs); + AttributeListImpl::totalSizeToAlloc(AttrSets.size())); + PA = new (Mem) AttributeListImpl(C, AttrSets); pImpl->AttrsLists.InsertNode(PA, InsertPoint); } @@ -912,7 +897,7 @@ AttributeList::get(LLVMContext &C, AttrPairVec.emplace_back(Index, AttributeSet::get(C, AttrVec)); } - return getImpl(C, AttrPairVec); + return get(C, AttrPairVec); } AttributeList @@ -922,35 +907,76 @@ AttributeList::get(LLVMContext &C, if (Attrs.empty()) return AttributeList(); - return getImpl(C, Attrs); + assert(std::is_sorted(Attrs.begin(), Attrs.end(), + [](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }) && + "Misordered Attributes list!"); + assert(none_of(Attrs, + [](const std::pair &Pair) { + return !Pair.second.hasAttributes(); + }) && + "Pointless attribute!"); + + unsigned MaxIndex = Attrs.back().first; + + SmallVector AttrVec(attrIdxToArrayIdx(MaxIndex) + 1); + for (auto Pair : Attrs) + AttrVec[attrIdxToArrayIdx(Pair.first)] = Pair.second; + + return getImpl(C, AttrVec); } AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs, AttributeSet RetAttrs, ArrayRef ArgAttrs) { - SmallVector, 8> AttrPairs; - if (RetAttrs.hasAttributes()) - AttrPairs.emplace_back(ReturnIndex, RetAttrs); - size_t Index = 1; - for (AttributeSet AS : ArgAttrs) { - if (AS.hasAttributes()) - AttrPairs.emplace_back(Index, AS); - ++Index; + // Scan from the end to find the last argument with attributes. Most + // arguments don't have attributes, so it's nice if we can have fewer unique + // AttributeListImpls by dropping empty attribute sets at the end of the list. + unsigned NumSets = 0; + for (size_t I = ArgAttrs.size(); I != 0; --I) { + if (ArgAttrs[I - 1].hasAttributes()) { + NumSets = I + 2; + break; + } } - if (FnAttrs.hasAttributes()) - AttrPairs.emplace_back(FunctionIndex, FnAttrs); - if (AttrPairs.empty()) + if (NumSets == 0) { + // Check function and return attributes if we didn't have argument + // attributes. + if (RetAttrs.hasAttributes()) + NumSets = 2; + else if (FnAttrs.hasAttributes()) + NumSets = 1; + } + + // If all attribute sets were empty, we can use the empty attribute list. + if (NumSets == 0) return AttributeList(); - return getImpl(C, AttrPairs); + + SmallVector AttrSets; + AttrSets.reserve(NumSets); + // If we have any attributes, we always have function attributes. + AttrSets.push_back(FnAttrs); + if (NumSets > 1) + AttrSets.push_back(RetAttrs); + if (NumSets > 2) { + // Drop the empty argument attribute sets at the end. 
+ ArgAttrs = ArgAttrs.take_front(NumSets - 2); + AttrSets.insert(AttrSets.end(), ArgAttrs.begin(), ArgAttrs.end()); + } + + return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, const AttrBuilder &B) { if (!B.hasAttributes()) return AttributeList(); - AttributeSet AS = AttributeSet::get(C, B); - std::pair Arr[1] = {{Index, AS}}; - return getImpl(C, Arr); + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(Index + 1); + AttrSets[Index] = AttributeSet::get(C, B); + return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, @@ -973,32 +999,22 @@ AttributeList AttributeList::get(LLVMContext &C, ArrayRef Attrs) { if (Attrs.empty()) return AttributeList(); - if (Attrs.size() == 1) return Attrs[0]; + if (Attrs.size() == 1) + return Attrs[0]; - SmallVector, 8> AttrNodeVec; - AttributeListImpl *A0 = Attrs[0].pImpl; - if (A0) - AttrNodeVec.append(A0->getSlotPair(0), A0->getSlotPair(A0->getNumSlots())); - // Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec - // ordered by index. Because we know that each list in Attrs is ordered by - // index we only need to merge each successive list in rather than doing a - // full sort. - for (unsigned I = 1, E = Attrs.size(); I != E; ++I) { - AttributeListImpl *ALI = Attrs[I].pImpl; - if (!ALI) continue; - SmallVector, 8>::iterator - ANVI = AttrNodeVec.begin(), ANVE; - for (const IndexAttrPair *AI = ALI->getSlotPair(0), - *AE = ALI->getSlotPair(ALI->getNumSlots()); - AI != AE; ++AI) { - ANVE = AttrNodeVec.end(); - while (ANVI != ANVE && ANVI->first <= AI->first) - ++ANVI; - ANVI = AttrNodeVec.insert(ANVI, *AI) + 1; - } + unsigned MaxSize = 0; + for (AttributeList List : Attrs) + MaxSize = std::max(MaxSize, List.getNumAttrSets()); + + SmallVector NewAttrSets(MaxSize); + for (unsigned I = 0; I < MaxSize; ++I) { + AttrBuilder CurBuilder; + for (AttributeList List : Attrs) + CurBuilder.merge(List.getAttributes(I - 1)); + NewAttrSets[I] = AttributeSet::get(C, CurBuilder); } - return getImpl(C, AttrNodeVec); + return getImpl(C, NewAttrSets); } AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, @@ -1022,29 +1038,19 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, Attribute A) const { assert(std::is_sorted(Indices.begin(), Indices.end())); - unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0; - SmallVector AttrVec; - for (unsigned Index : Indices) { - // Add all attribute slots before the current index. - for (; I < E && getSlotIndex(I) < Index; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); + SmallVector AttrSets(this->begin(), this->end()); + unsigned MaxIndex = attrIdxToArrayIdx(Indices.back()); + if (MaxIndex >= AttrSets.size()) + AttrSets.resize(MaxIndex + 1); - // Add the attribute at this index. If we already have attributes at this - // index, merge them into a new set. - AttrBuilder B; - if (I < E && getSlotIndex(I) == Index) { - B.merge(AttrBuilder(pImpl->getSlotAttributes(I))); - ++I; - } + for (unsigned Index : Indices) { + Index = attrIdxToArrayIdx(Index); + AttrBuilder B(AttrSets[Index]); B.addAttribute(A); - AttrVec.emplace_back(Index, AttributeSet::get(C, B)); + AttrSets[Index] = AttributeSet::get(C, B); } - // Add remaining attributes. 
- for (; I < E; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - - return get(C, AttrVec); + return getImpl(C, AttrSets); } AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, @@ -1064,33 +1070,16 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, "Attempt to change alignment!"); #endif - SmallVector AttrVec; - uint64_t NumAttrs = pImpl->getNumSlots(); - unsigned I; + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(this->begin(), this->end()); + if (Index >= AttrSets.size()) + AttrSets.resize(Index + 1); - // Add all the attribute slots before the one we need to merge. - for (I = 0; I < NumAttrs; ++I) { - if (getSlotIndex(I) >= Index) - break; - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - } + AttrBuilder Merged(AttrSets[Index]); + Merged.merge(B); + AttrSets[Index] = AttributeSet::get(C, Merged); - AttrBuilder NewAttrs; - if (I < NumAttrs && getSlotIndex(I) == Index) { - // We need to merge the attribute sets. - NewAttrs.merge(pImpl->getSlotAttributes(I)); - ++I; - } - NewAttrs.merge(B); - - // Add the new or merged attribute set at this index. - AttrVec.emplace_back(Index, AttributeSet::get(C, NewAttrs)); - - // Add the remaining entries. - for (; I < NumAttrs; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - - return get(C, AttrVec); + return getImpl(C, AttrSets); } AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, @@ -1109,54 +1098,38 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, return removeAttributes(C, Index, B); } -AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index, - const AttrBuilder &Attrs) const { +AttributeList +AttributeList::removeAttributes(LLVMContext &C, unsigned Index, + const AttrBuilder &AttrsToRemove) const { if (!pImpl) return AttributeList(); // FIXME it is not obvious how this should work for alignment. // For now, say we can't pass in alignment, which no current use does. - assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!"); + assert(!AttrsToRemove.hasAlignmentAttr() && "Attempt to change alignment!"); - // Add the attribute slots before the one we're trying to add. - SmallVector AttrSets; - uint64_t NumAttrs = pImpl->getNumSlots(); - AttrBuilder B; - uint64_t LastIndex = 0; - for (unsigned I = 0, E = NumAttrs; I != E; ++I) { - if (getSlotIndex(I) >= Index) { - if (getSlotIndex(I) == Index) - B = AttrBuilder(getSlotAttributes(LastIndex++)); - break; - } - LastIndex = I + 1; - AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); - } + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(this->begin(), this->end()); + if (Index >= AttrSets.size()) + AttrSets.resize(Index + 1); - // Remove the attributes from the existing set and add them. - B.remove(Attrs); - if (B.hasAttributes()) - AttrSets.push_back({Index, AttributeSet::get(C, B)}); + AttrBuilder B(AttrSets[Index]); + B.remove(AttrsToRemove); + AttrSets[Index] = AttributeSet::get(C, B); - // Add the remaining attribute slots. 
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I) - AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); - - return get(C, AttrSets); + return getImpl(C, AttrSets); } AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned WithoutIndex) const { if (!pImpl) return AttributeList(); - - SmallVector, 4> AttrSet; - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) { - unsigned Index = getSlotIndex(I); - if (Index != WithoutIndex) - AttrSet.push_back({Index, pImpl->getSlotAttributes(I)}); - } - return get(C, AttrSet); + WithoutIndex = attrIdxToArrayIdx(WithoutIndex); + if (WithoutIndex >= getNumAttrSets()) + return *this; + SmallVector AttrSets(this->begin(), this->end()); + AttrSets[WithoutIndex] = AttributeSet(); + return getImpl(C, AttrSets); } AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C, @@ -1225,20 +1198,20 @@ bool AttributeList::hasFnAttribute(StringRef Kind) const { bool AttributeList::hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const { - return hasAttribute(ArgNo + 1, Kind); + return hasAttribute(ArgNo + FirstArgIndex, Kind); } bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr, unsigned *Index) const { if (!pImpl) return false; - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) - for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I); - II != IE; ++II) - if (II->hasAttribute(Attr)) { - if (Index) *Index = pImpl->getSlotIndex(I); - return true; - } + for (unsigned I = index_begin(), E = index_end(); I != E; ++I) { + if (hasAttribute(I, Attr)) { + if (Index) + *Index = I; + return true; + } + } return false; } @@ -1282,60 +1255,35 @@ std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const { } AttributeSet AttributeList::getAttributes(unsigned Index) const { - if (!pImpl) return AttributeSet(); - - // Loop through to find the attribute node we want. - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) - if (pImpl->getSlotIndex(I) == Index) - return pImpl->getSlotAttributes(I); - - return AttributeSet(); + Index = attrIdxToArrayIdx(Index); + if (!pImpl || Index >= getNumAttrSets()) + return AttributeSet(); + return pImpl->begin()[Index]; } -AttributeList::iterator AttributeList::begin(unsigned Slot) const { - if (!pImpl) - return ArrayRef().begin(); - return pImpl->begin(Slot); +AttributeList::iterator AttributeList::begin() const { + return pImpl ? pImpl->begin() : nullptr; } -AttributeList::iterator AttributeList::end(unsigned Slot) const { - if (!pImpl) - return ArrayRef().end(); - return pImpl->end(Slot); +AttributeList::iterator AttributeList::end() const { + return pImpl ? pImpl->end() : nullptr; } //===----------------------------------------------------------------------===// // AttributeList Introspection Methods //===----------------------------------------------------------------------===// -unsigned AttributeList::getNumSlots() const { - return pImpl ? pImpl->getNumSlots() : 0; -} - -unsigned AttributeList::getSlotIndex(unsigned Slot) const { - assert(pImpl && Slot < pImpl->getNumSlots() && - "Slot # out of range!"); - return pImpl->getSlotIndex(Slot); -} - -AttributeSet AttributeList::getSlotAttributes(unsigned Slot) const { - assert(pImpl && Slot < pImpl->getNumSlots() && - "Slot # out of range!"); - return pImpl->getSlotAttributes(Slot); +unsigned AttributeList::getNumAttrSets() const { + return pImpl ? 
pImpl->NumAttrSets : 0; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void AttributeList::dump() const { dbgs() << "PAL[\n"; - for (unsigned i = 0, e = getNumSlots(); i < e; ++i) { - uint64_t Index = getSlotIndex(i); - dbgs() << " { "; - if (Index == ~0U) - dbgs() << "~0U"; - else - dbgs() << Index; - dbgs() << " => " << getAsString(Index) << " }\n"; + for (unsigned i = index_begin(), e = index_end(); i != e; ++i) { + if (getAttributes(i).hasAttributes()) + dbgs() << " { " << i << " => " << getAsString(i) << " }\n"; } dbgs() << "]\n"; @@ -1346,26 +1294,16 @@ LLVM_DUMP_METHOD void AttributeList::dump() const { // AttrBuilder Method Implementations //===----------------------------------------------------------------------===// +// FIXME: Remove this ctor, use AttributeSet. AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) { - AttributeListImpl *pImpl = AL.pImpl; - if (!pImpl) return; - - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) { - if (pImpl->getSlotIndex(I) != Index) continue; - - for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I); - II != IE; ++II) - addAttribute(*II); - - break; - } + AttributeSet AS = AL.getAttributes(Index); + for (const Attribute &A : AS) + addAttribute(A); } AttrBuilder::AttrBuilder(AttributeSet AS) { - if (AS.hasAttributes()) { - for (const Attribute &A : AS) - addAttribute(A); - } + for (const Attribute &A : AS) + addAttribute(A); } void AttrBuilder::clear() { diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 90ca21ab91f8..1f8659d4e2ca 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -263,6 +263,10 @@ const BasicBlock *BasicBlock::getUniqueSuccessor() const { return SuccBB; } +iterator_range BasicBlock::phis() { + return make_range(dyn_cast(&front()), nullptr); +} + /// This method is used to notify a BasicBlock that the /// specified Predecessor of the block is no longer able to reach it. This is /// actually not used to update the Predecessor list, but is actually used to @@ -389,13 +393,11 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { // Loop over any phi nodes in the basic block, updating the BB field of // incoming values... BasicBlock *Successor = *I; - PHINode *PN; - for (BasicBlock::iterator II = Successor->begin(); - (PN = dyn_cast(II)); ++II) { - int IDX = PN->getBasicBlockIndex(this); - while (IDX != -1) { - PN->setIncomingBlock((unsigned)IDX, New); - IDX = PN->getBasicBlockIndex(this); + for (auto &PN : Successor->phis()) { + int Idx = PN.getBasicBlockIndex(this); + while (Idx != -1) { + PN.setIncomingBlock((unsigned)Idx, New); + Idx = PN.getBasicBlockIndex(this); } } } diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index 3168ec6944a3..b7e3f0c6779e 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -163,7 +163,7 @@ void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, // Fix up debug variables to point to NewSP. auto reparentVar = [&](DILocalVariable *Var) { - return DILocalVariable::getDistinct( + return DILocalVariable::get( Ctx, cast( reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)), diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 01d4ed6c8eef..d7baa9ebc223 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -454,6 +454,9 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i, // question is a call argument; or be indirectly implied by the kind of its // containing operand bundle, if the operand is a bundle operand. 
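The new BasicBlock::phis() range and the splitBasicBlock hunk above both deal with the same chore: when a block is split, its successors' PHI nodes still name the old block as the incoming predecessor, and each matching entry must be retargeted to the new block. A toy model of that retargeting (illustrative names, not the LLVM API):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy PHI node: a list of (incoming block, incoming value) pairs.
struct ToyPhi {
  std::vector<std::pair<std::string, int>> Incoming;
};

// Retarget every incoming entry that names OldPred so it names NewPred,
// mirroring what splitBasicBlock does for the PHIs of each successor.
static void retargetIncoming(ToyPhi &PN, const std::string &OldPred,
                             const std::string &NewPred) {
  for (auto &In : PN.Incoming)
    if (In.first == OldPred)
      In.first = NewPred;
}

int main() {
  // %p = phi [1, %entry], [2, %loop]; "entry" is split, tail -> "entry.split".
  ToyPhi P{{{"entry", 1}, {"loop", 2}}};
  retargetIncoming(P, "entry", "entry.split");
  assert(P.Incoming[0].first == "entry.split" && P.Incoming[1].first == "loop");
  return 0;
}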
+ if (i == AttributeList::ReturnIndex) + return hasRetAttr(Kind); + // FIXME: Avoid these i - 1 calculations and update the API to use zero-based // indices. if (i < (getNumArgOperands() + 1)) @@ -779,6 +782,9 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, // question is an invoke argument; or be indirectly implied by the kind of its // containing operand bundle, if the operand is a bundle operand. + if (i == AttributeList::ReturnIndex) + return hasRetAttr(Kind); + // FIXME: Avoid these i - 1 calculations and update the API to use zero-based // indices. if (i < (getNumArgOperands() + 1)) diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp index c9814a96bea6..94e115a6a78d 100644 --- a/lib/IR/IntrinsicInst.cpp +++ b/lib/IR/IntrinsicInst.cpp @@ -97,7 +97,9 @@ Value *InstrProfIncrementInst::getStep() const { ConstrainedFPIntrinsic::RoundingMode ConstrainedFPIntrinsic::getRoundingMode() const { - Metadata *MD = dyn_cast(getOperand(2))->getMetadata(); + unsigned NumOperands = getNumArgOperands(); + Metadata *MD = + dyn_cast(getArgOperand(NumOperands - 2))->getMetadata(); if (!MD || !isa(MD)) return rmInvalid; StringRef RoundingArg = cast(MD)->getString(); @@ -115,7 +117,9 @@ ConstrainedFPIntrinsic::getRoundingMode() const { ConstrainedFPIntrinsic::ExceptionBehavior ConstrainedFPIntrinsic::getExceptionBehavior() const { - Metadata *MD = dyn_cast(getOperand(3))->getMetadata(); + unsigned NumOperands = getNumArgOperands(); + Metadata *MD = + dyn_cast(getArgOperand(NumOperands - 1))->getMetadata(); if (!MD || !isa(MD)) return ebInvalid; StringRef ExceptionArg = cast(MD)->getString(); @@ -125,3 +129,21 @@ ConstrainedFPIntrinsic::getExceptionBehavior() const { .Case("fpexcept.strict", ebStrict) .Default(ebInvalid); } + +bool ConstrainedFPIntrinsic::isUnaryOp() const { + switch (getIntrinsicID()) { + default: + return false; + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + return true; + } +} diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 12c258d95f52..95673e515a55 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -481,7 +481,7 @@ PICLevel::Level Module::getPICLevel() const { } void Module::setPICLevel(PICLevel::Level PL) { - addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); + addModuleFlag(ModFlagBehavior::Max, "PIC Level", PL); } PIELevel::Level Module::getPIELevel() const { @@ -495,7 +495,7 @@ PIELevel::Level Module::getPIELevel() const { } void Module::setPIELevel(PIELevel::Level PL) { - addModuleFlag(ModFlagBehavior::Error, "PIE Level", PL); + addModuleFlag(ModFlagBehavior::Max, "PIE Level", PL); } void Module::setProfileSummary(Metadata *M) { diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 21e8048442be..a8523236ac9f 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -1282,6 +1282,13 @@ Verifier::visitModuleFlag(const MDNode *Op, // These behavior types accept any value. 
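For the experimental constrained FP intrinsics touched above, the rounding mode and exception behavior travel as the last two metadata arguments, so a unary operation carries three arguments and a binary one four, and the accessors now index from the end rather than from a fixed position. A standalone sketch of that layout and of the string mapping, using the metadata spellings from LangRef (toy types, not IntrinsicInst):

#include <cassert>
#include <string>
#include <vector>

enum RoundingMode { rmInvalid, rmDynamic, rmToNearest, rmDownward, rmUpward, rmTowardZero };
enum ExceptionBehavior { ebInvalid, ebIgnore, ebMayTrap, ebStrict };

// Toy call: value operands followed by the two metadata strings.
struct ToyConstrainedCall {
  std::vector<std::string> Args; // e.g. {"%x", "%y", "round.dynamic", "fpexcept.strict"}

  RoundingMode getRoundingMode() const {
    const std::string &S = Args[Args.size() - 2]; // second-to-last operand
    if (S == "round.dynamic") return rmDynamic;
    if (S == "round.tonearest") return rmToNearest;
    if (S == "round.downward") return rmDownward;
    if (S == "round.upward") return rmUpward;
    if (S == "round.towardzero") return rmTowardZero;
    return rmInvalid;
  }

  ExceptionBehavior getExceptionBehavior() const {
    const std::string &S = Args.back();           // last operand
    if (S == "fpexcept.ignore") return ebIgnore;
    if (S == "fpexcept.maytrap") return ebMayTrap;
    if (S == "fpexcept.strict") return ebStrict;
    return ebInvalid;
  }

  // Verifier-style check: unary ops carry 3 operands, binary ops 4.
  bool hasPlausibleOperandCount(bool IsUnary) const {
    return Args.size() == (IsUnary ? 3u : 4u);
  }
};

int main() {
  ToyConstrainedCall Sqrt{{"%x", "round.dynamic", "fpexcept.strict"}};
  assert(Sqrt.hasPlausibleOperandCount(/*IsUnary=*/true));
  assert(Sqrt.getRoundingMode() == rmDynamic && Sqrt.getExceptionBehavior() == ebStrict);
  return 0;
}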
break; + case Module::Max: { + Assert(mdconst::dyn_extract_or_null(Op->getOperand(2)), + "invalid value for 'max' module flag (expected constant integer)", + Op->getOperand(2)); + break; + } + case Module::Require: { // The value should itself be an MDNode with two operands, a flag ID (an // MDString), and a value. @@ -1729,17 +1736,9 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { } bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { - if (Attrs.getNumSlots() == 0) - return true; - - unsigned LastSlot = Attrs.getNumSlots() - 1; - unsigned LastIndex = Attrs.getSlotIndex(LastSlot); - if (LastIndex <= Params || - (LastIndex == AttributeList::FunctionIndex && - (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params))) - return true; - - return false; + // There shouldn't be more attribute sets than there are parameters plus the + // function and return value. + return Attrs.getNumAttrSets() <= Params + 2; } /// Verify that statepoint intrinsic is well formed. @@ -3967,6 +3966,18 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: visitConstrainedFPIntrinsic( cast(*CS.getInstruction())); break; @@ -4336,7 +4347,12 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) { } void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { - Assert(isa(FPI.getOperand(2)), + unsigned NumOperands = FPI.getNumArgOperands(); + Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)), + "invalid arguments for constrained FP intrinsic", &FPI); + Assert(isa(FPI.getArgOperand(NumOperands-1)), + "invalid exception behavior argument", &FPI); + Assert(isa(FPI.getArgOperand(NumOperands-2)), "invalid rounding mode argument", &FPI); Assert(FPI.getRoundingMode() != ConstrainedFPIntrinsic::rmInvalid, "invalid rounding mode argument", &FPI); diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index c73b6b6b15c1..9efc095f9fcf 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -114,7 +114,10 @@ static void computeCacheKey( AddUnsigned((unsigned)Conf.Options.DebuggerTuning); for (auto &A : Conf.MAttrs) AddString(A); - AddUnsigned(Conf.RelocModel); + if (Conf.RelocModel) + AddUnsigned(*Conf.RelocModel); + else + AddUnsigned(-1); AddUnsigned(Conf.CodeModel); AddUnsigned(Conf.CGOptLevel); AddUnsigned(Conf.CGFileType); @@ -539,16 +542,10 @@ Error LTO::addRegularLTO(BitcodeModule BM, if (Sym.isUndefined()) continue; Keep.push_back(GV); - switch (GV->getLinkage()) { - default: - break; - case GlobalValue::LinkOnceAnyLinkage: - GV->setLinkage(GlobalValue::WeakAnyLinkage); - break; - case GlobalValue::LinkOnceODRLinkage: - GV->setLinkage(GlobalValue::WeakODRLinkage); - break; - } + GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage(); + if (GlobalValue::isLinkOnceLinkage(OriginalLinkage)) + 
GV->setLinkage(GlobalValue::getWeakLinkage( + GlobalValue::isLinkOnceODRLinkage(OriginalLinkage))); } else if (isa(GV) && (GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() || GV->hasAvailableExternallyLinkage()) && @@ -999,10 +996,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportedGUIDs.insert(GUID); } - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { const auto &ExportList = ExportLists.find(ModuleIdentifier); return (ExportList != ExportLists.end() && @@ -1010,17 +1003,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportedGUIDs.count(GUID); }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported); - - auto recordNewLinkage = [&](StringRef ModuleIdentifier, - GlobalValue::GUID GUID, - GlobalValue::LinkageTypes NewLinkage) { - ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; - }; - - thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing, - recordNewLinkage); } + auto isPrevailing = [&](GlobalValue::GUID GUID, + const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + auto recordNewLinkage = [&](StringRef ModuleIdentifier, + GlobalValue::GUID GUID, + GlobalValue::LinkageTypes NewLinkage) { + ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; + }; + thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing, + recordNewLinkage); + std::unique_ptr BackendProc = ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, AddStream, Cache); diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 30447c528af1..668667a53562 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -117,15 +117,22 @@ Error Config::addSaveTemps(std::string OutputFileName, namespace { std::unique_ptr -createTargetMachine(Config &Conf, StringRef TheTriple, - const Target *TheTarget) { +createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) { + StringRef TheTriple = M.getTargetTriple(); SubtargetFeatures Features; Features.getDefaultSubtargetFeatures(Triple(TheTriple)); for (const std::string &A : Conf.MAttrs) Features.AddFeature(A); + Reloc::Model RelocModel; + if (Conf.RelocModel) + RelocModel = *Conf.RelocModel; + else + RelocModel = + M.getPICLevel() == PICLevel::NotPIC ? Reloc::Static : Reloc::PIC_; + return std::unique_ptr(TheTarget->createTargetMachine( - TheTriple, Conf.CPU, Features.getString(), Conf.Options, Conf.RelocModel, + TheTriple, Conf.CPU, Features.getString(), Conf.Options, RelocModel, Conf.CodeModel, Conf.CGOptLevel)); } @@ -311,7 +318,7 @@ void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream, std::unique_ptr MPartInCtx = std::move(MOrErr.get()); std::unique_ptr TM = - createTargetMachine(C, MPartInCtx->getTargetTriple(), T); + createTargetMachine(C, T, *MPartInCtx); codegen(C, TM.get(), AddStream, ThreadId, *MPartInCtx); }, @@ -360,8 +367,7 @@ Error lto::backend(Config &C, AddStreamFn AddStream, if (!TOrErr) return TOrErr.takeError(); - std::unique_ptr TM = - createTargetMachine(C, Mod->getTargetTriple(), *TOrErr); + std::unique_ptr TM = createTargetMachine(C, *TOrErr, *Mod); // Setup optimization remarks. 
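createTargetMachine now receives the Module so that, when the LTO config leaves the relocation model unset, it can be derived from the module's PIC level rather than assumed. A small sketch of that selection rule, with std::optional standing in for the optional config field (illustrative names):

#include <cassert>
#include <optional>

enum class PICLevel { NotPIC, SmallPIC, BigPIC };
enum class RelocModel { Static, PIC };

// If the configuration pins a relocation model, honour it; otherwise fall
// back to what the module's "PIC Level" flag implies, as LTOBackend now does.
static RelocModel pickRelocModel(std::optional<RelocModel> Configured,
                                 PICLevel ModulePICLevel) {
  if (Configured)
    return *Configured;
  return ModulePICLevel == PICLevel::NotPIC ? RelocModel::Static
                                            : RelocModel::PIC;
}

int main() {
  assert(pickRelocModel(std::nullopt, PICLevel::BigPIC) == RelocModel::PIC);
  assert(pickRelocModel(RelocModel::Static, PICLevel::BigPIC) == RelocModel::Static);
  return 0;
}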
auto DiagFileOrErr = lto::setupOptimizationRemarks( @@ -397,8 +403,7 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream, if (!TOrErr) return TOrErr.takeError(); - std::unique_ptr TM = - createTargetMachine(Conf, Mod.getTargetTriple(), *TOrErr); + std::unique_ptr TM = createTargetMachine(Conf, *TOrErr, Mod); if (Conf.CodeGenOnly) { codegen(Conf, TM.get(), AddStream, Task, Mod); diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index c0af21aa148c..defad1904989 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -1157,6 +1157,11 @@ Error IRLinker::linkModuleFlagsMetadata() { mdconst::extract(DstOp->getOperand(0)); unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + auto overrideDstValue = [&]() { + DstModFlags->setOperand(DstIndex, SrcOp); + Flags[ID].first = SrcOp; + }; + // If either flag has override behavior, handle it first. if (DstBehaviorValue == Module::Override) { // Diagnose inconsistent flags which both have override behavior. @@ -1167,8 +1172,7 @@ Error IRLinker::linkModuleFlagsMetadata() { continue; } else if (SrcBehaviorValue == Module::Override) { // Update the destination flag to that of the source. - DstModFlags->setOperand(DstIndex, SrcOp); - Flags[ID].first = SrcOp; + overrideDstValue(); continue; } @@ -1204,6 +1208,15 @@ Error IRLinker::linkModuleFlagsMetadata() { } continue; } + case Module::Max: { + ConstantInt *DstValue = + mdconst::extract(DstOp->getOperand(2)); + ConstantInt *SrcValue = + mdconst::extract(SrcOp->getOperand(2)); + if (SrcValue->getZExtValue() > DstValue->getZExtValue()) + overrideDstValue(); + break; + } case Module::Append: { MDNode *DstValue = cast(DstOp->getOperand(2)); MDNode *SrcValue = cast(SrcOp->getOperand(2)); diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 0540c4c47a3f..8c3df36cfb48 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -422,6 +422,7 @@ static void ApplyRelocations( RelEntry.Offset; switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -429,6 +430,7 @@ static void ApplyRelocations( break; } case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -448,6 +450,7 @@ static void ApplyRelocations( break; } case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -478,6 +481,7 @@ WriteRelocations(ArrayRef Relocations, uint64_t Offset = RelEntry.Offset + RelEntry.FixupSection->getSectionOffset() + HeaderSize; + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; int64_t Addend = RelEntry.Addend; @@ -726,10 +730,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, if (IsAddressTaken.count(&WS)) TableElems.push_back(Index); } else { - // For now, ignore temporary non-function symbols. - if (S.isTemporary()) - continue; - if (WS.getOffset() != 0) report_fatal_error("data sections must contain one variable each"); if (!WS.getSize()) @@ -777,20 +777,18 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, } } - // For each external global, prepare a corresponding wasm global - // holding its address. 
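The new Module::Max flag behavior, used above for "PIC Level" and "PIE Level", keeps the larger of the two integer values when modules are linked. A standalone model of that merge rule next to Override and Error for contrast (a simplified subset of what IRMover handles):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>

enum class FlagBehavior { Error, Max, Override };

// Merge one integer-valued module flag from a source module into the
// destination, following (a subset of) the behaviors handled by IRMover.
static uint64_t mergeFlag(FlagBehavior Behavior, uint64_t Dst, uint64_t Src) {
  switch (Behavior) {
  case FlagBehavior::Override:
    return Src;                // source always wins
  case FlagBehavior::Max:
    return std::max(Dst, Src); // keep the larger value
  case FlagBehavior::Error:
    if (Dst != Src)
      throw std::runtime_error("linking module flags: IDs have conflicting values");
    return Dst;
  }
  return Dst; // unreachable
}

int main() {
  // Two modules built at PIC levels 1 and 2: the linked module keeps 2.
  assert(mergeFlag(FlagBehavior::Max, /*Dst=*/1, /*Src=*/2) == 2);
  assert(mergeFlag(FlagBehavior::Max, /*Dst=*/2, /*Src=*/1) == 2);
  return 0;
}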
- if (WS.isExternal()) { - Index = NumGlobalImports + Globals.size(); + // For each global, prepare a corresponding wasm global holding its + // address. For externals these will also be named exports. + Index = NumGlobalImports + Globals.size(); - WasmGlobal Global; - Global.Type = PtrType; - Global.IsMutable = false; - Global.HasImport = false; - Global.InitialValue = DataSection.getSectionOffset(); - Global.ImportIndex = 0; - SymbolIndices[&WS] = Index; - Globals.push_back(Global); - } + WasmGlobal Global; + Global.Type = PtrType; + Global.IsMutable = false; + Global.HasImport = false; + Global.InitialValue = DataSection.getSectionOffset(); + Global.ImportIndex = 0; + SymbolIndices[&WS] = Index; + Globals.push_back(Global); } } diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 28531feccfe1..7372f24cb9a8 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -293,6 +293,10 @@ uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const { return Result; } +uint64_t COFFObjectFile::getSectionIndex(DataRefImpl Sec) const { + return toSec(Sec) - SectionTable; +} + uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const { return getSectionSize(toSec(Ref)); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 3d3fa07db3f4..bfb8875f47d4 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1820,6 +1820,10 @@ uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const { return getSection(Sec).addr; } +uint64_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const { + return Sec.d.a; +} + uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { // In the case if a malformed Mach-O file where the section offset is past // the end of the file or some part of the section size is past the end of diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 058686e4db9e..f565d7a33e55 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -743,6 +743,10 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec, uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; } +uint64_t WasmObjectFile::getSectionIndex(DataRefImpl Sec) const { + return Sec.d.a; +} + uint64_t WasmObjectFile::getSectionSize(DataRefImpl Sec) const { const WasmSection &S = Sections[Sec.d.a]; return S.Content.size(); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index 7eafb00855d7..b00d21ec8f67 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -186,6 +186,20 @@ static unsigned matchOption(const OptTable::Info *I, StringRef Str, return 0; } +std::vector OptTable::findByPrefix(StringRef Cur) const { + std::vector Ret; + for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { + if (!In.Prefixes) + continue; + for (int I = 0; In.Prefixes[I]; I++) { + std::string S = std::string(In.Prefixes[I]) + std::string(In.Name); + if (StringRef(S).startswith(Cur)) + Ret.push_back(S); + } + } + return Ret; +} + Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index, unsigned FlagsToInclude, unsigned FlagsToExclude) const { diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 6ece7965ce64..abc53e97aa72 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -155,6 +155,11 @@ static cl::opt cl::Hidden, cl::ZeroOrMore, cl::desc("Run Partial inlinining pass")); +static cl::opt + RunNewGVN("enable-npm-newgvn", cl::init(false), + 
cl::Hidden, cl::ZeroOrMore, + cl::desc("Run NewGVN instead of GVN")); + static cl::opt EnableGVNHoist( "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); @@ -336,10 +341,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Rotate Loop - disable header duplication at -Oz LPM1.addPass(LoopRotatePass(Level != Oz)); LPM1.addPass(LICMPass()); -#if 0 - // The LoopUnswitch pass isn't yet ported to the new pass manager. - LPM1.addPass(LoopUnswitchPass(/* OptimizeForSize */ Level != O3)); -#endif + LPM1.addPass(SimpleLoopUnswitchPass()); LPM2.addPass(IndVarSimplifyPass()); LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(LoopDeletionPass()); @@ -357,7 +359,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if (Level != O1) { // These passes add substantial compile time so skip them at O1. FPM.addPass(MergedLoadStoreMotionPass()); - FPM.addPass(GVN()); + if (RunNewGVN) + FPM.addPass(NewGVNPass()); + else + FPM.addPass(GVN()); } // Specially optimize memory movement as it doesn't look like dataflow in SSA. @@ -429,6 +434,11 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline))); } + // Delete anything that is now dead to make sure that we don't instrument + // dead code. Instrumentation can end up keeping dead code around and + // dramatically increase code size. + MPM.addPass(GlobalDCEPass()); + if (RunProfileGen) { MPM.addPass(PGOInstrumentationGen()); @@ -774,7 +784,10 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // FIXME: once we fix LoopPass Manager, add LICM here. // FIXME: once we provide support for enabling MLSM, add it here. // FIXME: once we provide support for enabling NewGVN, add it here. - MainFPM.addPass(GVN()); + if (RunNewGVN) + MainFPM.addPass(NewGVNPass()); + else + MainFPM.addPass(GVN()); // Remove dead memcpy()'s. MainFPM.addPass(MemCpyOptPass()); diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 64a65ccc11a1..a2b7c94f9dec 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -355,7 +355,7 @@ void InstrProfSymtab::create(Module &M, bool InLTO) { finalizeSymtab(); } -Error collectPGOFuncNameStrings(const std::vector &NameStrs, +Error collectPGOFuncNameStrings(ArrayRef NameStrs, bool doCompression, std::string &Result) { assert(!NameStrs.empty() && "No name data to emit"); @@ -403,7 +403,7 @@ StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar) { return NameStr; } -Error collectPGOFuncNameStrings(const std::vector &NameVars, +Error collectPGOFuncNameStrings(ArrayRef NameVars, std::string &Result, bool doCompression) { std::vector NameStrs; for (auto *NameVar : NameVars) { @@ -978,22 +978,22 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { } // Parse the value profile options. 
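The value-profile option handled just below has the form "<start>:<last>" (or a bare "<last>"), with 0 and 8 as the defaults; the rewrite only moves the logic onto StringRef. A standalone sketch of the same parsing rules on plain std::string (hypothetical helper name):

#include <cassert>
#include <cstdint>
#include <string>

// Parse a "<start>:<last>" memop size range, defaulting to [0, 8]; a bare
// "<last>" only overrides the upper bound. Mirrors the rules used by
// getMemOPSizeRangeFromOption in the hunk below.
static void parseSizeRange(const std::string &Opt, int64_t &Start, int64_t &Last) {
  Start = 0;
  Last = 8;
  if (Opt.empty())
    return;
  std::size_t Pos = Opt.find(':');
  if (Pos != std::string::npos) {
    if (Pos > 0)
      Start = std::stoll(Opt.substr(0, Pos));
    if (Pos < Opt.size() - 1)
      Last = std::stoll(Opt.substr(Pos + 1));
  } else {
    Last = std::stoll(Opt);
  }
  assert(Last >= Start);
}

int main() {
  int64_t S, L;
  parseSizeRange("", S, L);      assert(S == 0 && L == 8);
  parseSizeRange("2:16", S, L);  assert(S == 2 && L == 16);
  parseSizeRange("32", S, L);    assert(S == 0 && L == 32);
  parseSizeRange(":4", S, L);    assert(S == 0 && L == 4);
  return 0;
}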
-void getMemOPSizeRangeFromOption(std::string MemOPSizeRange, - int64_t &RangeStart, int64_t &RangeLast) { +void getMemOPSizeRangeFromOption(StringRef MemOPSizeRange, int64_t &RangeStart, + int64_t &RangeLast) { static const int64_t DefaultMemOPSizeRangeStart = 0; static const int64_t DefaultMemOPSizeRangeLast = 8; RangeStart = DefaultMemOPSizeRangeStart; RangeLast = DefaultMemOPSizeRangeLast; if (!MemOPSizeRange.empty()) { - auto Pos = MemOPSizeRange.find(":"); + auto Pos = MemOPSizeRange.find(':'); if (Pos != std::string::npos) { if (Pos > 0) - RangeStart = atoi(MemOPSizeRange.substr(0, Pos).c_str()); + MemOPSizeRange.substr(0, Pos).getAsInteger(10, RangeStart); if (Pos < MemOPSizeRange.size() - 1) - RangeLast = atoi(MemOPSizeRange.substr(Pos + 1).c_str()); + MemOPSizeRange.substr(Pos + 1).getAsInteger(10, RangeLast); } else - RangeLast = atoi(MemOPSizeRange.c_str()); + MemOPSizeRange.getAsInteger(10, RangeLast); } assert(RangeLast >= RangeStart); } diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 2a916b14bc22..e9716e3b1e87 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -2045,7 +2045,7 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, if (isSingleWord()) { char Buffer[65]; - char *BufPtr = Buffer+65; + char *BufPtr = std::end(Buffer); uint64_t N; if (!Signed) { @@ -2069,7 +2069,7 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, *--BufPtr = Digits[N % Radix]; N /= Radix; } - Str.append(BufPtr, Buffer+65); + Str.append(BufPtr, std::end(Buffer)); return; } diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp index 5c277448a765..862232971162 100644 --- a/lib/Support/BinaryStreamReader.cpp +++ b/lib/Support/BinaryStreamReader.cpp @@ -42,29 +42,30 @@ Error BinaryStreamReader::readBytes(ArrayRef &Buffer, uint32_t Size) { } Error BinaryStreamReader::readCString(StringRef &Dest) { - // TODO: This could be made more efficient by using readLongestContiguousChunk - // and searching for null terminators in the resulting buffer. - - uint32_t Length = 0; - // First compute the length of the string by reading 1 byte at a time. uint32_t OriginalOffset = getOffset(); - const char *C; + uint32_t FoundOffset = 0; while (true) { - if (auto EC = readObject(C)) + uint32_t ThisOffset = getOffset(); + ArrayRef Buffer; + if (auto EC = readLongestContiguousChunk(Buffer)) return EC; - if (*C == '\0') + StringRef S(reinterpret_cast(Buffer.begin()), Buffer.size()); + size_t Pos = S.find_first_of('\0'); + if (LLVM_LIKELY(Pos != StringRef::npos)) { + FoundOffset = Pos + ThisOffset; break; - ++Length; + } } - // Now go back and request a reference for that many bytes. - uint32_t NewOffset = getOffset(); + assert(FoundOffset >= OriginalOffset); + setOffset(OriginalOffset); + size_t Length = FoundOffset - OriginalOffset; if (auto EC = readFixedString(Dest, Length)) return EC; - // Now set the offset back to where it was after we calculated the length. - setOffset(NewOffset); + // Now set the offset back to after the null terminator. + setOffset(FoundOffset + 1); return Error::success(); } diff --git a/lib/Support/ConvertUTF.cpp b/lib/Support/ConvertUTF.cpp index 39fd218d3f07..aa9507c189ed 100644 --- a/lib/Support/ConvertUTF.cpp +++ b/lib/Support/ConvertUTF.cpp @@ -53,6 +53,35 @@ #endif #include + +/* + * This code extensively uses fall-through switches. + * Keep the compiler from warning about that. 
+ */ +#if defined(__clang__) && defined(__has_warning) +# if __has_warning("-Wimplicit-fallthrough") +# define ConvertUTF_DISABLE_WARNINGS \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"") +# define ConvertUTF_RESTORE_WARNINGS \ + _Pragma("clang diagnostic pop") +# endif +#elif defined(__GNUC__) && __GNUC__ > 6 +# define ConvertUTF_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +# define ConvertUTF_RESTORE_WARNINGS \ + _Pragma("GCC diagnostic pop") +#endif +#ifndef ConvertUTF_DISABLE_WARNINGS +# define ConvertUTF_DISABLE_WARNINGS +#endif +#ifndef ConvertUTF_RESTORE_WARNINGS +# define ConvertUTF_RESTORE_WARNINGS +#endif + +ConvertUTF_DISABLE_WARNINGS + namespace llvm { static const int halfShift = 10; /* used for shifting by 10 bits */ @@ -708,3 +737,5 @@ ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, --------------------------------------------------------------------- */ } // namespace llvm + +ConvertUTF_RESTORE_WARNINGS diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp index 29dae8a20f00..a10ac8e85396 100644 --- a/lib/Support/DebugCounter.cpp +++ b/lib/Support/DebugCounter.cpp @@ -6,6 +6,7 @@ using namespace llvm; +namespace { // This class overrides the default list implementation of printing so we // can pretty print the list of debug counter options. This type of // dynamic option is pretty rare (basically this and pass lists). @@ -40,6 +41,7 @@ private: } } }; +} // namespace // Create our command line option. static DebugCounterList DebugCounterOption( diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 1541a5726302..9398789cea87 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -127,10 +127,15 @@ void DynamicLibrary::AddSymbol(StringRef SymbolName, void *SymbolValue) { DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *FileName, std::string *Err) { - SmartScopedLock Lock(*SymbolsMutex); + // Force OpenedHandles to be added into the ManagedStatic list before any + // ManagedStatic can be added from static constructors in HandleSet::DLOpen. 
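Two ways of dealing with -Wimplicit-fallthrough appear in this stretch of the patch: ConvertUTF.cpp wraps its legacy fall-through switches in push/ignore/pop pragmas, while the GraphWriter, Path, and YAMLParser hunks annotate each intentional fall-through with LLVM_FALLTHROUGH (which expands to [[fallthrough]] or a compiler-specific equivalent where available). A minimal sketch of the annotation style using the standard attribute:

#include <cassert>

// Classify a character, deliberately letting digits fall through into the
// alphanumeric case. The explicit annotation tells the compiler (and the
// reader) that the missing break is intentional, so -Wimplicit-fallthrough
// stays quiet.
static int classify(char C) {
  int Score = 0;
  switch (C) {
  case '0':
  case '1':
    Score += 10;     // digits get an extra bump...
    [[fallthrough]]; // ...and then share the alphanumeric handling
  case 'a':
  case 'b':
    Score += 1;
    break;
  default:
    break;
  }
  return Score;
}

int main() {
  assert(classify('1') == 11);
  assert(classify('a') == 1);
  assert(classify('?') == 0);
  return 0;
}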
+ HandleSet& HS = *OpenedHandles; + void *Handle = HandleSet::DLOpen(FileName, Err); - if (Handle != &Invalid) - OpenedHandles->AddLibrary(Handle, /*IsProcess*/ FileName == nullptr); + if (Handle != &Invalid) { + SmartScopedLock Lock(*SymbolsMutex); + HS.AddLibrary(Handle, /*IsProcess*/ FileName == nullptr); + } return DynamicLibrary(Handle); } diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index d0e1d50e8ccb..f70b77da8de4 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -43,6 +43,7 @@ std::string llvm::DOT::EscapeString(const std::string &Label) { Str.erase(Str.begin()+i); continue; default: break; } + LLVM_FALLTHROUGH; case '{': case '}': case '<': case '>': case '|': case '"': diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 6a0b64fb884d..234f7439a546 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -1401,6 +1401,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["prefetchwt1"] = HasLeaf7 && (ECX & 1); Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save; + Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save; // Enable protection keys Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1); diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 9fd6652ce4b8..80bef558258d 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -1156,6 +1156,7 @@ file_magic identify_magic(StringRef Magic) { case 0xc4: // ARMNT Windows if (Magic[1] == 0x01) return file_magic::coff_object; + LLVM_FALLTHROUGH; case 0x90: // PA-RISC Windows case 0x68: // mc68K Windows diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index b0e3d6898cae..318e21da999d 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -34,6 +34,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case mips64: return "mips64"; case mips64el: return "mips64el"; case msp430: return "msp430"; + case nios2: return "nios2"; case ppc64: return "powerpc64"; case ppc64le: return "powerpc64le"; case ppc: return "powerpc"; @@ -98,6 +99,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case mips64: case mips64el: return "mips"; + case nios2: return "nios2"; + case hexagon: return "hexagon"; case amdgcn: return "amdgcn"; @@ -262,6 +265,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("mips64", mips64) .Case("mips64el", mips64el) .Case("msp430", msp430) + .Case("nios2", nios2) .Case("ppc64", ppc64) .Case("ppc32", ppc) .Case("ppc", ppc) @@ -384,6 +388,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Cases("mipsel", "mipsallegrexel", Triple::mipsel) .Cases("mips64", "mips64eb", Triple::mips64) .Case("mips64el", Triple::mips64el) + .Case("nios2", Triple::nios2) .Case("r600", Triple::r600) .Case("amdgcn", Triple::amdgcn) .Case("riscv32", Triple::riscv32) @@ -625,6 +630,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::mips64el: case Triple::mipsel: case Triple::msp430: + case Triple::nios2: case Triple::nvptx: case Triple::nvptx64: case Triple::ppc64le: @@ -643,11 +649,13 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::tce: case Triple::tcele: case Triple::thumbeb: - case Triple::wasm32: - case Triple::wasm64: case Triple::xcore: return Triple::ELF; + case Triple::wasm32: + case Triple::wasm64: + return Triple::Wasm; + case Triple::ppc: case Triple::ppc64: if (T.isOSDarwin()) @@ -1160,6 +1168,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { 
case llvm::Triple::le32: case llvm::Triple::mips: case llvm::Triple::mipsel: + case llvm::Triple::nios2: case llvm::Triple::nvptx: case llvm::Triple::ppc: case llvm::Triple::r600: @@ -1243,6 +1252,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::le32: case Triple::mips: case Triple::mipsel: + case Triple::nios2: case Triple::nvptx: case Triple::ppc: case Triple::r600: @@ -1290,6 +1300,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::kalimba: case Triple::lanai: case Triple::msp430: + case Triple::nios2: case Triple::r600: case Triple::tce: case Triple::tcele: @@ -1361,6 +1372,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::le32: case Triple::le64: case Triple::msp430: + case Triple::nios2: case Triple::nvptx64: case Triple::nvptx: case Triple::r600: @@ -1447,6 +1459,7 @@ bool Triple::isLittleEndian() const { case Triple::mips64el: case Triple::mipsel: case Triple::msp430: + case Triple::nios2: case Triple::nvptx64: case Triple::nvptx: case Triple::ppc64le: diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index c17a6f6e1ea6..f1496393e55e 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -2116,6 +2116,7 @@ void MappingNode::increment() { break; default: setError("Unexpected token. Expected Key or Block End", T); + LLVM_FALLTHROUGH; case Token::TK_Error: IsAtEnd = true; CurrentEntry = nullptr; @@ -2128,6 +2129,7 @@ void MappingNode::increment() { return increment(); case Token::TK_FlowMappingEnd: getNext(); + LLVM_FALLTHROUGH; case Token::TK_Error: // Set this to end iterator. IsAtEnd = true; @@ -2170,6 +2172,7 @@ void SequenceNode::increment() { default: setError( "Unexpected token. Expected Block Entry or Block End." , T); + LLVM_FALLTHROUGH; case Token::TK_Error: IsAtEnd = true; CurrentEntry = nullptr; @@ -2198,6 +2201,7 @@ void SequenceNode::increment() { return increment(); case Token::TK_FlowSequenceEnd: getNext(); + LLVM_FALLTHROUGH; case Token::TK_Error: // Set this to end iterator. 
IsAtEnd = true; diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 33d3de5daf33..09f9759ce7da 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -219,7 +219,6 @@ ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef Range) { BitsInit *BitsInit::get(ArrayRef Range) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileBitsInit(ID, Range); @@ -234,7 +233,6 @@ BitsInit *BitsInit::get(ArrayRef Range) { std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -456,7 +454,6 @@ static void ProfileListInit(FoldingSetNodeID &ID, ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileListInit(ID, Range, EltTy); @@ -471,7 +468,6 @@ ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -606,7 +602,6 @@ ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, RecTy *Type) { UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileUnOpInit(ID, Opc, LHS, Type); @@ -617,7 +612,6 @@ UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) { UnOpInit *I = new(Allocator) UnOpInit(Opc, LHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -752,7 +746,6 @@ ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *RHS, BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, Init *RHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileBinOpInit(ID, Opc, LHS, RHS, Type); @@ -763,7 +756,6 @@ BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, BinOpInit *I = new(Allocator) BinOpInit(Opc, LHS, RHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -910,7 +902,6 @@ ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *MHS, TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type); @@ -921,7 +912,6 @@ TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS, TernOpInit *I = new(Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -1503,7 +1493,6 @@ DagInit * DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, ArrayRef NameRange) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileDagInit(ID, V, VN, ArgRange, NameRange); @@ -1512,9 +1501,13 @@ DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP)) return I; - DagInit *I = new(Allocator) DagInit(V, VN, ArgRange, NameRange); + void *Mem = Allocator.Allocate(totalSizeToAlloc(ArgRange.size(), NameRange.size()), alignof(BitsInit)); + DagInit *I = new(Mem) DagInit(V, VN, ArgRange.size(), NameRange.size()); + std::uninitialized_copy(ArgRange.begin(), ArgRange.end(), + I->getTrailingObjects()); + std::uninitialized_copy(NameRange.begin(), NameRange.end(), + I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -1533,7 +1526,7 @@ 
DagInit::get(Init *V, StringInit *VN, } void DagInit::Profile(FoldingSetNodeID &ID) const { - ProfileDagInit(ID, Val, ValName, Args, ArgNames); + ProfileDagInit(ID, Val, ValName, makeArrayRef(getTrailingObjects(), NumArgs), makeArrayRef(getTrailingObjects(), NumArgNames)); } Init *DagInit::convertInitializerTo(RecTy *Ty) const { @@ -1545,9 +1538,9 @@ Init *DagInit::convertInitializerTo(RecTy *Ty) const { Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const { SmallVector NewArgs; - NewArgs.reserve(Args.size()); + NewArgs.reserve(arg_size()); bool ArgsChanged = false; - for (const Init *Arg : Args) { + for (const Init *Arg : args()) { Init *NewArg = Arg->resolveReferences(R, RV); NewArgs.push_back(NewArg); ArgsChanged |= NewArg != Arg; @@ -1555,7 +1548,7 @@ Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *Op = Val->resolveReferences(R, RV); if (Op != Val || ArgsChanged) - return DagInit::get(Op, ValName, NewArgs, ArgNames); + return DagInit::get(Op, ValName, NewArgs, getArgNames()); return const_cast(this); } @@ -1564,12 +1557,12 @@ std::string DagInit::getAsString() const { std::string Result = "(" + Val->getAsString(); if (ValName) Result += ":" + ValName->getAsUnquotedString(); - if (!Args.empty()) { - Result += " " + Args[0]->getAsString(); - if (ArgNames[0]) Result += ":$" + ArgNames[0]->getAsUnquotedString(); - for (unsigned i = 1, e = Args.size(); i != e; ++i) { - Result += ", " + Args[i]->getAsString(); - if (ArgNames[i]) Result += ":$" + ArgNames[i]->getAsUnquotedString(); + if (!arg_empty()) { + Result += " " + getArg(0)->getAsString(); + if (getArgName(0)) Result += ":$" + getArgName(0)->getAsUnquotedString(); + for (unsigned i = 1, e = getNumArgs(); i != e; ++i) { + Result += ", " + getArg(i)->getAsString(); + if (getArgName(i)) Result += ":$" + getArgName(i)->getAsUnquotedString(); } } return Result + ")"; diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 056ffd58b521..981fd22c213c 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: return true; // Unknown modifier. + case 'a': // Print 'a' modifier + PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) @@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. 
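DagInit now stores its argument and name arrays inline after the object via TrailingObjects, so a single allocation covers the node plus both arrays and the old TheActualPool side vectors can go away. A simplified standalone model of the idiom; the real llvm::TrailingObjects additionally handles alignment and multiple trailing types:

#include <cassert>
#include <new>

// A node whose payload array lives immediately after the object itself.
struct InlineArrayNode {
  unsigned NumElems;

  explicit InlineArrayNode(unsigned N) : NumElems(N) {}

  int *elems() { return reinterpret_cast<int *>(this + 1); }

  // Allocate the header and the trailing array in one block, then
  // placement-new the header; mirrors totalSizeToAlloc + uninitialized_copy.
  static InlineArrayNode *create(const int *Data, unsigned N) {
    void *Mem = ::operator new(sizeof(InlineArrayNode) + N * sizeof(int));
    auto *Node = new (Mem) InlineArrayNode(N);
    for (unsigned I = 0; I != N; ++I)
      Node->elems()[I] = Data[I];
    return Node;
  }

  static void destroy(InlineArrayNode *Node) {
    Node->~InlineArrayNode();
    ::operator delete(Node);
  }
};

int main() {
  const int Vals[] = {3, 1, 4};
  InlineArrayNode *N = InlineArrayNode::create(Vals, 3);
  assert(N->NumElems == 3 && N->elems()[2] == 4);
  InlineArrayNode::destroy(N);
  return 0;
}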
const MachineOperand &MO = MI->getOperand(OpNum); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 629ad5c61b78..33fec74998d6 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -584,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - bool AArch64ExpandPseudo::expandCMP_SWAP( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -616,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MF->insert(++StoreBB->getIterator(), DoneBB); // .Lloadcmp: + // mov wStatus, 0 // ldaxr xDest, [xAddr] // cmp xDest, xDesired // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - + if (!StatusDead) + BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg) + .addImm(0).addImm(0); BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .addImm(ExtendImm); BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) .addImm(AArch64CC::NE) @@ -640,25 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( // .Lstore: // stlxr wStatus, xNew, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr); + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. 
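After rebuilding the compare-and-swap loop, the expansion below recomputes the new blocks' live-in sets from scratch and then runs the two loop blocks a second time so that values live around the back edge are picked up. A tiny standalone model of why that extra pass matters (generic dataflow on a toy CFG, not LLVM's computeLiveIns):

#include <cassert>
#include <set>
#include <string>
#include <vector>

using RegSet = std::set<std::string>;

struct ToyBlock {
  RegSet Uses, Defs;
  std::vector<const ToyBlock *> Succs;
  RegSet LiveIns;
};

// One liveness update for a single block:
//   LiveIn(B) = Uses(B) plus (union of successors' LiveIns minus Defs(B)).
static void computeLiveInsOnce(ToyBlock &B) {
  RegSet LiveOut;
  for (const ToyBlock *S : B.Succs)
    LiveOut.insert(S->LiveIns.begin(), S->LiveIns.end());
  RegSet Result = B.Uses;
  for (const std::string &R : LiveOut)
    if (!B.Defs.count(R))
      Result.insert(R);
  B.LiveIns = Result;
}

int main() {
  // CAS expansion shape: LoadCmp -> {Store, Done}, Store -> {LoadCmp, Done}.
  ToyBlock Done, Store, LoadCmp;
  Done.Uses = {"dest"};
  Store.Uses = {"addr", "new"};       Store.Defs = {"status"};
  LoadCmp.Uses = {"addr", "desired"}; LoadCmp.Defs = {"dest"};
  Store.Succs = {&LoadCmp, &Done};
  LoadCmp.Succs = {&Store, &Done};

  // Bottom-up pass: Done, Store, LoadCmp.
  computeLiveInsOnce(Done);
  computeLiveInsOnce(Store);
  computeLiveInsOnce(LoadCmp);
  // "desired" is live around the back edge, but Store was processed before
  // LoadCmp had any live-ins, so it is missing until the extra pass.
  assert(!Store.LiveIns.count("desired"));
  computeLiveInsOnce(Store);
  assert(Store.LiveIns.count("desired"));
  return 0;
}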
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -671,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); unsigned StatusReg = MI.getOperand(2).getReg(); - MachineOperand &Addr = MI.getOperand(3); - MachineOperand &DesiredLo = MI.getOperand(4); - MachineOperand &DesiredHi = MI.getOperand(5); - MachineOperand &NewLo = MI.getOperand(6); - MachineOperand &NewHi = MI.getOperand(7); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(2).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(3).getReg(); + unsigned DesiredLoReg = MI.getOperand(4).getReg(); + unsigned DesiredHiReg = MI.getOperand(5).getReg(); + unsigned NewLoReg = MI.getOperand(6).getReg(); + unsigned NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -696,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // cmp xDestLo, xDesiredLo // sbcs xDestHi, xDesiredHi // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(DestLo.getReg()); - LoadCmpBB->addLiveIn(DestHi.getReg()); - LoadCmpBB->addLiveIn(DesiredLo.getReg()); - LoadCmpBB->addLiveIn(DesiredHi.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) .addReg(DestLo.getReg(), RegState::Define) .addReg(DestHi.getReg(), RegState::Define) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) - .add(DesiredLo) + .addReg(DesiredLoReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(AArch64::WZR) @@ -717,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) - .add(DesiredHi) + .addReg(DesiredHiReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(StatusReg, RegState::Kill) .addUse(StatusReg, RegState::Kill) .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW)) - .addUse(StatusReg, RegState::Kill) + .addUse(StatusReg, getKillRegState(StatusDead)) .addMBB(DoneBB); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); @@ -732,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // .Lstore: // stlxp wStatus, xNewLo, xNewHi, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(NewLo.getReg()); - StoreBB->addLiveIn(NewHi.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) - .add(NewLo) - .add(NewHi) - .add(Addr); + 
.addReg(NewLoReg) + .addReg(NewHiReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute liveness bottom up. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass in the loop to get the loop carried dependencies right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 1aec602a2a36..0b92249580c8 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -267,12 +267,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { return AArch64::X9; const AArch64Subtarget &Subtarget = MF->getSubtarget(); - const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); LiveRegs.addLiveIns(*MBB); // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); @@ -991,6 +991,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( SmallVector RegPairs; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { @@ -1022,9 +1023,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( dbgs() << ")\n"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - MBB.addLiveIn(Reg1); + if (!MRI.isReserved(Reg1)) + MBB.addLiveIn(Reg1); if (RPI.isPaired()) { - MBB.addLiveIn(Reg2); + if (!MRI.isReserved(Reg2)) + MBB.addLiveIn(Reg2); MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1af36086ad90..62f4c953830b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -886,18 +886,21 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, // Create the new constant immediate node. EVT VT = Op.getValueType(); SDLoc DL(Op); + SDValue New; // If the new constant immediate is all-zeros or all-ones, let the target // independent DAG combine optimize this node. - if (NewImm == 0 || NewImm == OrigMask) - return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); - + if (NewImm == 0 || NewImm == OrigMask) { + New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), + TLO.DAG.getConstant(NewImm, DL, VT)); // Otherwise, create a machine node so that target independent DAG combine // doesn't undo this optimization. 
- Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); - SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); - SDValue New( - TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } else { + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + New = SDValue( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } return TLO.CombineTo(Op, New); } @@ -9219,16 +9222,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, // instructions (stp). SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); + uint64_t BaseOffset = 0; + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); + // As this in ISel, we will not merge this add which may degrade results. + if (BasePtr->getOpcode() == ISD::ADD && + isa(BasePtr->getOperand(1))) { + BaseOffset = cast(BasePtr->getOperand(1))->getSExtValue(); + BasePtr = BasePtr->getOperand(0); + } + unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); + SDValue OffsetPtr = + DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index c42738da7ab0..faf39be9b41e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -763,15 +763,126 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } -bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const { - if (MI.getNumOperands() < 4) +bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: return false; - unsigned ShOpVal = MI.getOperand(3).getImm(); - unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal); - if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL && - ShImm < 4) - return true; - return false; + + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + if (ShiftVal == 0) + return true; + return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; + } + + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) <= 4; + } + } + + case AArch64::SUBWrs: + case AArch64::SUBSWrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); + } + + case AArch64::SUBXrs: + case AArch64::SUBSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = 
AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); + } + + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) == 0; + } + } + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRQroW: + case AArch64::STRQroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: { + unsigned IsSigned = MI.getOperand(3).getImm(); + return !IsSigned; + } + } } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 4cd14db633b9..59f3405fe439 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -270,7 +270,7 @@ public: bool IsTailCall) const override; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. - bool isFalkorLSLFast(const MachineInstr &MI) const; + bool isFalkorShiftExtFast(const MachineInstr &MI) const; private: /// \brief Sets the offsets on outlined instructions in \p MBB which use SP diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index da68f3165c5e..ad24612239fa 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -442,7 +442,7 @@ def MSRpstateImm4 : MSRpstateImm0_15; // TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects. let hasSideEffects = 0 in def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), - [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>; + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; // The cycle counter PMC register is PMCCNTR_EL0. 
let Predicates = [HasPerfMon] in diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index a6926a6700e1..3b71d529db59 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -232,6 +232,19 @@ static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); + if (&SecondSU != &DAG->ExitSU) + // Make instructions dependent on FirstSU also dependent on SecondSU to + // prevent them from being scheduled between FirstSU and and SecondSU. + for (SUnit::const_succ_iterator + SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); + SI != SE; ++SI) { + if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) + continue; + DEBUG(dbgs() << " Copy Succ "; + SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); + DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); + } + ++NumFused; return true; } diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index cf1c0b66db58..44fd94fc3d48 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -61,56 +61,42 @@ let SchedModel = FalkorModel in { let SchedModel = FalkorModel in { -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes - { let Latency = 1; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes { let Latency = 1; } -def : WriteRes - { let Latency = 8; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 16; let NumMicroOps = 2; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 5; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 3; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 5; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 3; let NumMicroOps = 2; } -def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes - { let Latency = 6; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 3; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } +// These WriteRes entries are not used in the Falkor sched model. 
+def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -def : WriteRes { let Latency = 3; } - -def : WriteRes { let Unsupported = 1; } - -// No forwarding logic is modelled yet. +// These ReadAdvance entries are not used in the Falkor sched model. def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index a9b4d44a523e..d098cf7a5a37 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -12,7 +12,509 @@ // //===----------------------------------------------------------------------===// -include "AArch64SchedFalkorWriteRes.td" +// Contains all of the Falkor specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: FalkorWr +// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) +// Latency: #cyc +// +// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued +// down one Z pipe, six SD pipes, four VX pipes and the total latency is +// six cycles. +// +// Contains all of the Falkor specific ReadAdvance types for forwarding logic. +// +// Contains all of the Falkor specific WriteVariant types for immediate zero +// and LSLFast. 
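The block comment above packs both the issue cost and the latency into each def name. As a quick, purely illustrative sketch (this helper is not part of the patch or of LLVM; the name is the example quoted in the comment), the convention decodes like this:

  // Decode a FalkorWr_* name per the convention described above: each
  // "<N><pipe>" field adds N micro-ops, the trailing "<N>cyc" field is the
  // latency.
  #include <cstdio>
  #include <regex>
  #include <string>

  int main() {
    const std::string Name = "FalkorWr_1Z_6SD_4VX_6cyc"; // example from the comment
    const std::regex Field("([0-9]+)(cyc|[A-Z]+)");
    unsigned MicroOps = 0, Latency = 0;
    for (std::sregex_iterator I(Name.begin(), Name.end(), Field), E; I != E; ++I) {
      unsigned N = std::stoi((*I)[1]);
      if ((*I)[2] == "cyc")
        Latency = N;   // trailing latency field
      else
        MicroOps += N; // pipe designator field contributes micro-ops
    }
    // Prints: FalkorWr_1Z_6SD_4VX_6cyc -> 11 micro-ops, latency 6
    std::printf("%s -> %u micro-ops, latency %u\n", Name.c_str(), MicroOps, Latency);
  }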
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define 0 micro-op types +def FalkorWr_none_1cyc : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def FalkorWr_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def FalkorWr_none_4cyc : SchedWriteRes<[]> { + let Latency = 4; + let NumMicroOps = 0; +} + +//===----------------------------------------------------------------------===// +// Define 1 micro-op types + +def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } +def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } +def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } +def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } +def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } +def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } +def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } +def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } + +def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } +def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } +def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } +def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } + +def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } +def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } +def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } + +def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } +def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } +def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Define 2 micro-op types + +def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 1; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_5cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} +def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2, 8]; +} + +def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [2, 16]; +} + +def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define 3 micro-op types + +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, 
FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +//===----------------------------------------------------------------------===// +// Define 4 micro-op types + +def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define 5 micro-op types + +def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 7; + let NumMicroOps = 5; +} +def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 5; +} +def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let 
Latency = 0; + let NumMicroOps = 5; +} +//===----------------------------------------------------------------------===// +// Define 6 micro-op types + +def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define 8 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 8; +} + +def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define 9 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitLD, + FalkorUnitLD, FalkorUnitXYZ, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitXYZ, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +//===----------------------------------------------------------------------===// +// Define 10 micro-op types + +def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define 12 micro-op types + +def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 12; +} + +// Forwarding logic is modeled for multiply add/accumulate. 
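The SchedReadAdvance defs that follow encode that forwarding: a consumer reading a multiply result through its accumulator operand sees the value early by the stated number of cycles. As a rough, illustrative calculation (not part of the patch; it only restates the numbers defined here), the effective per-link latency of a dependent multiply-accumulate chain works out as:

  // Sketch: producer write latency minus the consumer operand's ReadAdvance.
  unsigned chainLatency(unsigned WriteLatency, unsigned ReadAdvance) {
    return WriteLatency > ReadAdvance ? WriteLatency - ReadAdvance : 0;
  }
  // chainLatency(5, 1) == 4  (FalkorWr_FMUL32_1VXVY_5cyc with FalkorReadFMA32)
  // chainLatency(6, 2) == 4  (FalkorWr_FMUL64_1VXVY_6cyc with FalkorReadFMA64)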
+// ----------------------------------------------------------------------------- +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; + +// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast +// ----------------------------------------------------------------------------- +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + MI->getOperand(1).getReg() == AArch64::XZR}]>; +def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; + +def FalkorWr_FMOV : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_MOVZ : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_ADDSUBsx : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_LDRro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_LDRSro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_PRFMro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRVro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRQro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; //===----------------------------------------------------------------------===// // Specialize the coarse model by associating instruction groups with the @@ -22,63 +524,76 @@ include "AArch64SchedFalkorWriteRes.td" // Miscellaneous // ----------------------------------------------------------------------------- -def : InstRW<[WriteI], (instrs COPY)>; +// FIXME: This could be better modeled by looking at the regclasses of the operands. 
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>; // SIMD Floating-point Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FMULX32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FMULX64)>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex 
"^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)v2f32$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instregex "^(FMUL|FMULX)v2i64_indexed$")>; -def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>; -def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>; +def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; - -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; // SIMD Integer Instructions // 
----------------------------------------------------------------------------- @@ -92,12 +607,14 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$" def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>; @@ -110,6 +627,8 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN) def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>; @@ -120,10 +639,14 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -154,7 +677,7 @@ def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], 
(instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; @@ -165,14 +688,18 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16 def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -186,99 +713,114 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)v[248].*$")>; // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD], (instrs LD2i64)>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex 
"^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)_POST$")>; -def : 
InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex 
"^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], + (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], + (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; -def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; // SIMD Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; def : 
InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; @@ -287,35 +829,42 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FRECPS64, FRSQRTS64)>; -def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], + (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -328,50 +877,95 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; -def : 
InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; -def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^STPQ(post|pre)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; -def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST3(i8|i16|i32|i64)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST4(i8|i16|i32|i64)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>; +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instrs ST3Threev2d_POST)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>; +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instrs ST4Fourv2d_POST)>; -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>; +def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>; def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>; def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>; def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>; def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>; @@ -388,89 +982,103 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDNPQi)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDPQi)>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDNP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)(pre|post)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>; -def : 
InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^F(N)?MULSrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^F(N)?MULDrr$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(S|D)rr$")>; +def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(S|D)r$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], + (instregex "^F(N)?M(ADD|SUB)Srrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], + (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; -def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; -// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>; +def : 
InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; - -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>; -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDNP(W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>; - +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instrs LDPSWi)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instregex "^LDPSW(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], + (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; - -def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; - // Miscellaneous Data-Processing Instructions // 
----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -480,17 +1088,22 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], + (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^M(ADD|SUB)Xrrr$")>; -def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; -def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; +def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- @@ -498,6 +1111,11 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W| def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>; +def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], + (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; +def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], + (instrs LOADgot)>; // Other Instructions // ----------------------------------------------------------------------------- @@ -507,13 +1125,12 @@ def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; @@ -523,20 +1140,16 @@ def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; def : 
InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; -def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td deleted file mode 100644 index 6526cc28e806..000000000000 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ /dev/null @@ -1,403 +0,0 @@ -//=- AArch64SchedFalkorWrRes.td - Falkor Write Res ---*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Contains all of the Falkor specific SchedWriteRes types. The approach -// below is to define a generic SchedWriteRes for every combination of -// latency and microOps. The naming conventions is to use a prefix, one field -// for latency, and one or more microOp count/type designators. -// Prefix: FalkorWr -// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) -// Latency: #cyc -// -// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued -// down one Z pipe, six SD pipes, four VX pipes and the total latency is -// six cycles. -// -// Contains all of the Falkor specific ReadAdvance types for forwarding logic. -// -// Contains all of the Falkor specific WriteVariant types for immediate zero -// and LSLFast. 
-//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Define 1 micro-op types - -def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } -def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } -def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } -def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } -def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } -def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } -def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } -def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } -def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } -def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } - -def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } -def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } -def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } -def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } - -def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } -def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } -def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } - -def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } -def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } -def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } - -//===----------------------------------------------------------------------===// -// Define 2 micro-op types - -def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 1; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} -def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} - 
-def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 10; - let NumMicroOps = 2; -} - -def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { - let Latency = 0; - let NumMicroOps = 2; -} - -def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 8; - let ResourceCycles = [2, 8]; -} - -def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 16; - let ResourceCycles = [2, 16]; -} - -def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { - let Latency = 0; - let NumMicroOps = 2; -} - -//===----------------------------------------------------------------------===// -// Define 3 micro-op types - -def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 0; - let NumMicroOps = 3; -} - -def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - 
FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 3; -} - -//===----------------------------------------------------------------------===// -// Define 4 micro-op types - -def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, - FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 4; -} - -def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 4; -} - -def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, - FalkorUnitSD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -//===----------------------------------------------------------------------===// -// Define 5 micro-op types - -def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 7; - let NumMicroOps = 5; -} - -//===----------------------------------------------------------------------===// -// Define 6 micro-op types - -def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 6; -} - -def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, - FalkorUnitVSD, FalkorUnitXYZ, - FalkorUnitST, FalkorUnitVSD]> { - let Latency = 0; - let NumMicroOps = 6; -} - -//===----------------------------------------------------------------------===// -// Define 8 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 8; -} - -//===----------------------------------------------------------------------===// -// Define 9 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitLD, - FalkorUnitLD, FalkorUnitXYZ, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 9; -} - -def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitXYZ, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 
4; - let NumMicroOps = 9; -} - -// Forwarding logic is modeled for multiply add/accumulate. -// ----------------------------------------------------------------------------- -def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; -def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; -def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; -def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; -def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; - -// SchedPredicates and WriteVariants for Immediate Zero and LSLFast -// ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; -def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; - -def FalkorWr_FMOV : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_MOVZ : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_LDR : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_ADD : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_PRFM : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_LDRS : SchedWriteVariant<[ - SchedVar, - SchedVar]>; diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index b369ee7e4ba2..d3cab1ad3397 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -90,7 +90,6 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; - VectorInsertExtractBaseCost = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 132f192f2a9a..cb3f72a524f5 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -10,10 +10,10 @@ // //===----------------------------------------------------------------------===// +#include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" @@ -277,7 +278,7 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget(); - if (ST.hasFuseLiterals()) { + if (ST.hasFuseAES() || ST.hasFuseLiterals()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). 
ScheduleDAGMI *DAG = createGenericSchedPostRA(C); @@ -295,6 +296,7 @@ public: bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; #endif bool addILPOpts() override; @@ -404,6 +406,12 @@ bool AArch64PassConfig::addRegBankSelect() { return false; } +void AArch64PassConfig::addPreGlobalInstructionSelect() { + // Workaround the deficiency of the fast register allocator. + if (TM->getOptLevel() == CodeGenOpt::None) + addPass(new Localizer()); +} + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index b279bd61e180..e7ebb37a9d62 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureSDWA, FeatureDPP, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -534,10 +534,12 @@ def AMDGPUAsmVariants { int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; + string SDWA9 = "SDWA9"; + int SDWA9_ID = 3; string DPP = "DPP"; - int DPP_ID = 3; + int DPP_ID = 4; string Disable = "Disable"; - int Disable_ID = 4; + int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA; } +def SDWA9AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA9_ID; + let Name = AMDGPUAsmVariants.SDWA9; +} + + def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; @@ -567,6 +575,7 @@ def AMDGPU : Target { let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, + SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; } @@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA">; + AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; + +def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5ec46a8294c0..723e8a7b54e2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } +bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) +{ + assert(Op.getOpcode() == ISD::OR); + + SDValue N0 = Op->getOperand(0); + SDValue N1 = Op->getOperand(1); + EVT VT = N0.getValueType(); + + if (VT.isInteger() && !VT.isVector()) { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N0, LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N1, RHSKnown); + + if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) + return true; + } + } + + return 
false; +} + AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT != MVT::i64) - return SDValue(); ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); if (!RHS) @@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { // shl (ext x) => zext (shl x), if shift does not overflow int + if (VT != MVT::i64) + break; KnownBits Known; SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); @@ -2628,7 +2651,22 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } + case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break; + case ISD::ADD: { // Fall through from above + // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) + if (ConstantSDNode *C2 = dyn_cast(LHS->getOperand(1))) { + SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), + SDValue(RHS, 0)); + SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, + SDLoc(C2), VT); + return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); + } + break; } + } + + if (VT != MVT::i64) + return SDValue(); // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) @@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DL); } - if ((OffsetVal + WidthVal) >= 32) { + if ((OffsetVal + WidthVal) >= 32 && + !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fb2f15022d25..0d066cdbdff4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -34,6 +34,9 @@ private: /// compare. SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; +public: + static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + protected: const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9de302994e68..57905be18813 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); + setAction({G_FCONSTANT, S32}, Legal); + setAction({G_GEP, P1}, Legal); setAction({G_GEP, P2}, Legal); setAction({G_GEP, 1, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 85184b363905..07f92918a43f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -97,6 +97,9 @@ private: Instruction *UseInst, int OpIdx0, int OpIdx1) const; + /// Check whether we have enough local memory for promotion. 
+ bool hasSufficientLocalMem(const Function &F); + public: static char ID; @@ -107,7 +110,7 @@ public: StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - void handleAlloca(AllocaInst &I); + bool handleAlloca(AllocaInst &I, bool SufficientLDS); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const AMDGPUSubtarget &ST = TM->getSubtarget(F); if (!ST.isPromoteAllocaEnabled()) return false; + AS = AMDGPU::getAMDGPUAS(*F.getParent()); - FunctionType *FTy = F.getFunctionType(); - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (Type *ParamTy : FTy->params()) { - PointerType *PtrTy = dyn_cast(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { - LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - return false; - } - } - - LocalMemLimit = ST.getLocalMemorySize(); - if (LocalMemLimit == 0) - return false; - - const DataLayout &DL = Mod->getDataLayout(); - - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; - for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) - continue; - - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast(U); - if (!Use) - continue; - - if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); - - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. - uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); - CurrentLocalMemUsage += AllocSize; - break; - } - } - } - - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, - F); - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - - - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - - // Program is possibly broken by using more local mem than available. 
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount) - return false; - - LocalMemLimit = MaxSizeWithWaveCount; - - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); - + bool SufficientLDS = hasSufficientLocalMem(F); + bool Changed = false; BasicBlock &EntryBB = *F.begin(); for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { AllocaInst *AI = dyn_cast(I); ++I; if (AI) - handleAlloca(*AI); + Changed |= handleAlloca(*AI, SufficientLDS); } - return true; + return Changed; } std::pair @@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } +bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { + + FunctionType *FTy = F.getFunctionType(); + const AMDGPUSubtarget &ST = TM->getSubtarget(F); + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + return false; + } + } + + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast(U); + if (!Use) + continue; + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; + } + } + } + + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); + + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. + + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint = ST.getWavesPerEU(F).second; + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + + // Program is possibly broken by using more local mem than available. 
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + return true; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. -void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { +bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) - return; + return false; IRBuilder<> Builder(&I); @@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, AS)) { - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - return; - } + if (tryPromoteAllocaToVector(&I, AS)) + return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); - return; + return false; } + // Not likely to have sufficient local memory for promotion. + if (!SufficientLDS) + return false; + const AMDGPUSubtarget &ST = TM->getSubtarget(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; @@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); - return; + return false; } CurrentLocalMemUsage = NewSize; @@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; + return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); @@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + return true; } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index e543cae07ada..660879426810 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -416,6 +416,10 @@ public: return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } + bool hasSDWA() const { + return HasSDWA; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -670,10 +674,6 @@ public: return HasInv2PiInlineImm; } - bool hasSDWA() const { - return HasSDWA; - } - bool hasDPP() const { return HasDPP; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b52ea2b3a2c6..f5541e08e1b7 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -881,6 +881,10 @@ public: return AMDGPU::isVI(getSTI()); } + bool isGFX9() const { + return AMDGPU::isGFX9(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -989,7 +993,6 @@ private: bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; - bool isSGPR(unsigned Reg); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1042,9 +1045,10 @@ public: OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType); + uint64_t BasicInstType, bool skipVcc = false); }; struct OptionalOperand { @@ -1966,7 +1970,8 @@ ArrayRef AMDGPUAsmParser::getMatchedVariants() const { } if (isForcedSDWA()) { - static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA, + AMDGPUAsmVariants::SDWA9}; return makeArrayRef(Variants); } @@ -1977,7 +1982,7 @@ ArrayRef AMDGPUAsmParser::getMatchedVariants() const { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP }; return makeArrayRef(Variants); @@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { return AMDGPU::NoRegister; } -bool AMDGPUAsmParser::isSGPR(unsigned Reg) { - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); - return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || - Reg == AMDGPU::SCC; -} - // NB: This code is correct only when used to check constant // bus limitations because GFX7 support no f16 inline constants. 
// Note that there are no cases when a GFX7 opcode violates @@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); } - return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg())); + return !MO.isReg() || + isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { @@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { if (Desc.TSFlags & (SIInstrFlags::VOPC | SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | - SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) { + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | + SIInstrFlags::SDWA)) { // Check special imm operands (used by madmk, etc) if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { @@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); } +void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); +} + void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI()); } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType) { + uint64_t BasicInstType, bool skipVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; + bool skippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || - BasicInstType == SIInstrFlags::VOP2)&& - Op.isReg() && - Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. - // Skip it. - continue; - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) + // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. + // Skip VCC only if we didn't skip it on previous iteration. 
+ if (BasicInstType == SIInstrFlags::VOP2 && + (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { + skippedVcc = true; + continue; + } else if (BasicInstType == SIInstrFlags::VOPC && + Inst.getNumOperands() == 0) { + skippedVcc = true; + continue; + } + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments @@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } + skippedVcc = false; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // V_NOP_sdwa_vi has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); @@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: + if (isVI()) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { auto it = Inst.begin(); std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } - } /// Force static initialization. 
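The SDWA9 support threaded through the assembler-parser changes above and the disassembler/MC-emitter changes below encodes a source operand as a small register index plus a flag distinguishing VGPR from SGPR sources (see getSDWA9SrcEncoding and decodeSDWA9Src further down). The standalone C++ sketch here only illustrates that round trip; the mask values (0xFF, 0x100) and the helper names are assumptions chosen for illustration, not values taken from this patch.

// Minimal sketch of the SDWA9 source-operand encode/decode round trip.
// The masks below are assumed (low 8 bits = register index, bit 8 = SGPR flag);
// the real values live in the AMDGPU SDWA9EncValues definitions.
#include <cassert>
#include <cstdint>

namespace sdwa9_sketch {
constexpr uint32_t SrcVgprMask = 0xFF;  // assumed: low bits select the register
constexpr uint32_t SrcSgprFlag = 0x100; // assumed: set when the source is an SGPR

// Encode a register index as an SDWA9 source operand, mirroring the shape of
// getSDWA9SrcEncoding: keep the low bits and OR in the SGPR flag if needed.
inline uint32_t encodeSrc(uint32_t RegIdx, bool IsSGPR) {
  uint32_t Enc = RegIdx & SrcVgprMask;
  return IsSGPR ? (Enc | SrcSgprFlag) : Enc;
}

// Decode back to (index, is-SGPR), mirroring decodeSDWA9Src, which checks the
// VGPR range first and otherwise treats the value as an SGPR-range encoding.
inline void decodeSrc(uint32_t Enc, uint32_t &RegIdx, bool &IsSGPR) {
  IsSGPR = (Enc & SrcSgprFlag) != 0;
  RegIdx = Enc & SrcVgprMask;
}
} // namespace sdwa9_sketch

int main() {
  uint32_t Idx = 0;
  bool IsSGPR = false;
  sdwa9_sketch::decodeSrc(sdwa9_sketch::encodeSrc(17, /*IsSGPR=*/true), Idx, IsSGPR);
  assert(Idx == 17 && IsSGPR); // round trip preserves both fields
  return 0;
}

Keeping the SGPR marker in a single flag bit is what lets the decoder below treat anything outside the VGPR range as an SGPR or special register, which is how the decodeSDWA9Src change is structured.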
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 137b5cca96ce..9b3cde7c4df6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } -#define DECODE_OPERAND2(RegClass, DecName) \ -static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ +static DecodeStatus StaticDecoderName(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ auto DAsm = static_cast(Decoder); \ - return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) +#define DECODE_OPERAND_REG(RegClass) \ +DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) -DECODE_OPERAND(VGPR_32) -DECODE_OPERAND(VS_32) -DECODE_OPERAND(VS_64) +DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VS_32) +DECODE_OPERAND_REG(VS_64) -DECODE_OPERAND(VReg_64) -DECODE_OPERAND(VReg_96) -DECODE_OPERAND(VReg_128) +DECODE_OPERAND_REG(VReg_64) +DECODE_OPERAND_REG(VReg_96) +DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0_XEXEC) -DECODE_OPERAND(SReg_64) -DECODE_OPERAND(SReg_64_XEXEC) -DECODE_OPERAND(SReg_128) -DECODE_OPERAND(SReg_256) -DECODE_OPERAND(SReg_512) +DECODE_OPERAND_REG(SReg_32) +DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_64) +DECODE_OPERAND_REG(SReg_64_XEXEC) +DECODE_OPERAND_REG(SReg_128) +DECODE_OPERAND_REG(SReg_256) +DECODE_OPERAND_REG(SReg_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, @@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +#define DECODE_SDWA9(DecName) \ +DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) + +DECODE_SDWA9(Src32) +DECODE_SDWA9(Src16) +DECODE_SDWA9(VopcDst) + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + if (Res) break; } // Reinitialize Bytes as DPP64 could have eaten too much @@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, + unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } + + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { + return decodeSDWA9Src(OPW16, Val); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { + 
return decodeSDWA9Src(OPW32, Val); +} + + +MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { + Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + if (Val > AMDGPU::EncValues::SGPR_MAX) { + return decodeSpecialReg64(Val); + } else { + return createSRegOperand(getSgprClassId(OPW64), Val); + } + } else { + return createRegOperand(AMDGPU::VCC); + } +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 620bae0a6d1a..0ff405a71e9b 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -104,6 +104,11 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + + MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWA9Src16(unsigned Val) const; + MCOperand decodeSDWA9Src32(unsigned Val) const; + MCOperand decodeSDWA9VopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3bb5c9bc22b7..8ead48067336 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -191,6 +191,7 @@ public: } }; +namespace { // just a stub to make base class happy class SchedStrategyStub : public MachineSchedStrategy { public: @@ -202,6 +203,7 @@ public: void releaseTopNode(SUnit *SU) override {} void releaseBottomNode(SUnit *SU) override {} }; +} // namespace GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index c6d0f2179950..d378df674be9 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -17,6 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "misched" +namespace { class GCNMinRegScheduler { struct Candidate : ilist_node { const SUnit *SU; @@ -71,6 +72,7 @@ public: std::vector schedule(ArrayRef TopRoots, const ScheduleDAG &DAG); }; +} // namespace void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { NumPreds.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 18374dca3f84..390a8286c76a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); } -SmallVector collectVirtualRegUses(const MachineInstr &MI, - const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { +static SmallVector +collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { SmallVector Res; for (const auto &MO : MI.operands()) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 3d3858ab47ec..a856b17a228f 100644 --- 
a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,6 +52,18 @@ public: return 0; } + virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; void verifyInstructionPredicates(const MCInst &MI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index bda0928036fd..e02acf516c0d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,6 +69,14 @@ public: unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned +SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; +} + +unsigned +SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + return RegEnc; +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 3590a9b05e1d..60b913cfd39a 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + // Local and Private addresses do not handle vectors. 
Limit to i32 + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + return (MemVT.getSizeInBits() <= 32); + } + return true; +} + bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 9700ce14c6f3..d6a0876a6ee7 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,6 +44,8 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index cc667d985a82..3c1e8527284c 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_Addr, R600_KC0, R600_KC1, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP + ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a01330cb9171..80967edee0ab 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,6 +118,10 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, + // Operand for GFX9 SDWA instructions + OPERAND_SDWA9_SRC, + OPERAND_SDWA9_VOPC_DST, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16 @@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants { DEFAULT = 0, VOP3 = 1, SDWA = 2, - DPP = 3 + SDWA9 = 3, + DPP = 4 }; } @@ -294,6 +299,18 @@ enum DstUnused { UNUSED_PRESERVE = 2, }; +enum SDWA9EncValues{ + SRC_SGPR_MASK = 0x100, + SRC_VGPR_MASK = 0xFF, + VOPC_DST_VCC_MASK = 0x80, + VOPC_DST_SGPR_MASK = 0x7F, + + SRC_VGPR_MIN = 0, + SRC_VGPR_MAX = 255, + SRC_SGPR_MIN = 256, + SRC_SGPR_MAX = 357, +}; + } // namespace SDWA } // namespace AMDGPU diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 01c1f78e7ca4..76c2644867aa 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } } +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + return (MemVT.getSizeInBits() <= 4 * 32); + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); + return (MemVT.getSizeInBits() <= MaxPrivateBits); + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + return (MemVT.getSizeInBits() <= 2 * 32); + } + return true; +} + bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, @@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, 
LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index e68837747491..8e2ec40b224c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -150,6 +150,8 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 38a16b525a75..36d29b8ecf06 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 7b052844f177..c5287c7f64ba 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand { let ParserMatchClass = VReg32OrOffClass; } +class SDWA9Src : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_SRC"; + let EncoderMethod = "getSDWA9SrcEncoding"; +} + +def SDWA9Src32 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src32"; +} + +def SDWA9Src16 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src16"; +} + +def SDWA9VopcDst : VOPDstOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_VOPC_DST"; + let EncoderMethod = "getSDWA9VopcDstEncoding"; + let DecoderMethod = "decodeSDWA9VopcDst"; +} + class NamedMatchClass : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -588,6 +609,16 @@ class IntInputMods : InputMods def Int32InputMods : IntInputMods; def Int64InputMods : IntInputMods; +def FPRegInputModsMatchClass : 
AsmOperandClass { + let Name = "RegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isRegKind"; +} + +def FPRegInputMods : InputMods { + let PrintMethod = "printOperandAndFPInputMods"; +} + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods { let PrintMethod = "printOperandAndFPInputMods"; } + +def IntRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isRegKind"; +} + +def IntRegInputMods : InputMods { + let PrintMethod = "printOperandAndIntInputMods"; +} + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -783,6 +825,14 @@ class getVALUDstForVT { VOPDstOperand)))); // else VT == i1 } +// Returns the register class to use for the destination of VOP[12C] +// instructions with GFX9 SDWA extension +class getSDWA9DstForVT { + RegisterOperand ret = !if(!eq(VT.Size, 1), + SDWA9VopcDst, // VOPC + VOPDstOperand); // VOP1/2 32-bit dst +} + // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT { @@ -823,6 +873,9 @@ class getVregSrcForVT { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } +class getSDWA9SrcForVT { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +} // Returns the register class to use for sources of VOP3 instructions for the // given VT. @@ -926,6 +979,15 @@ class getSrcModExt { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand specified input operand for SDWA 9 +class getSrcModSDWA9 { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
class getIns32 { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + // VOP1 + !if(!eq(HasSDWAOMod, 0), + // VOP1_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel)), + !if(!eq(NumSrcArgs, 2), + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA9 + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA9 + !if(!eq(HasSDWAOMod, 0), + // VOP2_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel))), + (ins)/* endif */))); +} + // Outs for DPP and SDWA -class getOutsExt { +class getOutsExt { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions - (outs DstRCDPP:$vdst)), + (outs DstRCExt:$vdst)), + (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs DstRCSDWA9:$sdst), + (outs DstRCSDWA9:$vdst)), (outs)); // V_NOP } @@ -1153,8 +1269,7 @@ class getAsmDPP { +class getAsmSDWA { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", // VOPC + "$vdst"), // VOP1/2 + ""); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; + string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); + string args = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + ", "#src0, + ", "#src0#", "#src1 + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + out_mods#" $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC + out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + + // Function that checks if instruction supports DPP and SDWA class getHasExt { @@ -1219,6 +1363,7 @@ class VOPProfile _ArgVT> { field RegisterOperand DstRC = getVALUDstForVT.ret; field RegisterOperand DstRCDPP = getVALUDstForVT.ret; field RegisterOperand DstRCSDWA = getVALUDstForVT.ret; + field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; field RegisterClass Src1RC32 = getVregSrcForVT.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; @@ -1228,6 +1373,8 @@ class VOPProfile _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT.ret; field RegisterClass Src0SDWA = getVregSrcForVT.ret; field RegisterClass Src1SDWA = getVregSrcForVT.ret; + field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT.ret; + field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT.ret; field Operand Src0Mod = getSrcMod.ret; field Operand Src1Mod = 
getSrcMod.ret; field Operand Src2Mod = getSrcMod.ret; @@ -1235,6 +1382,8 @@ class VOPProfile _ArgVT> { field Operand Src1ModDPP = getSrcModExt.ret; field Operand Src0ModSDWA = getSrcModExt.ret; field Operand Src1ModSDWA = getSrcModExt.ret; + field Operand Src0ModSDWA9 = getSrcModSDWA9.ret; + field Operand Src1ModSDWA9 = getSrcModSDWA9.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile _ArgVT> { field bit HasSrc2Mods = !if(HasModifiers, BitOr.ret, 0); field bit HasClamp = HasModifiers; - field bit HasSDWAClamp = HasSrc0; + field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd.ret, HasClamp>.ret; field bit IsPacked = isPackedType.ret; field bit HasOpSel = IsPacked; field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; + field bit HasSDWA9 = HasExt; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile _ArgVT> { field dag Outs64 = Outs; field dag OutsDPP = getOutsExt.ret; field dag OutsSDWA = getOutsExt.ret; + field dag OutsSDWA9 = getOutsSDWA9.ret; field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64 _ArgVT> { field dag InsSDWA = getInsSDWA.ret; + field dag InsSDWA9 = getInsSDWA9.ret; field string Asm32 = getAsm32.ret; field string Asm64 = getAsm64.ret; field string AsmVOP3P = getAsmVOP3P.ret; field string AsmDPP = getAsmDPP.ret; - field string AsmSDWA = getAsmSDWA.ret; + field string AsmSDWA = getAsmSDWA.ret; + field string AsmSDWA9 = getAsmSDWA9.ret; } class VOP_NO_EXT : VOPProfile { let HasExt = 0; + let HasSDWA9 = 0; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA9"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index f2d8b6f7b7a4..ec29a66c8bbb 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", + [(set i64:$sdst, (int_amdgcn_s_getpc))] +>; let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2abd4afad3b6..630f469eabf0 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } +bool isGFX9(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return 
SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 8e74aa2cc9a8..19888ad7556a 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) { bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); +bool isGFX9(const MCSubtargetInfo &STI); + +/// \brief Is Reg - scalar register +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 1febc6bf8ec2..95b5ef0a49db 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -30,6 +30,15 @@ class VOP1_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{31-25} = 0x3f; // encoding } +class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P>
{ + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + class VOP1_Pseudo pattern=[], bit VOP1Only = 0> : InstSI , VOP , @@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP1"; +} + class getVOP1Pat64 : LetDummies { list ret = !if(P.HasModifiers, @@ -103,6 +117,7 @@ multiclass VOP1Inst ; def _e64 : VOP3_Pseudo .ret>; def _sdwa : VOP1_SDWA_Pseudo ; + def _sdwa9 : VOP1_SDWA9_Pseudo ; } // Special profile for instructions which have clamp @@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; + let HasSDWA9 = 0; } // Special case because there are no true output operands. Hack vdst @@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); + let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; - let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; let HasExt = 0; + let HasSDWA9 = 0; let HasDst = 0; let EmitDst = 1; // force vdst emission } @@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; @@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { def : Pat< (f32 (f16_to_fp i16:$src)), @@ -523,6 +544,10 @@ multiclass VOP1_Real_vi op> { VOP_SDWA_Real (NAME#"_sdwa")>, VOP1_SDWAe (NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOP1_SDWA9Ae (NAME#"_sdwa9").Pfl>; + // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP1_DPP(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 4a11d9471f1d..657cacaa792c 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -48,6 +48,18 @@ class VOP2_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{31} = 0x0; // encoding } +class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P>
{ + bits<8> vdst; + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + class VOP2_Pseudo pattern=[], string suffix = "_e32"> : InstSI , VOP , @@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2"; +} + class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -121,10 +138,10 @@ multiclass VOP2Inst .ret>, Commutable_REV; - def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa9 : VOP2_SDWA9_Pseudo ; } -// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst multiclass VOP2bInst , Commutable_REV; - def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa : VOP2_SDWA_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } + + def _sdwa9 : VOP2_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } def _e64 : VOP3_Pseudo .ret>, @@ -203,13 +226,21 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; let HasExt = 1; + let HasSDWA9 = 0; } def VOP_MAC_F16 : VOP_MAC { @@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; let Asm64 = "$vdst, $sdst, $src0, $src1"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // implicit VCC use. 
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, - Src1Mod:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; + let HasSDWA9 = 1; } // Read in from vcc or arbitrary SGPR @@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End let SubtargetPredicate = SICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; @@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } } // End isCommutable = 1 -} // End SubtargetPredicate = isVI +} // End SubtargetPredicate = Has16BitInsts // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats { @@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) >; -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; @@ -513,7 +553,7 @@ def : Pat< (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] //===----------------------------------------------------------------------===// // SI @@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real op> { VOP2_SDWAe (NAME#"_sdwa").Pfl>; } +multiclass VOP2_SDWA9_Real op> { + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOP2_SDWA9Ae (NAME#"_sdwa9").Pfl>; +} + multiclass VOP2be_Real_e32e64_vi op> : - Base_VOP2be_Real_e32e64_vi, VOP2_SDWA_Real { + Base_VOP2be_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP(NAME#"_e32")>; } multiclass VOP2_Real_e32e64_vi op> : - Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real { + Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index c0b5069948fb..001fc960b228 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { let isCommutable = 1 in { @@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; } // End isCommutable = 1 +} // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = isVI in { def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile>; - } // End SubtargetPredicate = isVI -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { multiclass 
Ternary_i16_Pats { @@ -288,7 +289,7 @@ def : Pat< defm: Ternary_i16_Pats; defm: Ternary_i16_Pats; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index a3550a63677b..cd347b86d305 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -34,6 +34,17 @@ class VOPC_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{44-43} = SDWA.UNUSED_PRESERVE; } +class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P>
{ + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + + //===----------------------------------------------------------------------===// // VOPC classes //===----------------------------------------------------------------------===// @@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } +class VOPC_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOPC"; +} + // This class is used only with VOPC instructions. Use $sdst for out operand class VOPCInstAlias : InstAlias , PredicateControl { @@ -173,6 +189,13 @@ multiclass VOPC_Pseudos { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -520,7 +543,11 @@ class VOPC_Class_Profile sched, ValueType vt> : let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; + //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos { let SchedRW = p.Schedule; let isConvergent = DefExec; } + + def _sdwa9 : VOPC_SDWA9_Pseudo { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -920,6 +953,10 @@ multiclass VOPC_Real_vi op> { VOP_SDWA_Real (NAME#"_sdwa")>, VOPC_SDWAe (NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOPC_SDWA9e (NAME#"_sdwa9").Pfl>; + def : VOPCInstAlias (NAME#"_e64"), !cast(NAME#"_e32_vi")> { let AssemblerPredicate = isVI; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 69906c419db3..4da654f84f9d 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -293,11 +293,52 @@ class VOP_SDWAe : Enc64 { let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); - let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); - let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); +} + +// gfx9 SDWA basic encoding +class VOP_SDWA9e : Enc64 { + bits<9> src0; // {src0_sgpr{0}, src0{7-0}} + bits<3> src0_sel; + bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<3> src1_sel; + bits<2> src1_modifiers; + bits<1> src1_sgpr; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let 
Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); + let Inst{63} = 0; // src1_sgpr - should be specified in subclass +} + +// gfx9 SDWA-A +class VOP_SDWA9Ae : VOP_SDWA9e
<P>
{ + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + bits<2> omod; + + let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD); + let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); + let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); + let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0); +} + +// gfx9 SDWA-B +class VOP_SDWA9Be : VOP_SDWA9e
<P>
{ + bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}} + + let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0); + let Inst{47} = !if(P.EmitDst, sdst{7}, 0); } class VOP_SDWA_Pseudo pattern=[]> : @@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo pattern=[]> : VOPProfile Pfl = P; } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b.when SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + +class VOP_SDWA9_Pseudo pattern=[]> : + InstSI , + VOP , + SIMCInstr , + MnemonicAlias { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.AsmSDWA9; + + let Size = 8; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let SDWA = 1; + let Uses = [EXEC]; + + let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + + VOPProfile Pfl = P; +} + class VOP_SDWA_Real : InstSI , SIMCInstr { @@ -358,6 +443,33 @@ class VOP_SDWA_Real : let TSFlags = ps.TSFlags; } +class VOP_SDWA9_Real : + InstSI , + SIMCInstr { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + class VOP_DPPe : Enc64 { bits<2> src0_modifiers; bits<8> src0; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 46ac4d0ad933..31a2f499a9a7 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -34,6 +34,9 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { + if (T->isArrayTy()) + return true; + EVT VT = TLI.getValueType(DL, T, true); if (!VT.isSimple() || VT.isVector() || !(VT.isInteger() || VT.isFloatingPoint())) @@ -148,23 +151,47 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { }; } // End anonymous namespace. 
-void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, - SmallVectorImpl &SplitArgs, - const DataLayout &DL, - MachineRegisterInfo &MRI) const { +void ARMCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, + MachineFunction &MF, const SplitArgTy &PerformArgSplit) const { const ARMTargetLowering &TLI = *getTLI(); LLVMContext &Ctx = OrigArg.Ty->getContext(); + const DataLayout &DL = MF.getDataLayout(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function *F = MF.getFunction(); SmallVector SplitVTs; SmallVector Offsets; ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - assert(SplitVTs.size() == 1 && "Unsupported type"); + if (SplitVTs.size() == 1) { + // Even if there is no splitting to do, we still want to replace the + // original type (e.g. pointer type -> integer). + SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), + OrigArg.Flags, OrigArg.IsFixed); + return; + } - // Even if there is no splitting to do, we still want to replace the original - // type (e.g. pointer type -> integer). - SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + unsigned FirstRegIdx = SplitArgs.size(); + for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { + EVT SplitVT = SplitVTs[i]; + Type *SplitTy = SplitVT.getTypeForEVT(Ctx); + auto Flags = OrigArg.Flags; + bool NeedsConsecutiveRegisters = + TLI.functionArgumentNeedsConsecutiveRegisters( + SplitTy, F->getCallingConv(), F->isVarArg()); + if (NeedsConsecutiveRegisters) { + Flags.setInConsecutiveRegs(); + if (i == e - 1) + Flags.setInConsecutiveRegsLast(); + } + SplitArgs.push_back( + ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), + SplitTy, Flags, OrigArg.IsFixed}); + } + + for (unsigned i = 0; i < Offsets.size(); ++i) + PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } /// Lower the return value for the already existing \p Ret. This assumes that @@ -187,7 +214,9 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, SmallVector SplitVTs; ArgInfo RetInfo(VReg, Val->getType()); setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo()); + splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, VReg, Offset); + }); CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); @@ -307,6 +336,26 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return 1; } + /// Merge the values in \p SrcRegs into \p DstReg at offsets \p SrcOffsets. + /// Note that the source registers are not required to have homogeneous types, + /// so we use G_INSERT rather than G_MERGE_VALUES. + // FIXME: Use G_MERGE_VALUES if the types are homogeneous. + void mergeRegisters(unsigned DstReg, ArrayRef SrcRegs, + ArrayRef SrcOffsets) { + LLT Ty = MRI.getType(DstReg); + + unsigned Dst = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Dst); + + for (unsigned i = 0; i < SrcRegs.size(); ++i) { + unsigned Tmp = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInsert(Tmp, Dst, SrcRegs[i], SrcOffsets[i]); + Dst = Tmp; + } + + MIRBuilder.buildCopy(DstReg, Dst); + } + /// Marking a physical register as used is different between formal /// parameters, where it's a basic block live-in, and call returns, where it's /// an implicit-def of the call instruction. 
@@ -335,6 +384,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, return false; auto &MF = MIRBuilder.getMF(); + auto &MBB = MIRBuilder.getMBB(); auto DL = MF.getDataLayout(); auto &TLI = *getTLI(); @@ -350,17 +400,34 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCAssignFn *AssignFn = TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg()); + FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), + AssignFn); + SmallVector ArgInfos; + SmallVector SplitRegs; + SmallVector RegOffsets; unsigned Idx = 0; for (auto &Arg : F.args()) { ArgInfo AInfo(VRegs[Idx], Arg.getType()); setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F); - splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo()); + + SplitRegs.clear(); + RegOffsets.clear(); + + splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + SplitRegs.push_back(Reg); + RegOffsets.push_back(Offset); + }); + + if (!SplitRegs.empty()) + ArgHandler.mergeRegisters(VRegs[Idx], SplitRegs, RegOffsets); + Idx++; } - FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), - AssignFn); + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + return handleAssignments(MIRBuilder, ArgInfos, ArgHandler); } @@ -407,7 +474,9 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!Arg.IsFixed) return false; - splitToValueTypes(Arg, ArgInfos, DL, MRI); + splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, Arg.Reg, Offset); + }); } auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); @@ -423,12 +492,24 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, DL, MRI); + SmallVector RegOffsets; + SmallVector SplitRegs; + splitToValueTypes(OrigRet, ArgInfos, MF, + [&](unsigned Reg, uint64_t Offset) { + RegOffsets.push_back(Offset); + SplitRegs.push_back(Reg); + }); auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; + + if (!RegOffsets.empty()) { + // We have split the value and allocated each individual piece, now build + // it up again. + RetHandler.mergeRegisters(OrigRet.Reg, SplitRegs, RegOffsets); + } } // We now know the size of the stack - update the ADJCALLSTACKDOWN diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 6404c7a2689e..f5a6872336f6 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -42,11 +42,14 @@ private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg, MachineInstrBuilder &Ret) const; + typedef std::function SplitArgTy; + /// Split an argument into one or more arguments that the CC lowering can cope /// with (e.g. replace pointers with integers). 
void splitToValueTypes(const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI) const; + MachineFunction &MF, + const SplitArgTy &PerformArgSplit) const; }; } // End of namespace llvm #endif diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 78a9144bd321..90baabcdb652 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -779,7 +779,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineOperand &Desired = MI.getOperand(3); MachineOperand &New = MI.getOperand(4); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); @@ -903,7 +903,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index f8b584db7b99..62e774d869da 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -127,7 +127,7 @@ static cl::opt EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), - cl::init(true)); + cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), @@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, } } - // Lowering to i32/i16 if the size permits. - if (Size >= 4) - return MVT::i32; - else if (Size >= 2) - return MVT::i16; - // Let the target-independent logic figure it out. return MVT::Other; } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 875c06210ae6..26da528c19e6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -510,7 +510,7 @@ class InstrItineraryData; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; - bool canMergeStoresTo(EVT MemVT) const override { + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const override { // Do not merge to larger than i32. 
return (MemVT.getSizeInBits() <= 32); } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 51290e5a5b93..858136a82078 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -674,7 +674,7 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { class VLD1D op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), (ins AddrMode:$Rn), IIC_VLD1, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -682,7 +682,7 @@ class VLD1D op7_4, string Dt, Operand AddrMode> class VLD1Q op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), (ins AddrMode:$Rn), IIC_VLD1x2, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -703,7 +703,7 @@ multiclass VLD1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -711,7 +711,7 @@ multiclass VLD1DWB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -720,7 +720,7 @@ multiclass VLD1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -728,7 +728,7 @@ multiclass VLD1QWB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -747,7 +747,7 @@ defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VLD1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -756,7 +756,7 @@ multiclass VLD1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -764,7 +764,7 @@ multiclass VLD1D3WB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -780,15 +780,15 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VLD1d64TPseudo : VLDQQPseudo; -def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo; +def VLD1d64TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD3]>; // ...with 4 registers class VLD1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -797,7 +797,7 @@ multiclass VLD1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -805,7 +805,7 @@ multiclass VLD1D4WB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -821,9 +821,9 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VLD1d64QPseudo : VLDQQPseudo; -def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo; +def VLD1d64QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; // VLD2 : Vector Load (multiple 2-element structures) class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -837,22 +837,22 @@ class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2, - 
addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8Pseudo : VLDQQPseudo; -def VLD2q16Pseudo : VLDQQPseudo; -def VLD2q32Pseudo : VLDQQPseudo; +def VLD2q8Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD2q16Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD2q32Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; // ...with address register writeback: multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, @@ -875,45 +875,45 @@ multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, } defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo; -def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo; -def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo; +def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; // ...with double-spaced registers def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6:$Rn), IIC_VLD3, - "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []>, 
Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -923,9 +923,9 @@ def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo : VLDQQPseudo; -def VLD3d16Pseudo : VLDQQPseudo; -def VLD3d32Pseudo : VLDQQPseudo; +def VLD3d8Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD3d16Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD3d32Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; // ...with address register writeback: class VLD3DWB op11_8, bits<4> op7_4, string Dt> @@ -933,7 +933,7 @@ class VLD3DWB op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -942,9 +942,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo_UPD : VLDQQWBPseudo; -def VLD3d16Pseudo_UPD : VLDQQWBPseudo; -def VLD3d32Pseudo_UPD : VLDQQWBPseudo; +def VLD3d8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; // ...with double-spaced registers: def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; @@ -954,25 +954,26 @@ def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; -def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; // ...alternate versions to be allocated odd register numbers: -def VLD3q8oddPseudo : VLDQQQQPseudo; -def VLD3q16oddPseudo : VLDQQQQPseudo; -def VLD3q32oddPseudo : VLDQQQQPseudo; +def VLD3q8oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; -def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$Rn), IIC_VLD4, - "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []>, + Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -982,9 +983,9 @@ def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo : VLDQQPseudo; -def VLD4d16Pseudo : VLDQQPseudo; -def VLD4d32Pseudo : VLDQQPseudo; +def VLD4d8Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD4d16Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD4d32Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; // 
...with address register writeback: class VLD4DWB op11_8, bits<4> op7_4, string Dt> @@ -992,7 +993,7 @@ class VLD4DWB op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; } @@ -1001,9 +1002,9 @@ def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo_UPD : VLDQQWBPseudo; -def VLD4d16Pseudo_UPD : VLDQQWBPseudo; -def VLD4d32Pseudo_UPD : VLDQQWBPseudo; +def VLD4d8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; // ...with double-spaced registers: def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; @@ -1013,18 +1014,18 @@ def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; -def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo; +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; // ...alternate versions to be allocated odd register numbers: -def VLD4q8oddPseudo : VLDQQQQPseudo; -def VLD4q16oddPseudo : VLDQQQQPseudo; -def VLD4q32oddPseudo : VLDQQQQPseudo; +def VLD4q8oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; -def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1076,11 +1077,12 @@ class VLD1LN32 op11_8, bits<4> op7_4, string Dt, ValueType Ty, "$src = $Vd", [(set DPR:$Vd, (vector_insert (Ty DPR:$src), (i32 (LoadOp addrmode6oneL32:$Rn)), - imm:$lane))]> { + imm:$lane))]>, Sched<[WriteVLD1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD1LN"; } -class VLD1QLNPseudo : VLDQLNPseudo { +class VLD1QLNPseudo : VLDQLNPseudo, + Sched<[WriteVLD1]> { let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), (i32 (LoadOp addrmode6:$addr)), imm:$lane))]; @@ -1117,7 +1119,7 @@ class VLD1LNWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn$Rm", - "$src = $Vd, $Rn.addr = $wb", []> { + "$src = $Vd, $Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let DecoderMethod = "DecodeVLD1LN"; } @@ -1134,16 +1136,16 @@ def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { let Inst{4} = Rn{4}; } -def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo; -def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo; -def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo; +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; // VLD2LN : Vector Load (single 2-element structure to one lane) 
class VLD2LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2", []> { + "$src1 = $Vd, $src2 = $dst2", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2LN"; @@ -1159,9 +1161,9 @@ def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo : VLDQLNPseudo; -def VLD2LNd16Pseudo : VLDQLNPseudo; -def VLD2LNd32Pseudo : VLDQLNPseudo; +def VLD2LNd8Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; // ...with double-spaced registers: def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { @@ -1171,8 +1173,8 @@ def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo : VLDQQLNPseudo; -def VLD2LNq32Pseudo : VLDQQLNPseudo; +def VLD2LNq16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNq32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD1]>; // ...with address register writeback: class VLD2LNWB op11_8, bits<4> op7_4, string Dt> @@ -1195,9 +1197,9 @@ def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo; -def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo; -def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo; +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1206,8 +1208,8 @@ def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD1]>; // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN op11_8, bits<4> op7_4, string Dt> @@ -1215,7 +1217,7 @@ class VLD3LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD3LN"; } @@ -1230,9 +1232,9 @@ def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo : VLDQQLNPseudo; -def VLD3LNd16Pseudo : VLDQQLNPseudo; -def VLD3LNd32Pseudo : VLDQQLNPseudo; +def VLD3LNd8Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { @@ -1242,8 +1244,8 @@ def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo : VLDQQQQLNPseudo; -def VLD3LNq32Pseudo : VLDQQQQLNPseudo; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD3LNWB op11_8, bits<4> op7_4, string Dt> @@ -1254,7 +1256,7 @@ class VLD3LNWB op11_8, bits<4> op7_4, string Dt> IIC_VLD3lnu, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], 
$dst3[$lane]\\}, $Rn$Rm", "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", - []> { + []>, Sched<[WriteVLD2]> { let DecoderMethod = "DecodeVLD3LN"; } @@ -1268,9 +1270,9 @@ def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1279,8 +1281,8 @@ def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; -def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN op11_8, bits<4> op7_4, string Dt> @@ -1289,7 +1291,8 @@ class VLD4LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4LN"; @@ -1306,9 +1309,9 @@ def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNd8Pseudo : VLDQQLNPseudo; -def VLD4LNd16Pseudo : VLDQQLNPseudo; -def VLD4LNd32Pseudo : VLDQQLNPseudo; +def VLD4LNd8Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { @@ -1319,8 +1322,8 @@ def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo : VLDQQQQLNPseudo; -def VLD4LNq32Pseudo : VLDQQQQLNPseudo; +def VLD4LNq16Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD4LNWB op11_8, bits<4> op7_4, string Dt> @@ -1347,9 +1350,9 @@ def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo; -def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1359,8 +1362,8 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; -def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1371,7 +1374,8 @@ class VLD1DUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> 
{ + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1434,7 +1438,7 @@ multiclass VLD1QDUPWB op7_4, string Dt, Operand AddrMode> { (outs VecListDPairAllLanes:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1491,7 +1495,7 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; @@ -1500,7 +1504,7 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; } @@ -1524,7 +1528,8 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, class VLD3DUP op7_4, string Dt> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6dup:$Rn), IIC_VLD3dup, - "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; @@ -1534,9 +1539,9 @@ def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; -def VLD3DUPd8Pseudo : VLDQQPseudo; -def VLD3DUPd16Pseudo : VLDQQPseudo; -def VLD3DUPd32Pseudo : VLDQQPseudo; +def VLD3DUPd8Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">; @@ -1548,7 +1553,7 @@ class VLD3DUPWB op7_4, string Dt, Operand AddrMode> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu, "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; } @@ -1561,9 +1566,9 @@ def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>; def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>; def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>; -def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo; -def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo; -def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo; +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; // VLD4DUP : Vector Load (single 4-element structure to all lanes) class VLD4DUP op7_4, string Dt> @@ -1580,9 +1585,9 @@ def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo : VLDQQPseudo; -def VLD4DUPd16Pseudo : VLDQQPseudo; -def VLD4DUPd32Pseudo : 
VLDQQPseudo; +def VLD4DUPd8Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">; @@ -1595,7 +1600,7 @@ class VLD4DUPWB op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4DupInstruction"; } @@ -1608,9 +1613,9 @@ def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">; def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">; def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo; -def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo; -def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo; +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1657,14 +1662,14 @@ class VSTQQQQWBPseudo // VST1 : Vector Store (multiple single elements) class VST1D op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd), - IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } class VST1Q op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd), - IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1685,7 +1690,7 @@ multiclass VST1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1694,7 +1699,7 @@ multiclass VST1DWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1703,7 +1708,7 @@ multiclass VST1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1712,7 +1717,7 @@ multiclass VST1QWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1732,7 +1737,7 @@ defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VST1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins AddrMode:$Rn, VecListThreeD:$Vd), - IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1741,7 +1746,7 @@ multiclass VST1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1750,7 +1755,7 @@ multiclass VST1D3WB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1766,16 +1771,16 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VST1d64TPseudo : VSTQQPseudo; -def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo; -def VST1d64TPseudoWB_register : VSTQQWBPseudo; +def VST1d64TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST3]>; // ...with 4 registers class VST1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", - []> { + []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1784,7 +1789,7 @@ multiclass VST1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1793,7 +1798,7 @@ multiclass VST1D4WB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1809,9 +1814,9 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VST1d64QPseudo : VSTQQPseudo; -def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo; -def VST1d64QPseudoWB_register : VSTQQWBPseudo; +def VST1d64QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST4]>; // VST2 : Vector Store (multiple 2-element structures) class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -1824,22 +1829,22 @@ class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; -def VST2q8Pseudo : VSTQQPseudo; -def VST2q16Pseudo : VSTQQPseudo; -def VST2q32Pseudo : VSTQQPseudo; +def VST2q8Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST2q16Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST2q32Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; // ...with address register writeback: multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, @@ -1847,7 +1852,7 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1855,7 +1860,7 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1864,7 +1869,7 @@ multiclass VST2QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1873,7 +1878,7 @@ multiclass VST2QWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1890,12 +1895,12 @@ defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>; defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>; -def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q8PseudoWB_register : VSTQQWBregisterPseudo; -def VST2q16PseudoWB_register : VSTQQWBregisterPseudo; -def VST2q32PseudoWB_register : VSTQQWBregisterPseudo; +def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q8PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; +def VST2q16PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; +def VST2q32PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; // ...with double-spaced registers def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2, @@ -1915,7 +1920,7 @@ defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, class VST3D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, - "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -1925,9 +1930,9 @@ def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo : VSTQQPseudo; -def VST3d16Pseudo : VSTQQPseudo; -def VST3d32Pseudo : VSTQQPseudo; +def VST3d8Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST3d16Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST3d32Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; // ...with address register writeback: class VST3DWB op11_8, bits<4> op7_4, string Dt> @@ -1935,7 +1940,7 @@ class VST3DWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -1944,9 +1949,9 @@ def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo_UPD : VSTQQWBPseudo; -def VST3d16Pseudo_UPD : VSTQQWBPseudo; -def VST3d32Pseudo_UPD : VSTQQWBPseudo; +def VST3d8Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; // ...with double-spaced registers: def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; @@ -1956,25 +1961,25 @@ def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; -def VST3q8Pseudo_UPD : VSTQQQQWBPseudo; -def 
VST3q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST3q32Pseudo_UPD : VSTQQQQWBPseudo; +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; // ...alternate versions to be allocated odd register numbers: -def VST3q8oddPseudo : VSTQQQQPseudo; -def VST3q16oddPseudo : VSTQQQQPseudo; -def VST3q32oddPseudo : VSTQQQQPseudo; +def VST3q8oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST3q16oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST3q32oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; -def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; // VST4 : Vector Store (multiple 4-element structures) class VST4D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -1984,9 +1989,9 @@ def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo : VSTQQPseudo; -def VST4d16Pseudo : VSTQQPseudo; -def VST4d32Pseudo : VSTQQPseudo; +def VST4d8Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST4d16Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST4d32Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; // ...with address register writeback: class VST4DWB op11_8, bits<4> op7_4, string Dt> @@ -1994,7 +1999,7 @@ class VST4DWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; } @@ -2003,9 +2008,9 @@ def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo_UPD : VSTQQWBPseudo; -def VST4d16Pseudo_UPD : VSTQQWBPseudo; -def VST4d32Pseudo_UPD : VSTQQWBPseudo; +def VST4d8Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; // ...with double-spaced registers: def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; @@ -2015,18 +2020,18 @@ def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; -def VST4q8Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q32Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; // ...alternate versions to be allocated odd register numbers: -def VST4q8oddPseudo : VSTQQQQPseudo; -def VST4q16oddPseudo : VSTQQQQPseudo; -def VST4q32oddPseudo : VSTQQQQPseudo; +def VST4q8oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST4q16oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def 
VST4q32oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; -def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 @@ -2060,12 +2065,13 @@ class VST1LN op11_8, bits<4> op7_4, string Dt, ValueType Ty, : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane), IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", - [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> { + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]>, + Sched<[WriteVST1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST1LN"; } class VST1QLNPseudo - : VSTQLNPseudo { + : VSTQLNPseudo, Sched<[WriteVST1]> { let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr)]; } @@ -2104,11 +2110,12 @@ class VST1LNWB op11_8, bits<4> op7_4, string Dt, ValueType Ty, "\\{$Vd[$lane]\\}, $Rn$Rm", "$Rn.addr = $wb", [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), - AdrMode:$Rn, am6offset:$Rm))]> { + AdrMode:$Rn, am6offset:$Rm))]>, + Sched<[WriteVST1]> { let DecoderMethod = "DecodeVST1LN"; } class VST1QLNWBPseudo - : VSTQLNWBPseudo { + : VSTQLNWBPseudo, Sched<[WriteVST1]> { let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr, am6offset:$offset))]; } @@ -2139,7 +2146,7 @@ class VST2LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST2LN"; @@ -2155,9 +2162,9 @@ def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo : VSTQLNPseudo; -def VST2LNd16Pseudo : VSTQLNPseudo; -def VST2LNd32Pseudo : VSTQLNPseudo; +def VST2LNd8Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; +def VST2LNd16Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; +def VST2LNd32Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; // ...with double-spaced registers: def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { @@ -2169,8 +2176,8 @@ def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { let Inst{4} = Rn{4}; } -def VST2LNq16Pseudo : VSTQQLNPseudo; -def VST2LNq32Pseudo : VSTQQLNPseudo; +def VST2LNq16Pseudo : VSTQQLNPseudo, Sched<[WriteVST1]>; +def VST2LNq32Pseudo : VSTQQLNPseudo, Sched<[WriteVST1]>; // ...with address register writeback: class VST2LNWB op11_8, bits<4> op7_4, string Dt> @@ -2193,9 +2200,9 @@ def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo; -def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo; -def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo; +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2204,15 +2211,16 @@ def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo; -def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo; +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNq32Pseudo_UPD : 
VSTQQLNWBPseudo, Sched<[WriteVST1]>; // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, - "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []>, + Sched<[WriteVST2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST3LN"; } @@ -2227,9 +2235,9 @@ def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo : VSTQQLNPseudo; -def VST3LNd16Pseudo : VSTQQLNPseudo; -def VST3LNd32Pseudo : VSTQQLNPseudo; +def VST3LNd8Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST3LNd16Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST3LNd32Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { @@ -2263,9 +2271,9 @@ def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo; -def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo; -def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo; +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2274,8 +2282,8 @@ def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo; -def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo; +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>; // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN op11_8, bits<4> op7_4, string Dt> @@ -2283,7 +2291,7 @@ class VST4LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST4LN"; @@ -2300,9 +2308,9 @@ def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo : VSTQQLNPseudo; -def VST4LNd16Pseudo : VSTQQLNPseudo; -def VST4LNd32Pseudo : VSTQQLNPseudo; +def VST4LNd8Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNd16Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNd32Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { @@ -2313,8 +2321,8 @@ def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNq16Pseudo : VSTQQQQLNPseudo; -def VST4LNq32Pseudo : VSTQQQQLNPseudo; +def VST4LNq16Pseudo : VSTQQQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo, Sched<[WriteVST2]>; // ...with address register writeback: class VST4LNWB op11_8, bits<4> op7_4, string Dt> @@ -2339,9 +2347,9 @@ def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo; -def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo; -def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo; +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; def 
VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
  let Inst{7-6} = lane{1-0};
@@ -2351,8 +2359,8 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
   let Inst{5} = Rn{5};
 }

-def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo;
-def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo;
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>;

 } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1

diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 87eb4c2b9074..ec5b97cba8cd 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -131,6 +131,17 @@ def WriteFPDIV64 : SchedWrite;
 def WriteFPSQRT32 : SchedWrite;
 def WriteFPSQRT64 : SchedWrite;

+// Vector load and stores
+def WriteVLD1 : SchedWrite;
+def WriteVLD2 : SchedWrite;
+def WriteVLD3 : SchedWrite;
+def WriteVLD4 : SchedWrite;
+def WriteVST1 : SchedWrite;
+def WriteVST2 : SchedWrite;
+def WriteVST3 : SchedWrite;
+def WriteVST4 : SchedWrite;
+
+
 // Define TII for use in SchedVariant Predicates.
 def : PredicateProlog<[{
   const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 8fb8a2a3b6d2..4e72b13d94cb 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1981,6 +1981,15 @@ def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
 def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
 def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }

+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
 // Reserve A9UnitFP for 2 consecutive cycles.
 def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
   let Latency = 4;
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 537e5da9669f..782be9b60a7a 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -120,6 +120,12 @@ def : WriteRes {
 def : WriteRes { let Latency = 7; }
 def : WriteRes { let Latency = 17; }

+// Overriden via InstRW for this processor.
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
 def : ReadAdvance; // mul operand read in F1
 def : ReadAdvance; // fp-mac operand read in F1
@@ -712,20 +718,20 @@ def R52WriteSTM : SchedWriteVariant<[
 // Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
 // another instruction in slot-1, but only in the last issue.
-def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;}
-def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes { let Latency = 5;}
+def : WriteRes {
   let Latency = 6;
   let NumMicroOps = 3;
   let ResourceCycles = [2];
   let SingleIssue = 1;
 }
-def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes {
   let Latency = 7;
   let NumMicroOps = 5;
   let ResourceCycles = [3];
   let SingleIssue = 1;
 }
-def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes {
   let Latency = 8;
   let NumMicroOps = 7;
   let ResourceCycles = [4];
@@ -828,95 +834,6 @@ def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
 def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1],
     (instregex "VRSHL", "VRSHR", "VRSHRN", "VTB")>;
 def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
     (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;

-//---
// VLDx.
Vector Loads -//--- -// 1-element structure load -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>; - -// 2-element structure load -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, 
R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>; - -// 3-element structure load -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - -// 4-element structure load -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - //--- // VSTx. Vector Stores //--- diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td index dc041c6c6006..b838688c6f04 100644 --- a/lib/Target/ARM/ARMScheduleSwift.td +++ b/lib/Target/ARM/ARMScheduleSwift.td @@ -1070,6 +1070,16 @@ let SchedModel = SwiftModel in { def : ReadAdvance; def : ReadAdvance; + // Overriden via InstRW for this processor. + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + // Not specified. 
def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>; // Preload. diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 1979cbf50125..c4f23c66e4ea 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -85,9 +85,9 @@ namespace llvm { extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine X(getTheARMLETarget()); + RegisterTargetMachine A(getTheThumbLETarget()); RegisterTargetMachine Y(getTheARMBETarget()); - RegisterTargetMachine A(getTheThumbLETarget()); - RegisterTargetMachine B(getTheThumbBETarget()); + RegisterTargetMachine B(getTheThumbBETarget()); PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeGlobalISel(Registry); @@ -263,6 +263,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, else this->Options.EABIVersion = EABI::EABI5; } + + initAsmInfo(); + if (!Subtarget.isThumb() && !Subtarget.hasARMOps()) + report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " + "support ARM mode execution!"); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; @@ -355,22 +360,6 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { }); } -void ARMTargetMachine::anchor() {} - -ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { - initAsmInfo(); - if (!Subtarget.hasARMOps()) - report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " - "support ARM mode execution!"); -} - -void ARMLETargetMachine::anchor() {} ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -378,9 +367,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ARMBETargetMachine::anchor() {} + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -388,39 +375,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} - -void ThumbTargetMachine::anchor() {} - -ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { - initAsmInfo(); -} - -void ThumbLETargetMachine::anchor() {} - -ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ThumbBETargetMachine::anchor() {} - -ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, 
RM, CM, OL, false) {} namespace { diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f0ca9427d9fb..e5eb27114c72 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -62,23 +62,9 @@ public: } }; -/// ARM target machine. +/// ARM/Thumb little endian target machine. /// -class ARMTargetMachine : public ARMBaseTargetMachine { - virtual void anchor(); - -public: - ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle); -}; - -/// ARM little endian target machine. -/// -class ARMLETargetMachine : public ARMTargetMachine { - void anchor() override; - +class ARMLETargetMachine : public ARMBaseTargetMachine { public: ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -86,11 +72,9 @@ public: CodeGenOpt::Level OL); }; -/// ARM big endian target machine. +/// ARM/Thumb big endian target machine. /// -class ARMBETargetMachine : public ARMTargetMachine { - void anchor() override; - +class ARMBETargetMachine : public ARMBaseTargetMachine { public: ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -98,44 +82,6 @@ public: CodeGenOpt::Level OL); }; -/// Thumb target machine. -/// Due to the way architectures are handled, this represents both -/// Thumb-1 and Thumb-2. -/// -class ThumbTargetMachine : public ARMBaseTargetMachine { - virtual void anchor(); - -public: - ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle); -}; - -/// Thumb little endian target machine. -/// -class ThumbLETargetMachine : public ThumbTargetMachine { - void anchor() override; - -public: - ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -/// Thumb big endian target machine. 
-/// -class ThumbBETargetMachine : public ThumbTargetMachine { - void anchor() override; - -public: - ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 94f9e8dfebbf..edbf2b99126c 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -30,8 +30,8 @@ using namespace dwarf; void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { - const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM); - bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS; + const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM); + bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS; genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 1a17d4e33e4f..f917c35b9ceb 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -535,14 +535,14 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // Look for a temporary register to use. // First, compute the liveness information. - LivePhysRegs UsedRegs(STI.getRegisterInfo()); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + LivePhysRegs UsedRegs(TRI); UsedRegs.addLiveOuts(MBB); // The semantic of pristines changed recently and now, // the callee-saved registers that are touched in the function // are not part of the pristines set anymore. // Add those callee-saved now. - const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) UsedRegs.addReg(CSRegs[i]); @@ -561,12 +561,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // And some temporary register, just in case. unsigned TemporaryReg = 0; BitVector PopFriendly = - TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID)); + TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID)); assert(PopFriendly.any() && "No allocatable pop-friendly register?!"); // Rebuild the GPRs from the high registers because they are removed // from the GPR reg class for thumb1. BitVector GPRsNoLRSP = - TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID)); + TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::hGPRRegClassID)); GPRsNoLRSP |= PopFriendly; GPRsNoLRSP.reset(ARM::LR); GPRsNoLRSP.reset(ARM::SP); diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 06ad2b3ffdf8..f10ca394f36c 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -902,7 +902,6 @@ let Defs = [SREG] in // CPI Rd, K // Compares a register with an 8 bit immediate.
- let Uses = [SREG] in def CPIRdK : FRdK<0b0011, (outs), (ins GPR8:$rd, imm_ldi8:$k), diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index 6897161c903c..cc7a7c3849bc 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -132,6 +132,10 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128; } +bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + return false; +} + SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { case ISD::BR_CC: @@ -496,8 +500,11 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + auto N = cast<GlobalAddressSDNode>(Op); + assert(N->getOffset() == 0 && "Invalid offset for global address"); + SDLoc DL(Op); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + const GlobalValue *GV = N->getGlobal(); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64); return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA); diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h index 3d1726be286e..0b8a8ca20c3b 100644 --- a/lib/Target/BPF/BPFISelLowering.h +++ b/lib/Target/BPF/BPFISelLowering.h @@ -42,6 +42,10 @@ public: // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + // This method decides whether folding a constant offset + // with the given GlobalAddress is legal. + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index a04aca4afa0f..25018b9ed510 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1657,7 +1657,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, // defined. From the point of view of the liveness tracking, it is ok to // store it as a whole, but if we break it up we may end up storing a // register that is entirely undefined.
- LivePhysRegs LPR(&HRI); + LivePhysRegs LPR(HRI); LPR.addLiveIns(B); SmallVector,2> Clobbers; for (auto R = B.begin(); R != It; ++R) { diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 03794511414e..66e07c67958e 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1254,7 +1254,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(2); const MachineOperand &Op3 = MI.getOperand(3); - LivePhysRegs LiveAtMI(&HRI); + LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); if (Op0.getReg() != Op2.getReg()) { @@ -1283,7 +1283,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineOperand &Op1 = MI.getOperand(1); MachineOperand &Op2 = MI.getOperand(2); MachineOperand &Op3 = MI.getOperand(3); - LivePhysRegs LiveAtMI(&HRI); + LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 0f99dfe342b8..93fb688fc1c0 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -412,6 +412,15 @@ def PS_vstorerwu_ai: STrivv_template, def PS_vstorerwu_ai_128B: STrivv_template, Requires<[HasV60T,UseHVXDbl]>; +let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in { + def PS_vstorerq_ai: Pseudo<(outs), + (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs:$Qt), "", []>, + Requires<[HasV60T,UseHVXSgl]>; + def PS_vstorerq_ai_128B: Pseudo<(outs), + (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs128B:$Qt), "", []>, + Requires<[HasV60T,UseHVXDbl]>; +} + // Vector load pseudos let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in @@ -429,30 +438,16 @@ def PS_vloadrwu_ai: LDrivv_template, def PS_vloadrwu_ai_128B: LDrivv_template, Requires<[HasV60T,UseHVXDbl]>; -// Store vector predicate pseudo. 
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, - isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { - def PS_vstorerq_ai : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXSgl]>; - - def PS_vstorerq_ai_128B : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VectorRegs:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXSgl]>; - - def PS_vloadrq_ai : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXDbl]>; - - def PS_vloadrq_ai_128B : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXDbl]>; +let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in { + def PS_vloadrq_ai: Pseudo<(outs VecPredRegs:$Qd), + (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>, + Requires<[HasV60T,UseHVXSgl]>; + def PS_vloadrq_ai_128B: Pseudo<(outs VecPredRegs128B:$Qd), + (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>, + Requires<[HasV60T,UseHVXDbl]>; } + let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in class VSELInst : InstHexagon; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 2a1bb63af789..1fc157900ed5 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -50,11 +50,6 @@ bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const { R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1; } -bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const { - return Hexagon::R16 <= Reg && Reg <= Hexagon::R27; -} - - const MCPhysReg * HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF, const TargetRegisterClass *RC) const { diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 8a3f175b8488..5f65fad2cc04 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -77,7 +77,6 @@ public: unsigned getFirstCallerSavedNonParamReg() const; bool isEHReturnCalleeSaveReg(unsigned Reg) const; - bool isCalleeSaveReg(unsigned Reg) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index c21b6e2515d3..cd474921d4bc 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -214,12 +214,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) { for (auto &MB : MF) { auto Begin = MB.begin(), End = MB.end(); while (Begin != End) { - // First the first non-boundary starting from the end of the last + // Find the first non-boundary starting from the end of the last // scheduling region. MachineBasicBlock::iterator RB = Begin; while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF)) ++RB; - // First the first boundary starting from the beginning of the new + // Find the first boundary starting from the beginning of the new // region. 
MachineBasicBlock::iterator RE = RB; while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF)) diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8be2a898e380..34b966df7761 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -29,6 +29,7 @@ subdirectories = MSP430 NVPTX Mips + Nios2 PowerPC RISCV Sparc diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td index dfea669f3ba1..203864dd4065 100644 --- a/lib/Target/MSP430/MSP430.td +++ b/lib/Target/MSP430/MSP430.td @@ -22,6 +22,18 @@ def FeatureX : SubtargetFeature<"ext", "ExtendedInsts", "true", "Enable MSP430-X extensions">; +def FeatureHWMult16 + : SubtargetFeature<"hwmult16", "HWMultMode", "HWMult16", + "Enable 16-bit hardware multiplier">; + +def FeatureHWMult32 + : SubtargetFeature<"hwmult32", "HWMultMode", "HWMult32", + "Enable 32-bit hardware multiplier">; + +def FeatureHWMultF5 + : SubtargetFeature<"hwmultf5", "HWMultMode", "HWMultF5", + "Enable F5 series hardware multiplier">; + //===----------------------------------------------------------------------===// // MSP430 supported processors. //===----------------------------------------------------------------------===// @@ -29,6 +41,8 @@ class Proc Features> : Processor; def : Proc<"generic", []>; +def : Proc<"msp430", []>; +def : Proc<"msp430x", [FeatureX]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index cd58eda5d924..0b02f79f472a 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -403,12 +403,12 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) { int FI = cast(Node)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16); if (Node->hasOneUse()) { - CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI, + CurDAG->SelectNodeTo(Node, MSP430::ADDframe, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16)); return; } ReplaceNode(Node, CurDAG->getMachineNode( - MSP430::ADD16ri, dl, MVT::i16, TFI, + MSP430::ADDframe, dl, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16))); return; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index cc6e64043f54..dae14fd301ee 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -38,27 +38,6 @@ using namespace llvm; #define DEBUG_TYPE "msp430-lower" -typedef enum { - NoHWMult, - HWMult16, - HWMult32, - HWMultF5 -} HWMultUseMode; - -static cl::opt -HWMultMode("mhwmult", cl::Hidden, - cl::desc("Hardware multiplier use mode"), - cl::init(NoHWMult), - cl::values( - clEnumValN(NoHWMult, "none", - "Do not use hardware multiplier"), - clEnumValN(HWMult16, "16bit", - "Use 16-bit hardware multiplier"), - clEnumValN(HWMult32, "32bit", - "Use 32-bit hardware multiplier"), - clEnumValN(HWMultF5, "f5series", - "Use F5 series hardware multiplier"))); - MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) : TargetLowering(TM) { @@ -262,7 +241,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setCmpLibcallCC(LC.Op, LC.Cond); } - if (HWMultMode == HWMult16) { + if (STI.hasHWMult16()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -277,7 +256,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } 
- } else if (HWMultMode == HWMult32) { + } else if (STI.hasHWMult32()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -292,7 +271,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } - } else if (HWMultMode == HWMultF5) { + } else if (STI.hasHWMultF5()) { const struct { const RTLIB::Libcall Op; const char * const Name; diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 1cd18611e52c..cec43040f60d 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,11 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), [(MSP430callseq_end timm:$amt1, timm:$amt2)]>; } +let Defs = [SR], Uses = [SP] in { +def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset), + "# ADDframe PSEUDO", []>; +} + let usesCustomInserter = 1 in { let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 9600bc28f100..7a3b7a8bd5ff 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -127,7 +127,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Fold imm into offset Offset += MI.getOperand(FIOperandNum + 1).getImm(); - if (MI.getOpcode() == MSP430::ADD16ri) { + if (MI.getOpcode() == MSP430::ADDframe) { // This is actually "load effective address" of the stack slot // instruction. We have only two-address instructions, thus we need to // expand it into mov + add diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 6216348e4d71..776a9dcb11d4 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -19,6 +19,20 @@ using namespace llvm; #define DEBUG_TYPE "msp430-subtarget" +static cl::opt +HWMultModeOption("mhwmult", cl::Hidden, + cl::desc("Hardware multiplier use mode for MSP430"), + cl::init(MSP430Subtarget::NoHWMult), + cl::values( + clEnumValN(MSP430Subtarget::NoHWMult, "none", + "Do not use hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "MSP430GenSubtargetInfo.inc" @@ -27,7 +41,18 @@ void MSP430Subtarget::anchor() { } MSP430Subtarget & MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - ParseSubtargetFeatures("generic", FS); + ExtendedInsts = false; + HWMultMode = NoHWMult; + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "msp430"; + + ParseSubtargetFeatures(CPUName, FS); + + if (HWMultModeOption != NoHWMult) + HWMultMode = HWMultModeOption; + return *this; } diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 1a00d85e01cb..8828dfd65878 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -30,8 +30,15 @@ namespace llvm { class StringRef; class MSP430Subtarget : public MSP430GenSubtargetInfo { +public: + enum HWMultEnum { + NoHWMult, HWMult16, HWMult32, HWMultF5 + }; + +private: virtual void anchor(); bool ExtendedInsts; + HWMultEnum 
HWMultMode; MSP430FrameLowering FrameLowering; MSP430InstrInfo InstrInfo; MSP430TargetLowering TLInfo; @@ -50,6 +57,10 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasHWMult16() const { return HWMultMode == HWMult16; } + bool hasHWMult32() const { return HWMultMode == HWMult32; } + bool hasHWMultF5() const { return HWMultMode == HWMultF5; } + const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 3641a70d61b5..8fe4e75f3e18 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -813,28 +813,28 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) return SDValue(); - // The shift masks must have the same position and size. - if (SMPos0 != SMPos1 || SMSize0 != SMSize1) - return SDValue(); + // The shift masks must have the same position and size. + if (SMPos0 != SMPos1 || SMSize0 != SMSize1) + return SDValue(); - SDValue Shl = And1.getOperand(0); + SDValue Shl = And1.getOperand(0); - if (!(CN = dyn_cast(Shl.getOperand(1)))) - return SDValue(); + if (!(CN = dyn_cast(Shl.getOperand(1)))) + return SDValue(); - unsigned Shamt = CN->getZExtValue(); + unsigned Shamt = CN->getZExtValue(); - // Return if the shift amount and the first bit position of mask are not the - // same. - EVT ValTy = N->getValueType(0); - if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) - return SDValue(); + // Return if the shift amount and the first bit position of mask are not the + // same. + EVT ValTy = N->getValueType(0); + if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) + return SDValue(); - SDLoc DL(N); - return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), - DAG.getConstant(SMPos0, DL, MVT::i32), - DAG.getConstant(SMSize0, DL, MVT::i32), - And0.getOperand(0)); + SDLoc DL(N); + return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), + DAG.getConstant(SMPos0, DL, MVT::i32), + DAG.getConstant(SMSize0, DL, MVT::i32), + And0.getOperand(0)); } else { // Pattern match DINS. 
// $dst = or (and $src, mask0), mask1 diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 8f5ecadecdea..1f4e933db2a2 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -59,9 +59,8 @@ static cl::opt void MipsSubtarget::anchor() { } -MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, bool little, - const MipsTargetMachine &TM) +MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + bool little, const MipsTargetMachine &TM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false), NoABICalls(false), IsFP64bit(false), UseOddSPReg(true), @@ -77,8 +76,6 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, FrameLowering(MipsFrameLowering::create(*this)), TLInfo(MipsTargetLowering::create(TM, *this)) { - PreviousInMips16Mode = InMips16Mode; - if (MipsArchVersion == MipsDefault) MipsArchVersion = Mips32; diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index cca2cb8a4660..b4d15ee361ff 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -119,9 +119,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // Mips16 hard float bool InMips16HardFloat; - // PreviousInMips16 -- the function we just processed was in Mips 16 Mode - bool PreviousInMips16Mode; - // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; @@ -178,8 +175,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. - MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - bool little, const MipsTargetMachine &TM); + MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, + const MipsTargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/lib/Target/Nios2/CMakeLists.txt b/lib/Target/Nios2/CMakeLists.txt new file mode 100644 index 000000000000..78db452094bd --- /dev/null +++ b/lib/Target/Nios2/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_TARGET_DEFINITIONS Nios2.td) + +#Generate Nios2GenRegisterInfo.inc and Nios2GenInstrInfo.inc which included by +#your hand code C++ files. +#Nios2GenRegisterInfo.inc came from Nios2RegisterInfo.td, Nios2GenInstrInfo.inc +#came from Nios2InstrInfo.td. +tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM Nios2GenInstrInfo.inc -gen-instr-info) + +#Nios2CommonTableGen must be defined +add_public_tablegen_target(Nios2CommonTableGen) + +#Nios2CodeGen should match with LLVMBuild.txt Nios2CodeGen +add_llvm_target(Nios2CodeGen Nios2TargetMachine.cpp) + +#Should match with "subdirectories = MCTargetDesc TargetInfo" in LLVMBuild.txt +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt new file mode 100644 index 000000000000..b40a76379706 --- /dev/null +++ b/lib/Target/Nios2/LLVMBuild.txt @@ -0,0 +1,61 @@ +;===- ./lib/Target/Nios2/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +#Following comments extracted from http: // llvm.org/docs/LLVMBuild.html + +[common] +subdirectories = + MCTargetDesc + TargetInfo + +[component_0] +#TargetGroup components are an extension of LibraryGroups, specifically for +#defining LLVM targets(which are handled specially in a few places). +type = TargetGroup +#The name of the component should always be the name of the target.(should +#match "def Nios2 : Target" in Nios2.td) +name = Nios2 +#Nios2 component is located in directory Target / +parent = Target +#Whether this target defines an assembly parser, assembly printer, disassembler +#, and supports JIT compilation.They are optional. + +[component_1] +#component_1 is a Library type and name is Nios2CodeGen.After build it will +#in lib / libLLVMNios2CodeGen.a of your build command directory. +type = Library +name = Nios2CodeGen +#Nios2CodeGen component(Library) is located in directory Nios2 / +parent = Nios2 +#If given, a list of the names of Library or LibraryGroup components which +#must also be linked in whenever this library is used.That is, the link time +#dependencies for this component.When tools are built, the build system will +#include the transitive closure of all required_libraries for the components +#the tool needs. +required_libraries = CodeGen + Core + GlobalISel + MC + Nios2Desc + Nios2Info + Support + Target +#end of required_libraries + +#All LLVMBuild.txt in Target / Nios2 and subdirectory use 'add_to_library_groups +#= Nios2' +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt new file mode 100644 index 000000000000..21def509a232 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,2 @@ +#MCTargetDesc / CMakeLists.txt +add_llvm_library(LLVMNios2Desc Nios2MCTargetDesc.cpp) diff --git a/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 000000000000..4dc6995e7f5c --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,25 @@ +;===- ./lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Desc +parent = Nios2 +required_libraries = MC + Nios2Info + Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp new file mode 100644 index 000000000000..d913166399c6 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp @@ -0,0 +1,25 @@ +//===-- Nios2MCTargetDesc.cpp - Nios2 Target Descriptions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "Nios2MCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "Nios2GenInstrInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "Nios2GenRegisterInfo.inc" + +extern "C" void LLVMInitializeNios2TargetMC() {} diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h new file mode 100644 index 000000000000..d426062db168 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h @@ -0,0 +1,34 @@ +//===-- Nios2MCTargetDesc.h - Nios2 Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H +#define LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H + +namespace llvm { +class Target; +class Triple; + +Target &getTheNios2Target(); + +} // namespace llvm + +// Defines symbolic names for Nios2 registers. This defines a mapping from +// register name to register number. +#define GET_REGINFO_ENUM +#include "Nios2GenRegisterInfo.inc" + +// Defines symbolic names for the Nios2 instructions. +#define GET_INSTRINFO_ENUM +#include "Nios2GenInstrInfo.inc" + +#endif diff --git a/lib/Target/Nios2/Nios2.h b/lib/Target/Nios2/Nios2.h new file mode 100644 index 000000000000..87202f48cfbe --- /dev/null +++ b/lib/Target/Nios2/Nios2.h @@ -0,0 +1,25 @@ +//===-- Nios2.h - Top-level interface for Nios2 representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM Nios2 back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2_H + +#include "MCTargetDesc/Nios2MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/Nios2.td b/lib/Target/Nios2/Nios2.td new file mode 100644 index 000000000000..e8abba863370 --- /dev/null +++ b/lib/Target/Nios2/Nios2.td @@ -0,0 +1,29 @@ +//===-- Nios2.td - Describe the Nios2 Target Machine -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Target-dependent interfaces +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "Nios2RegisterInfo.td" +include "Nios2InstrInfo.td" + +def Nios2InstrInfo : InstrInfo; + +def Nios2 : Target { let InstructionSet = Nios2InstrInfo; } diff --git a/lib/Target/Nios2/Nios2InstrFormats.td b/lib/Target/Nios2/Nios2InstrFormats.td new file mode 100644 index 000000000000..79868be48a48 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrFormats.td @@ -0,0 +1,117 @@ +//===-- Nios2InstrFormats.td - Nios2 Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe NIOS2 instructions format +// +// +//===----------------------------------------------------------------------===// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<3> Value = val; +} + +def Pseudo : Format<0>; +def FrmI : Format<1>; +def FrmR : Format<2>; +def FrmJ : Format<3>; +def FrmOther : Format<4>; // Instruction w/ a custom format + +// Generic Nios2 Format +class Nios2Inst pattern, Format f> + : Instruction { + field bits<32> Inst; + Format Form = f; + + let Namespace = "Nios2"; + + let Size = 4; + + bits<6> Opcode = 0; + + // Bottom 6 bits are the 'opcode' field + let Inst{5 - 0} = Opcode; + + let OutOperandList = outs; + let InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + + // + // Attributes specific to Nios2 instructions: + // + bits<3> FormBits = Form.Value; + + // TSFlags layout should be kept in sync with Nios2InstrInfo.h. 
+ let TSFlags{2 - 0} = FormBits; + + let DecoderNamespace = "Nios2"; +} + +// Nios2 Instruction Format +class InstSE pattern, Format f> + : Nios2Inst { +} + +//===----------------------------------------------------------------------===// +// Format I instruction class in Nios2 : <|A|B|immediate|opcode|> +//===----------------------------------------------------------------------===// + +class FI op, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<5> rA; + bits<5> rB; + bits<16> imm; + + let Opcode = op; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format R instruction : <|A|B|C|opx|imm|opcode|> +//===----------------------------------------------------------------------===// + +class FR opx, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<5> rA; + bits<5> rB; + bits<5> rC; + bits<5> imm = 0; + + // opcode is always 0x3a for R instr. + let Opcode = 0x3a; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 17} = rC; + // opx stands for opcode extension + let Inst{16 - 11} = opx; + // optional 5-bit immediate value + let Inst{10 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format J instruction class in Nios2 : <|address|opcode|> +//===----------------------------------------------------------------------===// + +class FJ op, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<26> addr; + + let Opcode = op; + + let Inst{31 - 6} = addr; +} diff --git a/lib/Target/Nios2/Nios2InstrInfo.td b/lib/Target/Nios2/Nios2InstrInfo.td new file mode 100644 index 000000000000..5e4815ab3e16 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrInfo.td @@ -0,0 +1,50 @@ +//===- Nios2InstrInfo.td - Target Description for Nios2 ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Nios2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "Nios2InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Nios2 Operand, Complex Patterns and Transformations Definitions. +//===----------------------------------------------------------------------===// + +def simm16 : Operand { + let DecoderMethod= "DecodeSimm16"; +} + +// Node immediate fits as 16-bit sign extended on target immediate. +// e.g. addi, andi +def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; + +//===----------------------------------------------------------------------===// +// Instructions specific format +//===----------------------------------------------------------------------===// + +// Arithmetic and logical instructions with 2 register operands. 
+class ArithLogicI op, string instr_asm, SDNode OpNode, + Operand Od, PatLeaf imm_type, RegisterClass RC> : + FI { + let isReMaterializable = 1; +} + +//===----------------------------------------------------------------------===// +// Nios2 R1 Instructions +//===----------------------------------------------------------------------===// + +/// Arithmetic Instructions (ALU Immediate) +def ADDi : ArithLogicI<0x04, "addi", add, simm16, immSExt16, CPURegs>; diff --git a/lib/Target/Nios2/Nios2RegisterInfo.td b/lib/Target/Nios2/Nios2RegisterInfo.td new file mode 100644 index 000000000000..1808815816f3 --- /dev/null +++ b/lib/Target/Nios2/Nios2RegisterInfo.td @@ -0,0 +1,60 @@ +//===-- Nios2RegisterInfo.td - Nios2 Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// We have bank of 32 registers. +class Nios2Reg : Register { + field bits<5> Num; + let Namespace = "Nios2"; +} + +// Nios2 CPU Registers +class Nios2GPRReg num, string n> : Nios2Reg { + let Num = num; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +let Namespace = "Nios2" in { + // General Purpose Registers + def ZERO : Nios2GPRReg<0, "zero">, DwarfRegNum<[ 0 ]>; + def AT : Nios2GPRReg<1, "at">, DwarfRegNum<[ 1 ]>; + foreach RegNum = 2 - 23 in { + def R #RegNum : Nios2GPRReg, DwarfRegNum<[ RegNum ]>; + } + def ET : Nios2GPRReg<24, "et">, DwarfRegNum<[ 24 ]>; + def BT : Nios2GPRReg<25, "bt">, DwarfRegNum<[ 25 ]>; + def GP : Nios2GPRReg<26, "gp">, DwarfRegNum<[ 26 ]>; + def SP : Nios2GPRReg<27, "sp">, DwarfRegNum<[ 27 ]>; + def FP : Nios2GPRReg<28, "fp">, DwarfRegNum<[ 28 ]>; + def EA : Nios2GPRReg<29, "ea">, DwarfRegNum<[ 29 ]>; + def BA : Nios2GPRReg<30, "ba">, DwarfRegNum<[ 30 ]>; + def RA : Nios2GPRReg<31, "ra">, DwarfRegNum<[ 31 ]>; + def PC : Nios2Reg<"pc">, DwarfRegNum<[ 32 ]>; +} + +//===----------------------------------------------------------------------===// +// Register Classes +//===----------------------------------------------------------------------===// + +def CPURegs : RegisterClass<"Nios2", [ i32 ], 32, + (add + // Reserved + ZERO, + AT, + // Return Values and Arguments + (sequence "R%u", 2, 7), + // Not preserved across procedure calls + // Caller saved + (sequence "R%u", 8, 15), + // Callee saved + (sequence "R%u", 16, 23), + // Reserved + ET, BT, GP, SP, FP, EA, BA, RA, PC)>; diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp new file mode 100644 index 000000000000..16d4eabcfaf7 --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.cpp @@ -0,0 +1,46 @@ +//===-- Nios2TargetMachine.cpp - Define TargetMachine for Nios2 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the info about Nios2 target spec. +// +//===----------------------------------------------------------------------===// + +#include "Nios2TargetMachine.h" +#include "Nios2.h" + +using namespace llvm; + +#define DEBUG_TYPE "nios2" + +extern "C" void LLVMInitializeNios2Target() { + // Register the target. 
+} + +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-p:32:32:32-i8:8:32-i16:16:32-n32"; +} + +static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM, + Optional RM) { + if (!RM.hasValue() || CM == CodeModel::JITDefault) + return Reloc::Static; + return *RM; +} + +Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, + Options, getEffectiveRelocModel(CM, RM), CM, OL) {} + +Nios2TargetMachine::~Nios2TargetMachine() {} diff --git a/lib/Target/Nios2/Nios2TargetMachine.h b/lib/Target/Nios2/Nios2TargetMachine.h new file mode 100644 index 000000000000..7f145c82f32c --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.h @@ -0,0 +1,30 @@ +//===-- Nios2TargetMachine.h - Define TargetMachine for Nios2 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Nios2 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine : public LLVMTargetMachine { +public: + Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Optional RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~Nios2TargetMachine() override; +}; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/TargetInfo/CMakeLists.txt b/lib/Target/Nios2/TargetInfo/CMakeLists.txt new file mode 100644 index 000000000000..394d2c2680b7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/CMakeLists.txt @@ -0,0 +1 @@ +add_llvm_library(LLVMNios2Info Nios2TargetInfo.cpp) diff --git a/lib/Target/Nios2/TargetInfo/LLVMBuild.txt b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt new file mode 100644 index 000000000000..558f7501ea6b --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/Nios2/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Info +parent = Nios2 +required_libraries = Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp new file mode 100644 index 000000000000..e317686140f7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp @@ -0,0 +1,24 @@ +//===-- Nios2TargetInfo.cpp - Nios2 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Nios2.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheNios2Target() { + static Target TheNios2Target; + return TheNios2Target; +} + +extern "C" void LLVMInitializeNios2TargetInfo() { + RegisterTarget + X(getTheNios2Target(), "nios2", "Nios2"); +} diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index ebd414baf1d2..41e3190c3eec 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -339,7 +339,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL, // Note: Cannot use stepBackward instead since we are using the Reg // liveness state at the end of MBB (liveOut of MBB) as the liveIn for // NewSuccessor. Otherwise, will cause cyclic dependence. - LivePhysRegs LPR(MF->getSubtarget().getRegisterInfo()); + LivePhysRegs LPR(*MF->getSubtarget().getRegisterInfo()); SmallVector, 2> Clobbers; for (MachineInstr &MI : *MBB) LPR.stepForward(MI, Clobbers); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index e65b1f1aa0a5..b90a5ee28342 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1596,9 +1596,8 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } -bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, - unsigned &InsertAtByte, bool &Swap, bool IsLE) { // Check that the mask is shuffling words +static bool isWordShuffleMask(ShuffleVectorSDNode *N) { for (unsigned i = 0; i < 4; ++i) { unsigned B0 = N->getMaskElt(i*4); unsigned B1 = N->getMaskElt(i*4+1); @@ -1610,6 +1609,14 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } + return true; +} + +bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + unsigned &InsertAtByte, bool &Swap, bool IsLE) { + if (!isWordShuffleMask(N)) + return false; + // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; @@ -1680,6 +1687,69 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } +bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + // Ensure each byte index of the word is consecutive. + if (!isWordShuffleMask(N)) + return false; + + // Now we look at mask elements 0,4,8,12, which are the beginning of words. 
+ unsigned M0 = N->getMaskElt(0) / 4; + unsigned M1 = N->getMaskElt(4) / 4; + unsigned M2 = N->getMaskElt(8) / 4; + unsigned M3 = N->getMaskElt(12) / 4; + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + assert(M0 < 4 && "Indexing into an undef vector?"); + if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) + return false; + + ShiftElts = IsLE ? (4 - M0) % 4 : M0; + Swap = false; + return true; + } + + // Ensure each word index of the ShuffleVector Mask is consecutive. + if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) + return false; + + if (IsLE) { + if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 3 left elements of the second vector + // (or if there is no shift to be done at all). + Swap = false; + ShiftElts = (8 - M0) % 8; + } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 3 left elements of the first vector + // (or if we're shifting by 4 - thereby simply swapping the vectors). + Swap = true; + ShiftElts = (4 - M0) % 4; + } + + return true; + } else { // BE + if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 4 elements of the first vector. + Swap = false; + ShiftElts = M0; + } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 4 elements of the right vector. + Swap = true; + ShiftElts = M0 - 4; + } + + return true; + } +} + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, @@ -7679,6 +7749,20 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + + if (Subtarget.hasVSX() && + PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? 
V1 : V2); + + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); @@ -8212,10 +8296,12 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDLoc DL(Op); switch (cast(Op.getOperand(ArgStart))->getZExtValue()) { case Intrinsic::ppc_cfence: { + assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, - Op.getOperand(ArgStart + 1))), + Op.getOperand(ArgStart + 1)), + Op.getOperand(0)), 0); } default: diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index acb77943b118..2f9eb95f6de6 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -450,7 +450,11 @@ namespace llvm { /// a VMRGEW or VMRGOW instruction bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG); - + /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXSLDWI instruction. + bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index a3f894c81a01..165970f9678c 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1001,7 +1001,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), isPPC64; // LR8 is a true define, while the rest of the Defs are clobbers. X3 is // explicitly defined when this op is created, so not mentioned here. -let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, +// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be +// correct because the branch select pass is relying on it. 
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsADDR", diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 46f103141bc1..fd6785e963a6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1931,6 +1931,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case PPC::DFSTOREf64: { assert(Subtarget.hasP9Vector() && "Invalid D-Form Pseudo-ops on non-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 0766cfe4a987..26b99eced23c 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -46,7 +46,7 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>, ]>; def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>, - SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> + SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3> ]>; def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index b98140fedfc0..1589ab03e507 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1066,6 +1066,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>; +// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and +// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2), the later one is more profitable. +def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>; + // Selects. 
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; @@ -2379,8 +2383,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, - [(set v2f64:$XT, (load xoaddr:$src))]>; - + [(set v2f64:$XT, (load xaddr:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, @@ -2430,7 +2433,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, - [(store v2f64:$XT, xoaddr:$dst)]>; + [(store v2f64:$XT, xaddr:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), @@ -2498,21 +2501,38 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; } // IsLittleEndian, HasP9Vector - def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; + // D-Form Load/Store + def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + + + def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; + def : 
Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2704,9 +2724,15 @@ def FltToUIntLoad { def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } +def FltToLongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); +} def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } +def FltToULongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); +} def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); } @@ -2728,9 +2754,15 @@ def DblToULong { def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } +def DblToIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); +} def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } +def DblToUIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); +} def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); } @@ -2898,17 +2930,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; - def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; - def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp index 92ce8089c24f..d02db9a617a3 100644 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp @@ -74,7 +74,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB, unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index a30bf34857b5..b34c181124de 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -236,32 +236,30 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction &MF = *MBB->getParent(); - const unsigned Reg = MI->getOperand(0).getReg(); + const unsigned Reg64 = MI->getOperand(0).getReg(); + const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); - // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD, - // so they already have operand 0 set to 
reg. + // EAR can only load the low subregister, so use a shift for %a0 to produce + // the GR containing %a0 and %a1. // ear , %a0 - MachineInstr *Ear1MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear1MI); - Ear1MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A0) + .addReg(Reg64, RegState::ImplicitDefine); // sllg , , 32 - MachineInstr *SllgMI = MF.CloneMachineInstr(MI); - MBB->insert(MI, SllgMI); - SllgMI->setDesc(get(SystemZ::SLLG)); - MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::SLLG), Reg64) + .addReg(Reg64) + .addReg(0) + .addImm(32); // ear , %a1 - MachineInstr *Ear2MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear2MI); - Ear2MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A1); // lg , 40() MI->setDesc(get(SystemZ::LG)); - MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0); + MachineInstrBuilder(MF, MI).addReg(Reg64).addImm(40).addReg(0); } // Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 3766ed45b8c4..ad597f5c65f0 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,6 +55,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 32ab475f1186..e5d3209ec6a9 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1316,16 +1316,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { while (!Done) { bool UpdateLocLex = true; + AsmToken::TokenKind TK = getLexer().getKind(); // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an // identifier. Don't try an parse it as a register. - if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") + if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") && + TK != AsmToken::Identifier) break; // If we're parsing an immediate expression, we don't expect a '['.
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - AsmToken::TokenKind TK = getLexer().getKind(); switch (TK) { default: { if (SM.isValidEndState()) { diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149b..7471373334f6 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) +tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables) if(LLVM_BUILD_GLOBAL_ISEL) tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3a421fe77392..fe105298f5c1 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -127,6 +127,9 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index a5489b9aa8b7..313920e02c3e 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1655,8 +1655,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { } void FPS::setKillFlags(MachineBasicBlock &MBB) const { - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); + const TargetRegisterInfo &TRI = + *MBB.getParent()->getSubtarget().getRegisterInfo(); LivePhysRegs LPR(TRI); LPR.addLiveOuts(MBB); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 37b248416e4a..86744b064132 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1364,6 +1364,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + if (Subtarget.hasVPOPCNTDQ()) { + // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 + // version of popcntd/q. + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, + MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::CTPOP, VT, Legal); + } + // Custom lower several nodes. 
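The intent of the VPOPCNTDQ hook above (together with the avx512_unary_lowering patterns added to X86InstrAVX512.td further down) is that 128-bit and 256-bit vector popcounts are carried out by the 512-bit VPOPCNTD/VPOPCNTQ instruction on a widened register. A rough user-level sketch with compiler intrinsics, assuming a translation unit built with -mavx512f -mavx512vpopcntdq (illustrative only; the cast intrinsics leave the upper lanes unspecified, which is fine because they are discarded):

#include <immintrin.h>

// Popcount each 32-bit lane of a 256-bit vector by round-tripping through the
// 512-bit vpopcntd, mirroring the INSERT_SUBREG / VPOPCNTDZrr / EXTRACT_SUBREG
// sequence the Z256_Alt pattern produces.
__m256i popcnt_epi32_256(__m256i V) {
  __m512i Wide = _mm512_castsi256_si512(V);  // widen; upper 256 bits are don't-care
  __m512i Cnt  = _mm512_popcnt_epi32(Wide);  // AVX512VPOPCNTDQ vpopcntd on a zmm
  return _mm512_castsi512_si256(Cnt);        // keep only the low 256 bits
}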
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index f9344413bbcf..d8702693884d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2693,22 +2693,22 @@ multiclass avx512_load_vl opc, string OpcodeStr, } multiclass avx512_store opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag st_frag, PatFrag mstore> { + PatFrag st_frag, PatFrag mstore, string Name> { let hasSideEffects = 0 in { def rr_REV : AVX512PI, EVEX; + [], _.ExeDomain>, EVEX, FoldGenData; def rrk_REV : AVX512PI, EVEX, EVEX_K; + [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData; def rrkz_REV : AVX512PI, EVEX, EVEX_KZ; + [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData; } def mr : AVX512PI opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store_unaligned, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store_unaligned, Name#Z256>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store_unaligned, Name#Z128>, EVEX_V128; } } multiclass avx512_alignedstore_vl opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store_aligned512, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store_aligned256, Name#Z256>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store_aligned128, Name#Z128>, EVEX_V128; } } defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, - HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVAPS">, + PS, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVAPD">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, + "VMOVUPS">, PS, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, + "VMOVUPD">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQA32">, + PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQA64">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI>, XD, EVEX_CD8<8, CD8VF>; + avx512_store_vl<0x7F, "vmovdqu8", 
avx512vl_i8_info, + HasBWI, "VMOVDQU8">, + XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + HasBWI, "VMOVDQU16">, + XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQU32">, + XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQU64">, + XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need // to load or store from a ZMM register instead. These are converted in @@ -3354,17 +3366,52 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -let hasSideEffects = 0 in -defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), - "vmovss.s", "$src2, $src1", "$src1, $src2", []>, - XS, EVEX_4V, VEX_LIG; +let hasSideEffects = 0 in { + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">; -let hasSideEffects = 0 in -defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), - "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, - XD, EVEX_4V, VEX_LIG, VEX_W; +let Constraints = "$src0 = $dst" in + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">; + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">; + + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">; + +let Constraints = "$src0 = $dst" in + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">; + + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, + FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">; +} let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -8648,6 +8695,41 @@ let 
Predicates = [HasCDI, NoVLX] in { sub_xmm)>; } +//===---------------------------------------------------------------------===// +// Counts number of ones - VPOPCNTD and VPOPCNTQ +//===---------------------------------------------------------------------===// + +multiclass avx512_unary_rmb_popcnt opc, string OpcodeStr, X86VectorVTInfo VTInfo> { + let Predicates = [HasVPOPCNTDQ] in + defm Z : avx512_unary_rmb, EVEX_V512; +} + +// Use 512bit version to implement 128/256 bit. +multiclass avx512_unary_lowering { + let Predicates = [prd] in { + def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; + + def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } +} + +defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>, + avx512_unary_lowering; +defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>, + avx512_unary_lowering, VEX_W; + //===---------------------------------------------------------------------===// // Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// @@ -8795,7 +8877,7 @@ multiclass avx512_extract_elt_w { def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX, TAPD; + EVEX, TAPD, FoldGenData; defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 66382014f6e8..e38bbc9b3d36 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -964,10 +964,10 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_Rev; - def NAME#16rr_REV : BinOpRR_Rev; - def NAME#32rr_REV : BinOpRR_Rev; - def NAME#64rr_REV : BinOpRR_Rev; + def NAME#8rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_Rev, FoldGenData; def NAME#8rm : BinOpRM_RF; def NAME#16rm : BinOpRM_RF; @@ -1049,10 +1049,10 @@ multiclass ArithBinOp_RFF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_RFF_Rev; - def NAME#16rr_REV : BinOpRR_RFF_Rev; - def NAME#32rr_REV : BinOpRR_RFF_Rev; - def NAME#64rr_REV : BinOpRR_RFF_Rev; + def NAME#8rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_RFF_Rev, FoldGenData; def NAME#8rm : BinOpRM_RFF; def NAME#16rm : BinOpRM_RFF; @@ -1129,10 +1129,10 @@ multiclass ArithBinOp_F BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // isCommutable - def NAME#8rr_REV : BinOpRR_F_Rev; - def NAME#16rr_REV : BinOpRR_F_Rev; - def NAME#32rr_REV : BinOpRR_F_Rev; - def NAME#64rr_REV : BinOpRR_F_Rev; + def NAME#8rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_F_Rev, FoldGenData; def NAME#8rm : BinOpRM_F; def NAME#16rm : 
BinOpRM_F; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 1941ae57f0f1..3a3cdc9fa574 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -297,7 +297,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_LIG; + VEX_LIG, FoldGenData; } multiclass fma4s_int opc, string OpcodeStr, Operand memop, @@ -321,6 +321,12 @@ let isCodeGenOnly = 1 in { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; +let hasSideEffects = 0 in + def rr_Int_REV : FMA4, VEX_LIG, FoldGenData; } // isCodeGenOnly = 1 } @@ -372,12 +378,13 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4; + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + FoldGenData; def Yrr_REV : FMA4, - VEX_L; + VEX_L, FoldGenData; } // isCodeGenOnly = 1 } diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index c2fe786732dc..bfcbf71d252f 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -225,6 +225,12 @@ class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class XOP { Encoding OpEnc = EncXOP; } class XOP_4V : XOP { bit hasVEX_4V = 1; } +// Specify the alternative register form instruction to replace the current +// instruction in case it was picked during generation of memory folding tables +class FoldGenData { + string FoldGenRegForm = _RegisterForm; +} + class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, InstrItinClass itin, @@ -304,6 +310,10 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); + // Used in the memory folding generation (TableGen backend) to point to an alternative + // instruction to replace the current one in case it got picked during generation. + string FoldGenRegForm = ?; + // TSFlags layout should be kept in sync with X86BaseInfo.h. let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f7083a7448ce..33fbd41bb631 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -121,172 +121,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) (STI.is64Bit() ? 
X86::RETQ : X86::RETL)), Subtarget(STI), RI(STI.getTargetTriple()) { - static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { - { X86::ADC32ri, X86::ADC32mi, 0 }, - { X86::ADC32ri8, X86::ADC32mi8, 0 }, - { X86::ADC32rr, X86::ADC32mr, 0 }, - { X86::ADC64ri32, X86::ADC64mi32, 0 }, - { X86::ADC64ri8, X86::ADC64mi8, 0 }, - { X86::ADC64rr, X86::ADC64mr, 0 }, - { X86::ADD16ri, X86::ADD16mi, 0 }, - { X86::ADD16ri8, X86::ADD16mi8, 0 }, - { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, - { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, - { X86::ADD16rr, X86::ADD16mr, 0 }, - { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, - { X86::ADD32ri, X86::ADD32mi, 0 }, - { X86::ADD32ri8, X86::ADD32mi8, 0 }, - { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, - { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32mr, 0 }, - { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, - { X86::ADD64ri32, X86::ADD64mi32, 0 }, - { X86::ADD64ri8, X86::ADD64mi8, 0 }, - { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, - { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64mr, 0 }, - { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, - { X86::ADD8ri, X86::ADD8mi, 0 }, - { X86::ADD8rr, X86::ADD8mr, 0 }, - { X86::AND16ri, X86::AND16mi, 0 }, - { X86::AND16ri8, X86::AND16mi8, 0 }, - { X86::AND16rr, X86::AND16mr, 0 }, - { X86::AND32ri, X86::AND32mi, 0 }, - { X86::AND32ri8, X86::AND32mi8, 0 }, - { X86::AND32rr, X86::AND32mr, 0 }, - { X86::AND64ri32, X86::AND64mi32, 0 }, - { X86::AND64ri8, X86::AND64mi8, 0 }, - { X86::AND64rr, X86::AND64mr, 0 }, - { X86::AND8ri, X86::AND8mi, 0 }, - { X86::AND8rr, X86::AND8mr, 0 }, - { X86::DEC16r, X86::DEC16m, 0 }, - { X86::DEC32r, X86::DEC32m, 0 }, - { X86::DEC64r, X86::DEC64m, 0 }, - { X86::DEC8r, X86::DEC8m, 0 }, - { X86::INC16r, X86::INC16m, 0 }, - { X86::INC32r, X86::INC32m, 0 }, - { X86::INC64r, X86::INC64m, 0 }, - { X86::INC8r, X86::INC8m, 0 }, - { X86::NEG16r, X86::NEG16m, 0 }, - { X86::NEG32r, X86::NEG32m, 0 }, - { X86::NEG64r, X86::NEG64m, 0 }, - { X86::NEG8r, X86::NEG8m, 0 }, - { X86::NOT16r, X86::NOT16m, 0 }, - { X86::NOT32r, X86::NOT32m, 0 }, - { X86::NOT64r, X86::NOT64m, 0 }, - { X86::NOT8r, X86::NOT8m, 0 }, - { X86::OR16ri, X86::OR16mi, 0 }, - { X86::OR16ri8, X86::OR16mi8, 0 }, - { X86::OR16rr, X86::OR16mr, 0 }, - { X86::OR32ri, X86::OR32mi, 0 }, - { X86::OR32ri8, X86::OR32mi8, 0 }, - { X86::OR32rr, X86::OR32mr, 0 }, - { X86::OR64ri32, X86::OR64mi32, 0 }, - { X86::OR64ri8, X86::OR64mi8, 0 }, - { X86::OR64rr, X86::OR64mr, 0 }, - { X86::OR8ri, X86::OR8mi, 0 }, - { X86::OR8rr, X86::OR8mr, 0 }, - { X86::ROL16r1, X86::ROL16m1, 0 }, - { X86::ROL16rCL, X86::ROL16mCL, 0 }, - { X86::ROL16ri, X86::ROL16mi, 0 }, - { X86::ROL32r1, X86::ROL32m1, 0 }, - { X86::ROL32rCL, X86::ROL32mCL, 0 }, - { X86::ROL32ri, X86::ROL32mi, 0 }, - { X86::ROL64r1, X86::ROL64m1, 0 }, - { X86::ROL64rCL, X86::ROL64mCL, 0 }, - { X86::ROL64ri, X86::ROL64mi, 0 }, - { X86::ROL8r1, X86::ROL8m1, 0 }, - { X86::ROL8rCL, X86::ROL8mCL, 0 }, - { X86::ROL8ri, X86::ROL8mi, 0 }, - { X86::ROR16r1, X86::ROR16m1, 0 }, - { X86::ROR16rCL, X86::ROR16mCL, 0 }, - { X86::ROR16ri, X86::ROR16mi, 0 }, - { X86::ROR32r1, X86::ROR32m1, 0 }, - { X86::ROR32rCL, X86::ROR32mCL, 0 }, - { X86::ROR32ri, X86::ROR32mi, 0 }, - { X86::ROR64r1, X86::ROR64m1, 0 }, - { X86::ROR64rCL, X86::ROR64mCL, 0 }, - { X86::ROR64ri, X86::ROR64mi, 0 }, - { X86::ROR8r1, X86::ROR8m1, 0 }, - { X86::ROR8rCL, X86::ROR8mCL, 0 }, - { X86::ROR8ri, X86::ROR8mi, 0 }, - { X86::SAR16r1, X86::SAR16m1, 0 }, - { 
X86::SAR16rCL, X86::SAR16mCL, 0 }, - { X86::SAR16ri, X86::SAR16mi, 0 }, - { X86::SAR32r1, X86::SAR32m1, 0 }, - { X86::SAR32rCL, X86::SAR32mCL, 0 }, - { X86::SAR32ri, X86::SAR32mi, 0 }, - { X86::SAR64r1, X86::SAR64m1, 0 }, - { X86::SAR64rCL, X86::SAR64mCL, 0 }, - { X86::SAR64ri, X86::SAR64mi, 0 }, - { X86::SAR8r1, X86::SAR8m1, 0 }, - { X86::SAR8rCL, X86::SAR8mCL, 0 }, - { X86::SAR8ri, X86::SAR8mi, 0 }, - { X86::SBB32ri, X86::SBB32mi, 0 }, - { X86::SBB32ri8, X86::SBB32mi8, 0 }, - { X86::SBB32rr, X86::SBB32mr, 0 }, - { X86::SBB64ri32, X86::SBB64mi32, 0 }, - { X86::SBB64ri8, X86::SBB64mi8, 0 }, - { X86::SBB64rr, X86::SBB64mr, 0 }, - { X86::SHL16r1, X86::SHL16m1, 0 }, - { X86::SHL16rCL, X86::SHL16mCL, 0 }, - { X86::SHL16ri, X86::SHL16mi, 0 }, - { X86::SHL32r1, X86::SHL32m1, 0 }, - { X86::SHL32rCL, X86::SHL32mCL, 0 }, - { X86::SHL32ri, X86::SHL32mi, 0 }, - { X86::SHL64r1, X86::SHL64m1, 0 }, - { X86::SHL64rCL, X86::SHL64mCL, 0 }, - { X86::SHL64ri, X86::SHL64mi, 0 }, - { X86::SHL8r1, X86::SHL8m1, 0 }, - { X86::SHL8rCL, X86::SHL8mCL, 0 }, - { X86::SHL8ri, X86::SHL8mi, 0 }, - { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, - { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, - { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, - { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, - { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, - { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, - { X86::SHR16r1, X86::SHR16m1, 0 }, - { X86::SHR16rCL, X86::SHR16mCL, 0 }, - { X86::SHR16ri, X86::SHR16mi, 0 }, - { X86::SHR32r1, X86::SHR32m1, 0 }, - { X86::SHR32rCL, X86::SHR32mCL, 0 }, - { X86::SHR32ri, X86::SHR32mi, 0 }, - { X86::SHR64r1, X86::SHR64m1, 0 }, - { X86::SHR64rCL, X86::SHR64mCL, 0 }, - { X86::SHR64ri, X86::SHR64mi, 0 }, - { X86::SHR8r1, X86::SHR8m1, 0 }, - { X86::SHR8rCL, X86::SHR8mCL, 0 }, - { X86::SHR8ri, X86::SHR8mi, 0 }, - { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, - { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, - { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, - { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, - { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, - { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, - { X86::SUB16ri, X86::SUB16mi, 0 }, - { X86::SUB16ri8, X86::SUB16mi8, 0 }, - { X86::SUB16rr, X86::SUB16mr, 0 }, - { X86::SUB32ri, X86::SUB32mi, 0 }, - { X86::SUB32ri8, X86::SUB32mi8, 0 }, - { X86::SUB32rr, X86::SUB32mr, 0 }, - { X86::SUB64ri32, X86::SUB64mi32, 0 }, - { X86::SUB64ri8, X86::SUB64mi8, 0 }, - { X86::SUB64rr, X86::SUB64mr, 0 }, - { X86::SUB8ri, X86::SUB8mi, 0 }, - { X86::SUB8rr, X86::SUB8mr, 0 }, - { X86::XOR16ri, X86::XOR16mi, 0 }, - { X86::XOR16ri8, X86::XOR16mi8, 0 }, - { X86::XOR16rr, X86::XOR16mr, 0 }, - { X86::XOR32ri, X86::XOR32mi, 0 }, - { X86::XOR32ri8, X86::XOR32mi8, 0 }, - { X86::XOR32rr, X86::XOR32mr, 0 }, - { X86::XOR64ri32, X86::XOR64mi32, 0 }, - { X86::XOR64ri8, X86::XOR64mi8, 0 }, - { X86::XOR64rr, X86::XOR64mr, 0 }, - { X86::XOR8ri, X86::XOR8mi, 0 }, - { X86::XOR8rr, X86::XOR8mr, 0 } - }; +// Generated memory folding tables. 
+#include "X86GenFoldTables.inc" for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, @@ -295,744 +131,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { - { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, - { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, - { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, - { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, - { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, - { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, - { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, - { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, - { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, - { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, - { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, - { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, - { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, - { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, - { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, - { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, - { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, - { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, - { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, - { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, - { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, - { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, - { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, - { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, - { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, - { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, - { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, - { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, - { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, - { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, - { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, - { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, - { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, - { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, - { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, - { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, - { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, - { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, - { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, - { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, - { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, - { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, - { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, - { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, - { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, - { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, - { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, - { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, - { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, - { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, - { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, - { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, - { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, - { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, - { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, - { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, - { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, - { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, - { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, - { 
X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, - { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, - { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, - { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, - { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, - { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, - { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, - { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, - { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, - { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, - { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, - { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, - { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, - { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, - { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, - { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, - { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, - { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, - { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, - { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, - - // AVX 128-bit versions of foldable instructions - { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, - { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, - { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, - { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, - { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, - { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, - - // AVX 256-bit foldable instructions - { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE }, - { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, - { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions - { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, - { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, - { 
X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, - { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, - { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, - { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE }, - { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE }, - { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, - { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, 
TB_FOLDED_STORE }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, - - // F16C foldable instructions - { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, - { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); } - static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { - { X86::BSF16rr, X86::BSF16rm, 0 }, - { X86::BSF32rr, X86::BSF32rm, 0 }, - { X86::BSF64rr, X86::BSF64rm, 0 }, - { X86::BSR16rr, X86::BSR16rm, 0 }, - { X86::BSR32rr, X86::BSR32rm, 0 }, - { X86::BSR64rr, X86::BSR64rm, 0 }, - { X86::CMP16rr, X86::CMP16rm, 0 }, - { X86::CMP32rr, X86::CMP32rm, 0 }, - { X86::CMP64rr, X86::CMP64rm, 0 }, - { X86::CMP8rr, X86::CMP8rm, 0 }, - { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, - { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, - { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, - { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, - { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, - { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, - { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, - { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, - { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, - { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, - { X86::IMUL16rri, X86::IMUL16rmi, 0 }, - { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, - { X86::IMUL32rri, X86::IMUL32rmi, 0 }, - { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, - { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, - { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, - { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, - { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, - { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, - { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, - { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, - { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, - { X86::MOV16rr, X86::MOV16rm, 0 }, - { X86::MOV32rr, X86::MOV32rm, 0 }, - { X86::MOV64rr, X86::MOV64rm, 0 }, - { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, - { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, - { X86::MOV8rr, X86::MOV8rm, 0 }, - { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, - { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, - { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, - { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, - { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, - { X86::MOVSLDUPrr, 
X86::MOVSLDUPrm, TB_ALIGN_16 }, - { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, - { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, - { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, - { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, - { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, - { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, - { X86::MOVUPDrr, X86::MOVUPDrm, 0 }, - { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, - { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, - { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, - { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, - { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 }, - { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, - { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, - { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, - { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, - { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, - { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, - { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, - { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, - { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, - { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE }, - { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE }, - { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE }, - { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE }, - { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE }, - { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE }, - { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE }, - { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE }, - { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE }, - { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE }, - { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, - { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, - { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, - { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, - { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, - { X86::RCPSSr, X86::RCPSSm, 0 }, - { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, - { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, - { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, - { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, - { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, - { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, - { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, - { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, - { X86::SQRTSDr, X86::SQRTSDm, 0 }, - { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, - { X86::SQRTSSr, X86::SQRTSSm, 0 }, - { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, - { X86::TEST16rr, X86::TEST16rm, 0 }, - { X86::TEST32rr, X86::TEST32rm, 0 }, - { X86::TEST64rr, X86::TEST64rm, 0 }, - { X86::TEST8rr, X86::TEST8rm, 0 }, - // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 - { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, - { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, - { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, - { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, - { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, - { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, - { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, - { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, - { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 }, - { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, - { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, - - // 3DNow! 
version of foldable instructions - { X86::PF2IDrr, X86::PF2IDrm, 0 }, - { X86::PF2IWrr, X86::PF2IWrm, 0 }, - { X86::PFRCPrr, X86::PFRCPrm, 0 }, - { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, - { X86::PI2FDrr, X86::PI2FDrm, 0 }, - { X86::PI2FWrr, X86::PI2FWrm, 0 }, - { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, - { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, - { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, - { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 }, - { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, - { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, - { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, - { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, - { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, - { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, - { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, - { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 }, - { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, - { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, - { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, - { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE }, - { X86::VPABSBrr, X86::VPABSBrm, 0 }, - { X86::VPABSDrr, X86::VPABSDrm, 0 }, - { X86::VPABSWrr, X86::VPABSWrm, 0 }, - { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, - { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, - { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, - { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, - { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, - { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, - { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, - { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE }, - { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE }, - { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE }, - { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE }, - { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE }, - { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE }, - { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE }, - { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, - { 
X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, - { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, - { X86::VPTESTrr, X86::VPTESTrm, 0 }, - { X86::VRCPPSr, X86::VRCPPSm, 0 }, - { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, - { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, - { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, - { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, - { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, - { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, - { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, - { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, - { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, - { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, - { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, - { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, - { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, - { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 }, - { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, - { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, - { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, - { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, - { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, - { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, - { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, - { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, - { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, - { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, - { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, - - // AVX2 foldable instructions - - // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the - // VBROADCASTS{SD}rm memory instructions were available from AVX1. - // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction - // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions - // so they don't need an equivalent limitation. 
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VPABSBYrr, X86::VPABSBYrm, 0 }, - { X86::VPABSDYrr, X86::VPABSDYrm, 0 }, - { X86::VPABSWYrr, X86::VPABSWYrm, 0 }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, - { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, - { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, - { X86::VPERMQYri, X86::VPERMQYmi, 0 }, - { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, - { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, - { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, - { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, - { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, - { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, - { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE }, - { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, - { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, - { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, - - // XOP foldable instructions - { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, - { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, - { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, - { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, - { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, - { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, - { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, - { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, - { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, - { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, - { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, - { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, - { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, - { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, - { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, - { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, - { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, - { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, - { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, - { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, - { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, - { X86::VPROTBri, X86::VPROTBmi, 0 }, - { X86::VPROTBrr, X86::VPROTBmr, 0 }, - { X86::VPROTDri, X86::VPROTDmi, 0 }, - { X86::VPROTDrr, X86::VPROTDmr, 0 }, - { X86::VPROTQri, X86::VPROTQmi, 0 }, - { X86::VPROTQrr, X86::VPROTQmr, 0 }, - { X86::VPROTWri, X86::VPROTWmi, 0 }, - { X86::VPROTWrr, X86::VPROTWmr, 0 }, - { X86::VPSHABrr, X86::VPSHABmr, 0 }, - { X86::VPSHADrr, X86::VPSHADmr, 0 }, - { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, - { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, - { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, - { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, - { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, - { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, - - // LWP foldable instructions - { X86::LWPINS32rri, X86::LWPINS32rmi, 0 }, - { X86::LWPINS64rri, X86::LWPINS64rmi, 0 }, - { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 }, - { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 }, - - // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions - { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, 
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, - { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, - { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, - { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, - { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, - { X86::BLCI32rr, X86::BLCI32rm, 0 }, - { X86::BLCI64rr, X86::BLCI64rm, 0 }, - { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, - { X86::BLCIC64rr, X86::BLCIC64rm, 0 }, - { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, - { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, - { X86::BLCS32rr, X86::BLCS32rm, 0 }, - { X86::BLCS64rr, X86::BLCS64rm, 0 }, - { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, - { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, - { X86::BLSI32rr, X86::BLSI32rm, 0 }, - { X86::BLSI64rr, X86::BLSI64rm, 0 }, - { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, - { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, - { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, - { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, - { X86::BLSR32rr, X86::BLSR32rm, 0 }, - { X86::BLSR64rr, X86::BLSR64rm, 0 }, - { X86::BZHI32rr, X86::BZHI32rm, 0 }, - { X86::BZHI64rr, X86::BZHI64rm, 0 }, - { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, - { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, - { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, - { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, - { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, - { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, - { X86::RORX32ri, X86::RORX32mi, 0 }, - { X86::RORX64ri, X86::RORX64mi, 0 }, - { X86::SARX32rr, X86::SARX32rm, 0 }, - { X86::SARX64rr, X86::SARX64rm, 0 }, - { X86::SHRX32rr, X86::SHRX32rm, 0 }, - { X86::SHRX64rr, X86::SHRX64rm, 0 }, - { X86::SHLX32rr, X86::SHLX32rm, 0 }, - { X86::SHLX64rr, X86::SHLX64rm, 0 }, - { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, - { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, - { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, - { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, - { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, - { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, - { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, - { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, - { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, - { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, - { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, - { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, - { X86::VPABSBZrr, X86::VPABSBZrm, 0 }, - { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, - { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VPABSWZrr, X86::VPABSWZrm, 0 }, - { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, - { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, - { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, - { X86::VPERMQZri, X86::VPERMQZmi, 0 }, - { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, - { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 }, - { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 }, - { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 }, - { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 }, - { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 }, - { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE }, - { 
X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 }, - { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 }, - { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 }, - { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 }, - { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, - { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, - { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, - { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 }, - { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, - { X86::VPSLLQZri, X86::VPSLLQZmi, 0 }, - { X86::VPSLLWZri, X86::VPSLLWZmi, 0 }, - { X86::VPSRADZri, X86::VPSRADZmi, 0 }, - { X86::VPSRAQZri, X86::VPSRAQZmi, 0 }, - { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, - { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 }, - { X86::VPSRLDZri, X86::VPSRLDZmi, 0 }, - { X86::VPSRLQZri, X86::VPSRLQZmi, 0 }, - { X86::VPSRLWZri, X86::VPSRLWZmi, 0 }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 }, - { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 }, - { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 }, - { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 }, - { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, - { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, - { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, - { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, - { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 }, - { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 }, - { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 }, - { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 }, - { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 }, - { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 }, - { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, - { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 }, - { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, - { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 }, - { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 }, - { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 }, - { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 }, - { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 }, - { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 }, - { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 }, - { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 }, - { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 }, - { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 }, - { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, - { 
X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 }, - { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 }, - { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 }, - { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 }, - { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, - { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, - { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, - { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, - { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, - { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 }, - { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 }, - { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 }, - { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 }, - { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 }, - { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 }, - { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 }, - { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 }, - { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 }, - { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 }, - { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 }, - - // F16C foldable instructions - { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, - { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, - - // AES foldable instructions - { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, - { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, - { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, @@ -1040,1394 +143,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { - { X86::ADC32rr, X86::ADC32rm, 0 }, - { X86::ADC64rr, X86::ADC64rm, 0 }, - { X86::ADD16rr, X86::ADD16rm, 0 }, - { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32rm, 0 }, - { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64rm, 0 }, - { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, - { X86::ADD8rr, X86::ADD8rm, 0 }, - { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, - { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, - { X86::ADDSDrr, X86::ADDSDrm, 0 }, - { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE }, - { X86::ADDSSrr, X86::ADDSSrm, 0 }, - { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE }, - { 
X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, - { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, - { X86::AND16rr, X86::AND16rm, 0 }, - { X86::AND32rr, X86::AND32rm, 0 }, - { X86::AND64rr, X86::AND64rm, 0 }, - { X86::AND8rr, X86::AND8rm, 0 }, - { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, - { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, - { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, - { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, - { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, - { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, - { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, - { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, - { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, - { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, - { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, - { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, - { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, - { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, - { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, - { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, - { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, - { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, - { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, - { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, - { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, - { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, - { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, - { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, - { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, - { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, - { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, - { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, - { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, - { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, - { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, - { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, - { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, - { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, - { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, - { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, - { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, - { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, - { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, - { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, - { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, - { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, - { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, - { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, - { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, - { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, - { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, - { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, - { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, - { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, - { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, - { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, - { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, - { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, - { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, - { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, - { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, - { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, - { X86::CMPSDrr, X86::CMPSDrm, 0 }, - { X86::CMPSSrr, X86::CMPSSrm, 0 }, - { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, - { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, - { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, - { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, - { X86::DIVSDrr, X86::DIVSDrm, 0 }, - { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE }, - { X86::DIVSSrr, X86::DIVSSrm, 0 }, - { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE }, - { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, - { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, - { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, - { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, - { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, - { X86::IMUL16rr, X86::IMUL16rm, 0 }, - { 
X86::IMUL32rr, X86::IMUL32rm, 0 }, - { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, - { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, - { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, - { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, - { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, - { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, - { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, - { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, - { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 }, - { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXCSDrr, X86::MAXCSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE }, - { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXCSSrr, X86::MAXCSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE }, - { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, - { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 }, - { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, - { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 }, - { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINCSDrr, X86::MINCSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, - { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINCSSrr, X86::MINCSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, - { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, - { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, - { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, - { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, - { X86::MULSDrr, X86::MULSDrm, 0 }, - { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE }, - { X86::MULSSrr, X86::MULSSrm, 0 }, - { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE }, - { X86::OR16rr, X86::OR16rm, 0 }, - { X86::OR32rr, X86::OR32rm, 0 }, - { X86::OR64rr, X86::OR64rm, 0 }, - { X86::OR8rr, X86::OR8rm, 0 }, - { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, - { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, - { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, - { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, - { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, - { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, - { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, - { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, - { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, - { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, - { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, - { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, - { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, - { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, - { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 }, - { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, - { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, - { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, - { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, - { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, - { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, - { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, - { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, - { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, - { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, - { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, - { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, - { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, - { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, - { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, - { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, - { X86::PHADDWrr, X86::PHADDWrm, 
TB_ALIGN_16 }, - { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, - { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, - { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, - { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, - { X86::PINSRBrr, X86::PINSRBrm, 0 }, - { X86::PINSRDrr, X86::PINSRDrm, 0 }, - { X86::PINSRQrr, X86::PINSRQrm, 0 }, - { X86::PINSRWrri, X86::PINSRWrmi, 0 }, - { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, - { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, - { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, - { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, - { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, - { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, - { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, - { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, - { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, - { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, - { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, - { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, - { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, - { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, - { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, - { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, - { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, - { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, - { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, - { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, - { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, - { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, - { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, - { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, - { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 }, - { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 }, - { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 }, - { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, - { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, - { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, - { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, - { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, - { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, - { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, - { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, - { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, - { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, - { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, - { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, - { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, - { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, - { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, - { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, - { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, - { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, - { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, - { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, - { X86::SBB32rr, X86::SBB32rm, 0 }, - { X86::SBB64rr, X86::SBB64rm, 0 }, - { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, - { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, - { X86::SUB16rr, X86::SUB16rm, 0 }, - { X86::SUB32rr, X86::SUB32rm, 0 }, - { X86::SUB64rr, X86::SUB64rm, 0 }, - { X86::SUB8rr, X86::SUB8rm, 0 }, - { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, - { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, - { X86::SUBSDrr, X86::SUBSDrm, 
0 }, - { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, - { X86::SUBSSrr, X86::SUBSSrm, 0 }, - { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST*mr. - { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, - { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, - { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, - { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, - { X86::XOR16rr, X86::XOR16rm, 0 }, - { X86::XOR32rr, X86::XOR32rm, 0 }, - { X86::XOR64rr, X86::XOR64rm, 0 }, - { X86::XOR8rr, X86::XOR8rm, 0 }, - { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, - { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, - { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, - { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, - { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, - { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, - { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, - { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, - { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, - { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, - { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, - { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, - { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, - { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, - { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, - { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, - { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, - { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, - { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, - { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, - { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, - { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, - { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, - { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, - { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, - { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, - { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, - { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, - { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, - { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, - { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, - { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, - { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, - { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, - { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, - { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, - { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, - { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, - { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, - { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, - { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, - { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, - { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, - { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, - { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, - { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, - { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, - { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, - { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, - { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, - { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, - { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, - { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, - { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, - { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, - { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, - { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, - { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, - { X86::MMX_PSUBQirr, 
X86::MMX_PSUBQirm, 0 }, - { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, - { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, - { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, - { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, - { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, - { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, - { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, - { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, - { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, - { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, - { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, - { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, - - // 3DNow! version of foldable instructions - { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, - { X86::PFACCrr, X86::PFACCrm, 0 }, - { X86::PFADDrr, X86::PFADDrm, 0 }, - { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, - { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, - { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, - { X86::PFMAXrr, X86::PFMAXrm, 0 }, - { X86::PFMINrr, X86::PFMINrm, 0 }, - { X86::PFMULrr, X86::PFMULrm, 0 }, - { X86::PFNACCrr, X86::PFNACCrm, 0 }, - { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, - { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, - { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, - { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, - { X86::PFSUBrr, X86::PFSUBrm, 0 }, - { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, - { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, - { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, - { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, - { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, - { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, - { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, - { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, - { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, - { X86::VADDPDrr, X86::VADDPDrm, 0 }, - { X86::VADDPSrr, X86::VADDPSrm, 0 }, - { X86::VADDSDrr, X86::VADDSDrm, 0 }, - { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, - { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE }, - { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, - { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, - { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, - { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, - { X86::VANDPDrr, X86::VANDPDrm, 0 }, - { X86::VANDPSrr, X86::VANDPSrm, 0 }, - { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, - { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, - { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, - { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, - { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, - { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, - { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, - { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, - { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, - { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, - { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, - { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE }, - { X86::VDPPDrri, X86::VDPPDrmi, 0 }, - { X86::VDPPSrri, X86::VDPPSrmi, 0 }, - { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, - { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, - { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, - { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, - { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, - { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, - { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, - { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, - { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, - { X86::VMAXPSrr, 
X86::VMAXPSrm, 0 }, - { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, - { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, - { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, - { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, - { X86::VMINPDrr, X86::VMINPDrm, 0 }, - { X86::VMINPSrr, X86::VMINPSrm, 0 }, - { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, - { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, - { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, - { X86::VMULPDrr, X86::VMULPDrm, 0 }, - { X86::VMULPSrr, X86::VMULPSrm, 0 }, - { X86::VMULSDrr, X86::VMULSDrm, 0 }, - { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, - { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE }, - { X86::VORPDrr, X86::VORPDrm, 0 }, - { X86::VORPSrr, X86::VORPSrm, 0 }, - { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, - { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, - { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, - { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, - { X86::VPADDBrr, X86::VPADDBrm, 0 }, - { X86::VPADDDrr, X86::VPADDDrm, 0 }, - { X86::VPADDQrr, X86::VPADDQrm, 0 }, - { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, - { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, - { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, - { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, - { X86::VPADDWrr, X86::VPADDWrm, 0 }, - { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 }, - { X86::VPANDNrr, X86::VPANDNrm, 0 }, - { X86::VPANDrr, X86::VPANDrm, 0 }, - { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, - { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, - { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, - { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, - { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, - { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, - { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, - { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, - { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, - { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, - { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, - { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, - { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, - { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, - { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, - { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, - { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, - { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, - { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, - { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, - { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, - { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, - { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, - { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, - { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, - { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, - { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, - { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, - { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, - { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, - { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, - { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, - { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, - { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, - { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, - { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, - { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, - { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, - { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, - { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, - { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, - { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, - { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, 
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, - { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, - { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, - { X86::VPORrr, X86::VPORrm, 0 }, - { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, - { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, - { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 }, - { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 }, - { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 }, - { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, - { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, - { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, - { X86::VPSRADrr, X86::VPSRADrm, 0 }, - { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, - { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, - { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, - { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, - { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, - { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, - { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, - { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, - { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, - { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, - { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, - { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, - { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, - { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, - { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, - { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, - { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, - { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, - { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, - { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, - { X86::VPXORrr, X86::VPXORrm, 0 }, - { X86::VRCPSSr, X86::VRCPSSm, 0 }, - { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, - { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, - { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, - { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE }, - { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, - { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE }, - { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, - { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, - { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, - { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, - { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, - { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, - { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, - { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, - { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, - { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, - { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, - { X86::VXORPDrr, X86::VXORPDrm, 0 }, - { X86::VXORPSrr, X86::VXORPSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, - { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, - { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, - { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, - { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, - { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, - { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, - { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, - { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, - { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, - { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, - { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, - { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, - { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, - { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, - { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, - { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, - { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, - { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, - { X86::VHSUBPDYrr, 
X86::VHSUBPDYrm, 0 }, - { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, - { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, - { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 }, - { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, - { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, - { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, - { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 }, - { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, - { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, - { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, - { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, - { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, - { X86::VORPDYrr, X86::VORPDYrm, 0 }, - { X86::VORPSYrr, X86::VORPSYrm, 0 }, - { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, - { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, - { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, - { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, - { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, - { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, - { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, - { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, - { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, - { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, - { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, - { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, - { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, - - // AVX2 foldable instructions - { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, - { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, - { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, - { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, - { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, - { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, - { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, - { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, - { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, - { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, - { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, - { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, - { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, - { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 }, - { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, - { X86::VPANDYrr, X86::VPANDYrm, 0 }, - { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, - { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, - { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, - { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, - { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, - { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, - { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, - { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, - { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, - { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, - { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, - { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, - { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, - { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, - { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, - { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, - { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, - { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, - { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, - { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, - { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, - { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, - { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, - { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, - { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, - { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, - { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, - { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, - { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, - { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, - { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, - { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, - { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, - { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, - { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, - { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, - { X86::VPMINUWYrr, 
X86::VPMINUWYrm, 0 }, - { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, - { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, - { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, - { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, - { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, - { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, - { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, - { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, - { X86::VPORYrr, X86::VPORYrm, 0 }, - { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, - { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, - { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 }, - { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 }, - { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 }, - { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, - { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, - { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, - { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, - { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, - { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, - { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, - { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, - { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, - { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, - { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, - { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, - { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, - { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, - { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, - { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, - { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, - { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, - { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, - { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, - { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, - { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, - { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, - { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, - { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, - { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, - { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, - { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, - { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, - { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, - { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, - { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, - { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, - { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, - { X86::VPXORYrr, X86::VPXORYrm, 0 }, - - // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPS4Yrr, 
X86::VFMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE }, - - // XOP foldable instructions - { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, - { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 }, - { X86::VPCOMBri, X86::VPCOMBmi, 0 }, - { X86::VPCOMDri, X86::VPCOMDmi, 0 }, - { X86::VPCOMQri, X86::VPCOMQmi, 0 }, - { X86::VPCOMWri, X86::VPCOMWmi, 0 }, - { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, - { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, - { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, - { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, - { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, - { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 }, - { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, - { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 }, - { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, - { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, - { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, - { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, - { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, - { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, - { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, - { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, - { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, - { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, - { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, - { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, - { X86::VPPERMrrr, X86::VPPERMrmr, 0 }, - { X86::VPROTBrr, X86::VPROTBrm, 0 }, - { X86::VPROTDrr, X86::VPROTDrm, 0 }, - { X86::VPROTQrr, X86::VPROTQrm, 0 }, - { X86::VPROTWrr, X86::VPROTWrm, 0 }, - { X86::VPSHABrr, X86::VPSHABrm, 0 }, - { X86::VPSHADrr, X86::VPSHADrm, 0 }, - { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, - { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, - { X86::VPSHLBrr, X86::VPSHLBrm, 0 }, - { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, - { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, - { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, - - // BMI/BMI2 foldable instructions - { X86::ANDN32rr, X86::ANDN32rm, 0 }, - { X86::ANDN64rr, X86::ANDN64rm, 0 }, - { X86::MULX32rr, X86::MULX32rm, 0 }, - { X86::MULX64rr, X86::MULX64rm, 0 }, - { X86::PDEP32rr, X86::PDEP32rm, 0 }, - { X86::PDEP64rr, X86::PDEP64rm, 0 }, - { X86::PEXT32rr, X86::PEXT32rm, 0 }, - { X86::PEXT64rr, X86::PEXT64rm, 0 }, - - // ADX foldable instructions - { X86::ADCX32rr, X86::ADCX32rm, 0 }, - { X86::ADCX64rr, X86::ADCX64rm, 0 }, - { X86::ADOX32rr, X86::ADOX32rm, 0 }, - { X86::ADOX64rr, X86::ADOX64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, - { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, - { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, - { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, - { 
X86::VADDSSZrr, X86::VADDSSZrm, 0 }, - { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, - { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, - { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, - { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 }, - { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, - { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, - { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, - { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, - { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, - { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, - { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, - { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, - { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, - { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, - { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, - { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, - { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, - { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 }, - { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 }, - { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 }, - { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 }, - { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 }, - { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 }, - { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 }, - { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 }, - { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, - { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 }, - { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, - { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, - { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, - { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, - { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 }, - { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, - { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, - { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 }, - { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, - { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, - { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, - { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, - { X86::VMINSDZrr, X86::VMINSDZrm, 0 }, - { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, - { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, - { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, - { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, - { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, - { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, - { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, - { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, - { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, - { X86::VORPDZrr, X86::VORPDZrm, 0 }, - { X86::VORPSZrr, X86::VORPSZrm, 0 }, - { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 }, - { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 }, - { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 }, - { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 }, - { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, - { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, - { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, - { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 }, - { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 }, - { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 }, - { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 }, - { X86::VPADDWZrr, X86::VPADDWZrm, 0 }, - { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 }, - { X86::VPANDDZrr, X86::VPANDDZrm, 0 }, - { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, - { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, - { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, - { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 }, - { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 }, - { X86::VPCMPBZrri, 
X86::VPCMPBZrmi, 0 }, - { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, - { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, - { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 }, - { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 }, - { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 }, - { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 }, - { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 }, - { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 }, - { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 }, - { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 }, - { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 }, - { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 }, - { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 }, - { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 }, - { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, - { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, - { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, - { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, - { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, - { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, - { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, - { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, - { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, - { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 }, - { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 }, - { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 }, - { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 }, - { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, - { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, - { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 }, - { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, - { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, - { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 }, - { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 }, - { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, - { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, - { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 }, - { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 }, - { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, - { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, - { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 }, - { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 }, - { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, - { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, - { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 }, - { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, - { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 }, - { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 }, - { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 }, - { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, - { X86::VPORDZrr, X86::VPORDZrm, 0 }, - { X86::VPORQZrr, X86::VPORQZrm, 0 }, - { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 }, - { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, - { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 }, - { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 }, - { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, - { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, - { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 }, - { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 }, - { X86::VPSRADZrr, X86::VPSRADZrm, 0 }, - { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 }, - { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, - { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 }, - { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 }, - { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 }, - { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 }, - { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 }, - { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, - { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, - { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 }, - { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 }, - { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, - { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, - { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, - { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 }, - { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 }, - { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 }, - { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 }, - { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 }, - { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 }, - { X86::VPUNPCKHDQZrr, 
X86::VPUNPCKHDQZrm, 0 }, - { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 }, - { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 }, - { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 }, - { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 }, - { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 }, - { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, - { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, - { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, - { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, - { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, - { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, - { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, - { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, - { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, - { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 }, - { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 }, - { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 }, - { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 }, - { X86::VXORPDZrr, X86::VXORPDZrm, 0 }, - { X86::VXORPSZrr, X86::VXORPSZrm, 0 }, - - // AVX-512{F,VL} foldable instructions - { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, - { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, - { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, - { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, - { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 }, - { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 }, - { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 }, - { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 }, - { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 }, - { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 }, - { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 }, - { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 }, - { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 }, - { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, - { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, - { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, - { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, - { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, - { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, - { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, - { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 }, - { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, - { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, - { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, - { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 }, - { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 }, - { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 }, - { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 }, - { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 }, - { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, - { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, - { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, - { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 }, - { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, - { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, - { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, - { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 }, - { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, - { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, - { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, - { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 }, - { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, - { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, - { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, - { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 }, - { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, - { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, - { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, - { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 }, - { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, - { X86::VORPSZ128rr, 
X86::VORPSZ128rm, 0 }, - { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, - { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 }, - { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 }, - { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 }, - { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 }, - { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 }, - { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 }, - { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 }, - { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 }, - { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, - { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, - { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, - { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 }, - { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 }, - { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 }, - { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 }, - { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 }, - { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 }, - { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 }, - { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 }, - { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 }, - { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 }, - { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 }, - { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 }, - { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 }, - { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 }, - { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 }, - { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 }, - { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 }, - { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 }, - { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 }, - { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 }, - { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, - { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, - { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, - { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 }, - { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 }, - { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 }, - { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 }, - { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, - { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, - { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, - { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 }, - { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 }, - { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 }, - { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 }, - { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 }, - { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 }, - { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 }, - { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 }, - { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 }, - { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 }, - { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 }, - { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 }, - { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 }, - { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 }, - { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 }, - { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 }, - { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 }, - { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 }, - { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 }, - { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 }, - { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 }, - { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 }, - { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 }, - { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 }, - { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 }, - { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 }, - { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 }, - { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 }, - { X86::VPCMPWZ256rri, 
X86::VPCMPWZ256rmi, 0 }, - { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, - { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, - { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, - { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, - { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, - { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, - { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, - { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, - { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, - { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, - { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 }, - { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 }, - { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 }, - { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, - { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, - { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, - { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 }, - { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 }, - { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 }, - { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 }, - { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 }, - { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 }, - { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 }, - { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 }, - { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 }, - { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 }, - { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 }, - { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 }, - { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 }, - { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 }, - { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 }, - { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 }, - { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 }, - { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 }, - { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 }, - { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 }, - { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 }, - { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 }, - { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 }, - { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 }, - { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 }, - { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 }, - { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 }, - { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 }, - { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 }, - { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 }, - { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 }, - { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 }, - { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 }, - { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 }, - { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 }, - { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 }, - { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 }, - { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 }, - { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 }, - { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 }, - { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 }, - { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 }, - { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, - { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, - { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, - { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, - { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 }, - { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 }, - { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, - { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, - { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 }, - { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 }, - { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 }, - { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 }, - { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 }, - { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 }, - 
{ X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 }, - { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 }, - { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 }, - { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 }, - { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 }, - { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 }, - { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 }, - { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 }, - { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 }, - { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 }, - { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 }, - { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 }, - { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 }, - { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 }, - { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 }, - { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 }, - { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 }, - { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 }, - { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 }, - { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 }, - { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 }, - { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 }, - { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 }, - { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 }, - { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 }, - { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 }, - { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 }, - { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 }, - { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 }, - { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 }, - { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, - { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, - { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, - { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 }, - { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 }, - { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 }, - { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 }, - { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 }, - { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 }, - { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 }, - { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 }, - { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 }, - { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 }, - { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 }, - { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 }, - { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 }, - { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 }, - { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 }, - { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 }, - { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 }, - { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 }, - { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 }, - { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 }, - { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 }, - { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 }, - { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 }, - { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 }, - { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 }, - { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 }, - { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 }, - { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 }, - { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 }, - { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 }, - { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, - { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, - { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, - { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 }, - { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 }, - { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 }, - { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 }, - { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, - { 
X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, - { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, - { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, - { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 }, - { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 }, - { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 }, - { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 }, - { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 }, - { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 }, - { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 }, - { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 }, - { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 }, - { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 }, - { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 }, - { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, - { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 }, - { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 }, - { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 }, - { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, - { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, - { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, - { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, - { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, - { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 }, - { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 }, - { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 }, - { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 }, - { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 }, - { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 }, - { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 }, - { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 }, - { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 }, - { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, - { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, - { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, - { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 }, - { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 }, - { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 }, - { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 }, - { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 }, - { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 }, - { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 }, - { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 }, - { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, - { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 }, - { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 }, - { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 }, - { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, - { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, - { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, - { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, - { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 }, - { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 }, - { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 }, - { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE 
}, - { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 }, - { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 }, - { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 }, - { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, - { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, - { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, - { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 }, - { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 }, - { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 }, - { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 }, - { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 }, - { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 }, - { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 }, - { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 }, - { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, - { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, - { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 }, - { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 }, - { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 }, - { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, - { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, - { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, - { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, - { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, - { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 }, - { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 }, - { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 }, - { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 }, - { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 }, - { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 }, - { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 }, - { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 }, - { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 }, - - // AES foldable instructions - { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, - { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, - { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, - { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, - { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, - { X86::VAESDECrr, X86::VAESDECrm, 0 }, - { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, - { X86::VAESENCrr, X86::VAESENCrm, 0 }, - - // SHA foldable instructions - { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, - { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, - { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, - { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, - { X86::SHA256MSG1rr, X86::SHA256MSG1rm, 
TB_ALIGN_16 },
- { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
- { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
- };
-
 for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
   AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
                 Entry.RegOp, Entry.MemOp,
@@ -2435,1103 +150,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
                 Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
 }
- static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
-
- // AVX-512 instructions with 3 source operands.
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, - { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, - { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, - { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, - { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, - { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 }, - { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 }, - { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 }, - { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 }, - { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, - { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, - { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, - { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, - { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, - - // AVX-512VL 256-bit instructions with 3 source operands. - { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 }, - { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 }, - { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 }, - { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 }, - { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 }, - { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 }, - { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 }, - { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 }, - { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 }, - { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, - { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, - { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, - { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, - { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, - - // AVX-512VL 128-bit instructions with 3 source operands. - { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 }, - { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 }, - { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 }, - { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 }, - { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 }, - { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 }, - { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 }, - { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 }, - { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 }, - { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, - { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, - { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, - { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, - { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, - - // AVX-512 masked instructions - { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, - { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, - { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, - { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, - { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, - { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 }, - { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 }, - { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, - { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, - { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, - { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, - { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, - { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, - { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 }, - { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 }, - { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 }, - { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 }, - { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 }, - { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, - { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, - { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, - { X86::VMAXPSZrrkz, 
X86::VMAXPSZrmkz, 0 }, - { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 }, - { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 }, - { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, - { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, - { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, - { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, - { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 }, - { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 }, - { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, - { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, - { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, - { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, - { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 }, - { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 }, - { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 }, - { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 }, - { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, - { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, - { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, - { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 }, - { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 }, - { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 }, - { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 }, - { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 }, - { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 }, - { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 }, - { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, - { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, - { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, - { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 }, - { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 }, - { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, - { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, - { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, - { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, - { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, - { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 }, - { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, - { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, - { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, - { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, - { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 }, - { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 }, - { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 }, - { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 }, - { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 }, - { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 }, - { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 }, - { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 }, - { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 }, - { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 }, - { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 }, - { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 }, - { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 }, - { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 }, - { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 }, - { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 }, - { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 }, - { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 }, - { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 }, - { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 }, - { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 }, - { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, - { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, - { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, - { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 }, - { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 }, - { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 }, - { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 }, - { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 }, - { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 }, - { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 }, - { 
X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 }, - { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 }, - { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 }, - { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 }, - { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 }, - { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 }, - { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 }, - { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 }, - { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 }, - { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 }, - { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 }, - { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, - { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, - { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, - { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 }, - { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 }, - { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 }, - { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 }, - { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 }, - { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 }, - { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 }, - { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 }, - { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 }, - { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 }, - { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 }, - { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 }, - { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, - { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, - { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, - { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 }, - { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 }, - { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, - { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, - { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, - { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, - { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, - { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 }, - { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 }, - { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 256-bit - { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, - { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, - { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, - { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 }, - { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 }, - { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 }, - { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 }, - { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 }, - { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, - { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, - { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 }, - { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 }, - { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 }, - { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 }, - { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, - { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, - { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, - { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, - { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, - { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, - { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, - { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, - { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, - { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, - { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, - { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 }, - { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 }, - { X86::VPACKSSWBZ256rrkz, 
X86::VPACKSSWBZ256rmkz, 0 }, - { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 }, - { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 }, - { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, - { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, - { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, - { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 }, - { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 }, - { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 }, - { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 }, - { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 }, - { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 }, - { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 }, - { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, - { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, - { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, - { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 }, - { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 }, - { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, - { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, - { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, - { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, - { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, - { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, - { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, - { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, - { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, - { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, - { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 }, - { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 }, - { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 }, - { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 }, - { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 }, - { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 }, - { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 }, - { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 }, - { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 }, - { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 }, - { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 }, - { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 }, - { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 }, - { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 }, - { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 }, - { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 }, - { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 }, - { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 }, - { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 }, - { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 }, - { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 }, - { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, - { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, - { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, - { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 }, - { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 }, - { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 }, - { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 }, - { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 }, - { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 }, - { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 }, - { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 }, - { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 }, - { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 }, - { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 }, - { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 }, - { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 }, - { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 }, - { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 }, - { X86::VPSRLVQZ256rrkz, 
X86::VPSRLVQZ256rmkz, 0 }, - { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 }, - { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 }, - { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, - { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, - { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, - { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 }, - { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 }, - { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 }, - { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 }, - { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 }, - { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 }, - { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 }, - { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 }, - { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 }, - { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 }, - { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 }, - { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 }, - { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, - { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, - { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, - { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 }, - { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 }, - { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, - { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, - { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, - { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 }, - { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 }, - { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 }, - { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 }, - { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 128-bit - { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, - { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, - { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, - { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 }, - { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 }, - { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 }, - { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 }, - { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 }, - { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, - { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, - { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, - { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, - { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, - { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, - { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, - { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, - { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, - { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, - { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, - { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, - { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, - { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, - { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 }, - { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 }, - { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 }, - { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 }, - { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, - { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, - { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, - { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 }, - { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 }, - { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 }, - { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 }, - { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 }, - { X86::VPALIGNRZ128rrikz, 
X86::VPALIGNRZ128rmikz, 0 }, - { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 }, - { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, - { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, - { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, - { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 }, - { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 }, - { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, - { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, - { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, - { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, - { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, - { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, - { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 }, - { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 }, - { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 }, - { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 }, - { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 }, - { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 }, - { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 }, - { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 }, - { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 }, - { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 }, - { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 }, - { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 }, - { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 }, - { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 }, - { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 }, - { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 }, - { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 }, - { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 }, - { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 }, - { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 }, - { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 }, - { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, - { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, - { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, - { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 }, - { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 }, - { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 }, - { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 }, - { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 }, - { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 }, - { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 }, - { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 }, - { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 }, - { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 }, - { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 }, - { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 }, - { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 }, - { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 }, - { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 }, - { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 }, - { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 }, - { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 }, - { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, - { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, - { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, - { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 }, - { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 }, - { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 }, - { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 }, - { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 }, - { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 }, - { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 }, - { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 }, - { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 }, - { 
X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 }, - { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 }, - { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 }, - { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, - { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, - { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, - { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 }, - { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 }, - { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, - { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, - { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, - { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 }, - { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 }, - { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 }, - { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 }, - { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, - { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 }, - { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 }, - { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 }, - { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 }, - { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, - { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, - { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, - { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, - { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, - { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 }, - { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 }, - { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 }, - { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 }, - { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 }, - { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 }, - { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 }, - { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 }, - { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 }, - { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, - { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, - { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, - { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 }, - { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 }, - { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 }, - { X86::VPSRADZrik, X86::VPSRADZmik, 0 }, - { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 }, - { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 }, - { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 }, - { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 }, - { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, - { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 }, - { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 }, - { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 }, - { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 }, - { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, - { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, - { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, - { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, - { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 }, - { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 }, - { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 }, - { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrk, 
X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 }, - { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 }, - { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 }, - { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 }, - { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, - { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, - { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 }, - { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 }, - { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 }, - { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 }, - { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 }, - { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 }, - { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 }, - { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 }, - { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 }, - { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 }, - { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 }, - { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 }, - { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, - { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, - { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, - { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, - { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, - { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 }, - { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 }, - { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 }, - { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 }, - { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 }, - { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 }, - { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 }, - { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 }, - { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 }, - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } - auto I = X86InstrFMA3Info::rm_begin(); - auto E = X86InstrFMA3Info::rm_end(); - for (; I != E; ++I) { - if (!I.getGroup()->isKMasked()) { - // Intrinsic forms need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); - } - } - } - - static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { - // AVX-512 foldable masked instructions - { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, - { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, - { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, - { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, - { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, - { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, - { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 }, - { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 }, - { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, - { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, - { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, - { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, - { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, - { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, - { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 }, - { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 }, - { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 }, - { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 }, - { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 }, - { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, - { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, - { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, - { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, - { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 }, - { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 }, - { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, - { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, - { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, - { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, - { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 }, - { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 }, - { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, - { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, - { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, - { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, - { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, - { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 }, - { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 }, - { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 }, - { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 }, - { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, - { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, - { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, - { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 }, - { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 }, - { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 }, - { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 }, - { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 }, - { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 }, - { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 }, - { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, - { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, - { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, - { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 }, - { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 }, - { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, - { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, - { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, - { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, - { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, - { X86::VPERMI2PDrrk, 
X86::VPERMI2PDrmk, 0 }, - { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, - { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, - { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, - { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, - { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, - { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, - { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, - { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, - { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, - { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, - { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, - { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, - { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, - { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, - { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, - { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, - { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 }, - { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 }, - { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 }, - { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 }, - { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 }, - { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 }, - { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 }, - { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 }, - { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 }, - { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 }, - { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 }, - { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 }, - { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 }, - { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 }, - { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 }, - { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 }, - { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 }, - { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 }, - { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 }, - { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 }, - { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 }, - { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, - { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, - { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, - { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 }, - { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 }, - { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 }, - { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 }, - { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 }, - { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 }, - { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 }, - { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 }, - { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 }, - { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 }, - { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 }, - { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 }, - { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 }, - { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 }, - { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 }, - { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 }, - { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 }, - { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 }, - { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, - { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, - { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, - { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 }, - { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 }, - { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, - { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, - { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, - { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, - { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, - { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, - { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, - { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 }, - { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 }, - { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 }, - { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 }, - { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, - { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, - { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, - { 
X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 }, - { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 }, - { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, - { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, - { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, - { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, - { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, - { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 }, - { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 }, - { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, - - // AVX-512{F,VL} foldable masked instructions 256-bit - { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, - { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, - { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, - { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 }, - { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 }, - { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 }, - { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 }, - { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, - { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, - { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, - { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 }, - { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 }, - { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 }, - { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 }, - { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, - { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, - { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, - { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, - { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, - { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, - { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, - { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, - { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, - { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, - { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, - { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, - { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 }, - { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 }, - { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 }, - { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 }, - { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, - { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, - { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, - { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 }, - { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 }, - { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 }, - { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 }, - { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 }, - { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 }, - { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 }, - { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, - { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, - { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, - { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 }, - { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 }, - { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, - { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, - { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, - { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, - { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, - { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, - { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, - { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, - { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, - { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, - { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, - { X86::VPERMPSZ256rrk, 
X86::VPERMPSZ256rmk, 0 }, - { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, - { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, - { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, - { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, - { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, - { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, - { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, - { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, - { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, - { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, - { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 }, - { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 }, - { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 }, - { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 }, - { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 }, - { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 }, - { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 }, - { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 }, - { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 }, - { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 }, - { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 }, - { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 }, - { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 }, - { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 }, - { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 }, - { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 }, - { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 }, - { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 }, - { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 }, - { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 }, - { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 }, - { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, - { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, - { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, - { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 }, - { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 }, - { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 }, - { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 }, - { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 }, - { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 }, - { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 }, - { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 }, - { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 }, - { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 }, - { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 }, - { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 }, - { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 }, - { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 }, - { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 }, - { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 }, - { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 }, - { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 }, - { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, - { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, - { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, - { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 }, - { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 }, - { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 }, - { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, - { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, - { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, - { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, - { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, - { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, - { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, - { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 }, - { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 }, - { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 }, - { 
X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 }, - { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, - { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, - { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, - { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 }, - { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 }, - { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, - { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, - { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, - { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 }, - { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 }, - { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 }, - { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 }, - { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 }, - - // AVX-512{F,VL} foldable instructions 128-bit - { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, - { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, - { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, - { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 }, - { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 }, - { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 }, - { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 }, - { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 }, - { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, - { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, - { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, - { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, - { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, - { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, - { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, - { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, - { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, - { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, - { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, - { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, - { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, - { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, - { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 }, - { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 }, - { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 }, - { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 }, - { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, - { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, - { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, - { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 }, - { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 }, - { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 }, - { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 }, - { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 }, - { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 }, - { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 }, - { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, - { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, - { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, - { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 }, - { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 }, - { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, - { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, - { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, - { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, - { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, - { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, - { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, - { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, - { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, - { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, - { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, - { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, - { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, - { 
X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, - { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, - { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, - { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, - { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, - { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 }, - { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 }, - { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 }, - { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 }, - { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 }, - { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 }, - { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 }, - { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 }, - { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 }, - { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 }, - { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 }, - { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 }, - { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 }, - { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 }, - { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 }, - { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 }, - { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 }, - { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 }, - { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 }, - { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 }, - { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 }, - { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, - { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, - { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, - { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 }, - { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 }, - { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 }, - { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 }, - { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 }, - { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 }, - { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 }, - { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 }, - { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 }, - { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 }, - { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 }, - { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 }, - { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 }, - { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 }, - { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 }, - { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 }, - { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 }, - { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 }, - { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, - { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, - { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, - { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 }, - { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 }, - { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 }, - { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, - { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, - { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, - { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, - { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, - { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, - { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, - { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 }, - { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 }, - { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 }, - { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 }, - { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, - { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, - { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, - { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 }, - { X86::VSHUFPSZ128rrik, 
X86::VSHUFPSZ128rmik, 0 }, - { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, - { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, - { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, - { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 }, - { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 }, - { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, - { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, - { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, - - // 512-bit three source instructions with zero masking. - { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, - { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, - { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, - { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, - { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, - { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, - { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, - { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, - { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, - { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, - { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, - { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, - { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, - { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, - - // 256-bit three source instructions with zero masking. - { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, - { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, - { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, - { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, - { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, - { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, - { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, - { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, - { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, - { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, - { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, - { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, - { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, - { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, - - // 128-bit three source instructions with zero masking. - { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, - { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, - { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, - { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, - { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, - { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, - { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, - { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, - { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, - { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, - { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, - { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, - { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, - { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, - }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, @@ -3539,20 +163,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } - for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) { - if (I.getGroup()->isKMasked()) { - // Intrinsics need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); - } - } - } } void @@ -5930,7 +2540,7 @@ void X86InstrInfo::replaceBranchWithTailCall( // Add implicit uses and defs of all live regs potentially clobbered by the // call. This way they still appear live across the call. - LivePhysRegs LiveRegs(&getRegisterInfo()); + LivePhysRegs LiveRegs(getRegisterInfo()); LiveRegs.addLiveOuts(MBB); SmallVector, 8> Clobbers; LiveRegs.stepForward(*MIB, Clobbers); @@ -6545,9 +3155,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // first frame index. // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. - const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterInfo &TRI = getRegisterInfo(); MachineBasicBlock::LivenessQueryResult LQR = - MBB.computeRegisterLiveness(TRI, AX, MI); + MBB.computeRegisterLiveness(&TRI, AX, MI); // We do not want to save and restore AX if we do not have to. // Moreover, if we do so whereas AX is dead, we would need to set // an undef flag on the use of AX, otherwise the verifier will @@ -6564,7 +3174,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } // AX contains the top most register in the aliasing hierarchy. // It may not be live, but one of its aliases may be. - for (MCRegAliasIterator AI(AX, TRI, true); + for (MCRegAliasIterator AI(AX, &TRI, true); AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live : MachineBasicBlock::LQR_Dead; @@ -8374,7 +4984,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); @@ -10473,7 +7083,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const { // catch it. if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) || MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) || - MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) + MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) return MachineOutlinerInstrType::Illegal; // Outlined calls change the instruction pointer, so don't read from it. diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 01df07e1715f..fab70e918b8a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -813,6 +813,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">, AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">, + AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">; def HasPFI : Predicate<"Subtarget->hasPFI()">, AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; def HasERI : Predicate<"Subtarget->hasERI()">, @@ -1436,11 +1438,14 @@ def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), // Longer forms that use a ModR/M byte. 
Needed for disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8ri">; def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16ri">; def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32ri">; } } // SchedRW @@ -1563,13 +1568,17 @@ def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8rr">; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16rr">; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32rr">; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV64rr">; } let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index dc3800ce381b..2c047722db24 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -248,7 +248,8 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (MMX_X86movd2w (x86mmx VR64:$src)))], - IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>, + FoldGenData<"MMX_MOVD64rr">; let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), @@ -277,7 +278,7 @@ def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; + IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">; } } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f73d85e7e01b..a3e677209305 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -507,7 +507,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr { + string asm_opr, Domain d = GenericDomain, + string Name> { let isCommutable = 1 in def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), @@ -521,15 +522,17 @@ multiclass sse12_move_rr, Sched<[WriteFShuffle]>; + [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>, + FoldGenData; } multiclass sse12_move { + Domain d = GenericDomain, string Name> { // AVX defm V#NAME : sse12_move_rr, + "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}", d, + "V"#Name>, VEX_4V, VEX_LIG, VEX_WIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), @@ -539,7 +542,7 @@ multiclass sse12_move; + "\t{$src2, $dst|$dst, $src2}", d, Name>; } def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), @@ -563,9 +566,9 @@ multiclass sse12_move_rm, XS; + SSEPackedSingle, "MOVSS">, XS; defm MOVSD : sse12_move, XD; + SSEPackedDouble, "MOVSD">, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { defm MOVSS : sse12_move_rm, VEX, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVAPSrr">; def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVAPDrr">; def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVUPSrr">; def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVUPDrr">; def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVAPSYrr">; def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVAPDYrr">; def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVUPSYrr">; def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVUPDYrr">; } // Aliases to help the assembler pick two byte VEX encodings by swapping the @@ -938,16 +949,16 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">; def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">; def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>; + IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">; def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>; + IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">; } let Predicates = [HasAVX, NoVLX] in { @@ -3752,17 +3763,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX, VEX_WIG; + VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 
"movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVDQAYrr">; def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX, VEX_WIG; + VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVDQUYrr">; } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, @@ -3820,11 +3833,12 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">; def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>, + FoldGenData<"MOVDQUrr">; } } // SchedRW @@ -5915,7 +5929,7 @@ multiclass SS41I_extract16 opc, string OpcodeStr> { (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[WriteShuffle]>; + []>, Sched<[WriteShuffle]>, FoldGenData; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 53224431c0e9..5dde2d07babe 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -111,7 +111,7 @@ multiclass xop3op opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XOP_4V, VEX_W, Sched<[WriteVarVecShift]>; + XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -282,7 +282,7 @@ multiclass xop4op opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W; + []>, XOP_4V, VEX_W, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -318,7 +318,7 @@ multiclass xop4op_int opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W; + []>, XOP_4V, VEX_W, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -357,7 +357,7 @@ multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W; + []>, VEX_W, FoldGenData; } let ExeDomain = SSEPackedDouble in { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 61956f741820..77dead8d2413 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -302,6 +302,26 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } else if (Ty.isVector() && Ty.getSizeInBits() == 256) { + if (Alignment >= 32) + return Isload ? (HasVLX ? X86::VMOVAPSZ256rm + : HasAVX512 ? 
X86::VMOVAPSZ256rm_NOVLX + : X86::VMOVAPSYrm) + : (HasVLX ? X86::VMOVAPSZ256mr + : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX + : X86::VMOVAPSYmr); + else + return Isload ? (HasVLX ? X86::VMOVUPSZ256rm + : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX + : X86::VMOVUPSYrm) + : (HasVLX ? X86::VMOVUPSZ256mr + : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX + : X86::VMOVUPSYmr); + } else if (Ty.isVector() && Ty.getSizeInBits() == 512) { + if (Alignment >= 64) + return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } return Opc; } diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index da724f5d8989..979aaee110aa 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -35,6 +35,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoSSE1(); setLegalizerInfoSSE2(); setLegalizerInfoSSE41(); + setLegalizerInfoAVX(); setLegalizerInfoAVX2(); setLegalizerInfoAVX512(); setLegalizerInfoAVX512DQ(); @@ -209,6 +210,18 @@ void X86LegalizerInfo::setLegalizerInfoSSE41() { setAction({G_MUL, v4s32}, Legal); } +void X86LegalizerInfo::setLegalizerInfoAVX() { + if (!Subtarget.hasAVX()) + return; + + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + for (unsigned MemOp : {G_LOAD, G_STORE}) + for (auto Ty : {v8s32, v4s64}) + setAction({MemOp, Ty}, Legal); +} + void X86LegalizerInfo::setLegalizerInfoAVX2() { if (!Subtarget.hasAVX2()) return; @@ -239,6 +252,10 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { setAction({G_MUL, v16s32}, Legal); + for (unsigned MemOp : {G_LOAD, G_STORE}) + for (auto Ty : {v16s32, v8s64}) + setAction({MemOp, Ty}, Legal); + /************ VLX *******************/ if (!Subtarget.hasVLX()) return; diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h index ab5405a70427..135950a95f84 100644 --- a/lib/Target/X86/X86LegalizerInfo.h +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -39,6 +39,7 @@ private: void setLegalizerInfoSSE1(); void setLegalizerInfoSSE2(); void setLegalizerInfoSSE41(); + void setLegalizerInfoAVX(); void setLegalizerInfoAVX2(); void setLegalizerInfoAVX512(); void setLegalizerInfoAVX512DQ(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 2b1f43bffd71..84ec98484f8e 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -286,6 +286,7 @@ void X86Subtarget::initializeEnvironment() { HasCDI = false; HasPFI = false; HasDQI = false; + HasVPOPCNTDQ = false; HasBWI = false; HasVLX = false; HasADX = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index a9f3a2aee1be..550e95c39ab5 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -270,6 +270,9 @@ protected: /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI; + /// Processor has AVX-512 population count Instructions + bool HasVPOPCNTDQ; + /// Processor has AVX-512 Doubleword and Quadword instructions bool HasDQI; @@ -494,6 +497,7 @@ public: bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } + bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } bool hasDQI() const { return HasDQI; } diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index a97db6fde454..5cf2a8c25d83 100644 --- 
a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -124,6 +124,7 @@ struct CoroCleanup : FunctionPass { if (!L) AU.setPreservesAll(); } + StringRef getPassName() const override { return "Coroutine Cleanup"; } }; } diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index e8bb0ca99d8a..b52989186165 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -208,6 +208,9 @@ struct CoroEarly : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } + StringRef getPassName() const override { + return "Lower early coroutine intrinsics"; + } }; } diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp index c6ac3f614ff7..acb22449142b 100644 --- a/lib/Transforms/Coroutines/CoroElide.cpp +++ b/lib/Transforms/Coroutines/CoroElide.cpp @@ -301,6 +301,7 @@ struct CoroElide : FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); } + StringRef getPassName() const override { return "Coroutine Elision"; } }; } diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 417d57f7625b..85e9003ec3c5 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -799,9 +799,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { splitAround(CSI, "CoroSuspend"); } - // Put fallthrough CoroEnd into its own block. Note: Shape::buildFrom places - // the fallthrough coro.end as the first element of CoroEnds array. - splitAround(Shape.CoroEnds.front(), "CoroEnd"); + // Put CoroEnds into their own blocks. + for (CoroEndInst *CE : Shape.CoroEnds) + splitAround(CE, "CoroEnd"); // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will // never has its definition separated from the PHI by the suspend point. @@ -813,19 +813,24 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; - // See if there are materializable instructions across suspend points. - for (Instruction &I : instructions(F)) - if (materializable(I)) - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) - Spills.emplace_back(&I, U); + for (int Repeat = 0; Repeat < 4; ++Repeat) { + // See if there are materializable instructions across suspend points. + for (Instruction &I : instructions(F)) + if (materializable(I)) + for (User *U : I.users()) + if (Checker.isDefinitionAcrossSuspend(I, U)) + Spills.emplace_back(&I, U); - // Rewrite materializable instructions to be materialized at the use point. - DEBUG(dump("Materializations", Spills)); - rewriteMaterializableInstructions(Builder, Spills); + if (Spills.empty()) + break; + + // Rewrite materializable instructions to be materialized at the use point. + DEBUG(dump("Materializations", Spills)); + rewriteMaterializableInstructions(Builder, Spills); + Spills.clear(); + } // Collect the spills for arguments and other not-materializable values. 
- Spills.clear(); for (Argument &A : F.args()) for (User *U : A.users()) if (Checker.isDefinitionAcrossSuspend(A, U)) @@ -847,8 +852,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { if (I.getType()->isTokenTy()) report_fatal_error( "token definition is separated from the use by a suspend point"); - assert(!materializable(I) && - "rewriteMaterializable did not do its job"); Spills.emplace_back(&I, U); } } diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 12eb16789825..cd549e4be282 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -228,15 +228,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, SmallVector Returns; - if (DISubprogram *SP = F.getSubprogram()) { - // If we have debug info, add mapping for the metadata nodes that should not - // be cloned by CloneFunctionInfo. - auto &MD = VMap.MD(); - MD[SP->getUnit()].reset(SP->getUnit()); - MD[SP->getType()].reset(SP->getType()); - MD[SP->getFile()].reset(SP->getFile()); - } - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/false, Returns); // Remove old returns. for (ReturnInst *Return : Returns) @@ -509,12 +501,87 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } +static SmallPtrSet getCoroBeginPredBlocks(CoroBeginInst *CB) { + // Collect all blocks that we need to look for instructions to relocate. + SmallPtrSet RelocBlocks; + SmallVector Work; + Work.push_back(CB->getParent()); + + do { + BasicBlock *Current = Work.pop_back_val(); + for (BasicBlock *BB : predecessors(Current)) + if (RelocBlocks.count(BB) == 0) { + RelocBlocks.insert(BB); + Work.push_back(BB); + } + } while (!Work.empty()); + return RelocBlocks; +} + +static SmallPtrSet +getNotRelocatableInstructions(CoroBeginInst *CoroBegin, + SmallPtrSetImpl &RelocBlocks) { + SmallPtrSet DoNotRelocate; + // Collect all instructions that we should not relocate + SmallVector Work; + + // Start with CoroBegin and terminators of all preceding blocks. + Work.push_back(CoroBegin); + BasicBlock *CoroBeginBB = CoroBegin->getParent(); + for (BasicBlock *BB : RelocBlocks) + if (BB != CoroBeginBB) + Work.push_back(BB->getTerminator()); + + // For every instruction in the Work list, place its operands in DoNotRelocate + // set. + do { + Instruction *Current = Work.pop_back_val(); + DoNotRelocate.insert(Current); + for (Value *U : Current->operands()) { + auto *I = dyn_cast(U); + if (!I) + continue; + if (isa(U)) + continue; + if (DoNotRelocate.count(I) == 0) { + Work.push_back(I); + DoNotRelocate.insert(I); + } + } + } while (!Work.empty()); + return DoNotRelocate; +} + +static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { + // Analyze which non-alloca instructions are needed for allocation and + // relocate the rest to after coro.begin. We need to do it, since some of the + // targets of those instructions may be placed into coroutine frame memory + // for which becomes available after coro.begin intrinsic. + + auto BlockSet = getCoroBeginPredBlocks(CoroBegin); + auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + + Instruction *InsertPt = CoroBegin->getNextNode(); + BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. 
+ for (auto B = BB.begin(), E = BB.end(); B != E;) { + Instruction &I = *B++; + if (isa(&I)) + continue; + if (&I == CoroBegin) + break; + if (DoNotRelocateSet.count(&I)) + continue; + I.moveBefore(InsertPt); + } +} + static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); + relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); @@ -660,6 +727,7 @@ struct CoroSplit : public CallGraphSCCPass { void getAnalysisUsage(AnalysisUsage &AU) const override { CallGraphSCCPass::getAnalysisUsage(AU); } + StringRef getPassName() const override { return "Coroutine Splitting"; } }; } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 8dff2fb3be8a..4c417f1c55eb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -558,17 +558,17 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; + std::unique_ptr TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); - CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI); + TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); + CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(*GetBFI)(*Caller); @@ -591,10 +591,6 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( else CallSiteToProfCountMap[User] = 0; } - if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; - } } Function *PartialInlinerImpl::unswitchFunction(Function *F) { diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index ec06d5f9fb05..9fd3a9021a27 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -155,6 +155,10 @@ static cl::opt cl::Hidden, cl::desc("Enable the simple loop unswitch pass.")); +static cl::opt EnableGVNSink( + "enable-gvn-sink", cl::init(false), cl::Hidden, + cl::desc("Enable the GVN sinking pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -307,6 +311,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createEarlyCSEPass()); // Catch trivial redundancies if (EnableGVNHoist) MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } + // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. @@ -904,6 +913,12 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel != 0) addLTOOptimizationPasses(PM); + else { + // The whole-program-devirt pass needs to run at -O0 because only it knows + // about the llvm.type.checked.load intrinsic: it needs to both lower the + // intrinsic itself and handle it in the summary. + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); + } // Create a function that performs CFI checks for cross-DSO calls with targets // in the current module. 
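The PassManagerBuilder hunk above wires the new GVN sinking pass into the legacy pipeline behind an off-by-default -enable-gvn-sink flag and follows it with a CFG simplification run. As a rough illustration of that same pairing outside the builder, here is a minimal sketch using the legacy pass manager; createGVNSinkPass and createCFGSimplificationPass are the entry points referenced in the hunk, while the header choices and the runGVNSinkPipeline wrapper are assumptions made only for this example.

// Sketch only: run GVNSink followed by CFG simplification over a module,
// mirroring the pipeline ordering introduced above. Header locations are
// assumed; adjust to wherever createGVNSinkPass is declared in your tree.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"

void runGVNSinkPipeline(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createGVNSinkPass());           // sink common code into successors
  PM.add(llvm::createCFGSimplificationPass()); // clean up blocks emptied by sinking
  PM.run(M);
}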
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 733eeb1767a3..7204bf517681 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -861,12 +861,9 @@ bool InstCombiner::willNotOverflowSignedSub(const Value *LHS, ComputeNumSignBits(RHS, 0, &CxtI) > 1) return true; - unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, 0, &CxtI); + KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI); - KnownBits RHSKnown(BitWidth); - computeKnownBits(RHS, RHSKnown, 0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI); // Subtraction of two 2's complement numbers having identical signs will // never overflow. @@ -1059,9 +1056,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // If this is a xor that was canonicalized from a sub, turn it back into // a sub and fuse this add with it. if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { - IntegerType *IT = cast(I.getType()); - KnownBits LHSKnown(IT->getBitWidth()); - computeKnownBits(XorLHS, LHSKnown, 0, &I); + KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I); if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -1577,8 +1572,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. if (Op0C->isMask()) { - KnownBits RHSKnown(BitWidth); - computeKnownBits(Op1, RHSKnown, 0, &I); + KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); if ((*Op0C | RHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateXor(Op1, Op0); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 4227b2d01be8..1f8319efb3be 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1610,17 +1610,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index face7abcc95f..92a38f26dde7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1378,9 +1378,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { if (!IT) return nullptr; - unsigned BitWidth = IT->getBitWidth(); 
- KnownBits Known(BitWidth); - IC.computeKnownBits(Op0, Known, 0, &II); + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; @@ -1401,7 +1399,9 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // If the input to cttz/ctlz is known to be non-zero, // then change the 'ZeroIsUndef' parameter to 'true' // because we know the zero behavior can't affect the result. - if (Known.One != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { + if (Known.One != 0 || + isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, + &IC.getDominatorTree())) { if (!match(II.getArgOperand(1), m_One())) { II.setOperand(1, IC.Builder->getTrue()); return &II; diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index f4bf5221f6a2..766939c56dff 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -692,8 +692,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: - KnownBits Known(Op1C->getType()->getBitWidth()); - computeKnownBits(ICI->getOperand(0), Known, 0, &CI); + KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -737,14 +736,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // may lead to additional simplifications. if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { if (IntegerType *ITy = dyn_cast(CI.getType())) { - uint32_t BitWidth = ITy->getBitWidth(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); - KnownBits KnownLHS(BitWidth); - KnownBits KnownRHS(BitWidth); - computeKnownBits(LHS, KnownLHS, 0, &CI); - computeKnownBits(RHS, KnownRHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; @@ -1063,9 +1059,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // the icmp and sext into bitwise/integer operations. if (ICI->hasOneUse() && ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ - unsigned BitWidth = Op1C->getType()->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Op0, Known, 0, &CI); + KnownBits Known = computeKnownBits(Op0, 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { @@ -1104,7 +1098,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // Distribute the bit over the whole bit width. 
In = Builder->CreateAShr(In, ConstantInt::get(In->getType(), - BitWidth - 1), "sext"); + KnownZeroMask.getBitWidth() - 1), "sext"); } if (CI.getType() == In->getType()) diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6492eaedae9c..2c2b7317a1c0 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1402,9 +1402,9 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { if (*C == 0 && Pred == ICmpInst::ICMP_SGT) { SelectPatternResult SPR = matchSelectPattern(X, A, B); if (SPR.Flavor == SPF_SMIN) { - if (isKnownPositive(A, DL)) + if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, B, Cmp.getOperand(1)); - if (isKnownPositive(B, DL)) + if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, A, Cmp.getOperand(1)); } } @@ -1478,8 +1478,7 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, // of the high bits truncated out of x are known. unsigned DstBits = Trunc->getType()->getScalarSizeInBits(), SrcBits = X->getType()->getScalarSizeInBits(); - KnownBits Known(SrcBits); - computeKnownBits(X, Known, 0, &Cmp); + KnownBits Known = computeKnownBits(X, 0, &Cmp); // If all the high bits are known, we can do this xform. if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) { @@ -3030,18 +3029,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { break; case Instruction::Add: case Instruction::Sub: - case Instruction::Xor: + case Instruction::Xor: { if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); - // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b - if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { - if (CI->getValue().isSignMask()) { + + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C))) { + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + if (C->isSignMask()) { ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } - if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { + // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b + if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) { ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); NewPred = I.getSwappedPredicate(NewPred); @@ -3049,26 +3051,30 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } break; - case Instruction::Mul: + } + case Instruction::Mul: { if (!I.isEquality()) break; - if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { - // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask - // Mask = -1 >> count-trailing-zeros(Cst). - if (!CI->isZero() && !CI->isOne()) { - const APInt &AP = CI->getValue(); - ConstantInt *Mask = ConstantInt::get( - I.getContext(), - APInt::getLowBitsSet(AP.getBitWidth(), - AP.getBitWidth() - AP.countTrailingZeros())); + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C)) && *C != 0 && *C != 1) { + // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask) + // Mask = -1 >> count-trailing-zeros(C). 
+ if (unsigned TZs = C->countTrailingZeros()) { + Constant *Mask = ConstantInt::get( + BO0->getType(), + APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs)); Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask); Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); return new ICmpInst(Pred, And1, And2); } + // If there are no trailing zeros in the multiplier, just eliminate + // the multiplies (no masking is needed): + // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } break; - + } case Instruction::UDiv: case Instruction::LShr: if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) @@ -4497,7 +4503,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // if A is a power of 2. if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Zero()) && - isKnownToBeAPowerOfTwo(A, DL, false, 0, &AC, &I, &DT) && I.isEquality()) + isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality()) return new ICmpInst(I.getInversePredicate(), Builder->CreateAnd(A, B), Op1); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 6829be86885b..56f133de3de1 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -540,6 +540,12 @@ public: return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); } + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, const Instruction *CxtI = nullptr) const { return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index fc13854f8fe7..4d408359eeea 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -47,9 +47,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // inexact. Similarly for <<. BinaryOperator *I = dyn_cast(V); if (I && I->isLogicalShift() && - isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0, - &IC.getAssumptionCache(), &CxtI, - &IC.getDominatorTree())) { + IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. 
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { @@ -1240,7 +1238,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return BO; } - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have @@ -1487,7 +1485,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { I.getType()); // X urem Y -> X and Y-1, where Y is a power of 2, - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 219effce7ba5..b40d067b2817 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -44,7 +44,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *A; Constant *C; if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C)))) - if (isKnownNonNegative(A, DL) && isKnownNonNegative(C, DL)) + if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) && + isKnownNonNegative(C, DL, 0, &AC, &I, &DT)) return BinaryOperator::Create( I.getOpcode(), Builder->CreateBinOp(I.getOpcode(), Op0, C), A); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4028a92771a4..5df55f01b83f 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -158,8 +158,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 are known to be clear if zero in either the LHS | RHS. APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; @@ -192,8 +192,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are only known if clear in both the LHS & RHS. 
APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; @@ -224,8 +224,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | @@ -313,8 +313,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(I, 1, DemandedMask) || @@ -325,15 +325,19 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Known.One = RHSKnown.One & LHSKnown.One; Known.Zero = RHSKnown.Zero & LHSKnown.Zero; break; + case Instruction::ZExt: case Instruction::Trunc: { - unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits(); - DemandedMask = DemandedMask.zext(truncBf); - Known = Known.zext(truncBf); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth); + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1)) return I; - DemandedMask = DemandedMask.trunc(BitWidth); - Known = Known.trunc(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + Known = Known.zextOrTrunc(BitWidth); + // Any top bits are known to be zero. + if (BitWidth > SrcBitWidth) + Known.Zero.setBitsFrom(SrcBitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::BitCast: @@ -355,56 +359,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; - case Instruction::ZExt: { - // Compute the bits in the result that are not present in the input. - unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); - - DemandedMask = DemandedMask.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) - return I; - DemandedMask = DemandedMask.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - // The top bits are known to be zero. - Known.Zero.setBitsFrom(SrcBitWidth); - break; - } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. 
- unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); - APInt InputDemandedBits = DemandedMask & - APInt::getLowBitsSet(BitWidth, SrcBitWidth); + APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth); - APInt NewBits(APInt::getBitsSetFrom(BitWidth, SrcBitWidth)); // If any of the sign extended bits are demanded, we know that the sign // bit is demanded. - if ((NewBits & DemandedMask) != 0) + if (DemandedMask.getActiveBits() > SrcBitWidth) InputDemandedBits.setBit(SrcBitWidth-1); - InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, InputDemandedBits, Known, Depth + 1)) + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1)) return I; - InputDemandedBits = InputDemandedBits.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - - // If the sign bit of the input is known set or clear, then we know the - // top bits of the result. // If the input sign bit is known zero, or if the NewBits are not demanded // convert this into a zero extension. - if (Known.Zero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { - // Convert to ZExt cast + if (InputKnown.isNonNegative() || + DemandedMask.getActiveBits() <= SrcBitWidth) { + // Convert to ZExt cast. CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); return InsertNewInstWith(NewCast, *I); - } else if (Known.One[SrcBitWidth-1]) { // Input sign bit known set - Known.One |= NewBits; - } + } + + // If the sign bit of the input is known set or clear, then we know the + // top bits of the result. + Known = InputKnown.sext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::Add: @@ -467,7 +451,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShiftAmt; Known.One <<= ShiftAmt; // low bits known zero. @@ -491,7 +475,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShiftAmt); Known.One.lshrInPlace(ShiftAmt); if (ShiftAmt) @@ -535,7 +519,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now. 
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); Known.Zero.lshrInPlace(ShiftAmt); @@ -590,7 +574,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One)) Known.One |= ~LowBits; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 7ed9fd566b37..2730afc5c5b9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1963,6 +1963,7 @@ static bool isAllocSiteRemovable(Instruction *AI, // Give up the moment we see something we can't handle. return false; + case Instruction::AddrSpaceCast: case Instruction::BitCast: case Instruction::GetElementPtr: Users.emplace_back(I); @@ -2064,7 +2065,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa(I) || isa(I)) { + } else if (isa(I) || isa(I) || + isa(I)) { replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); @@ -2180,8 +2182,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. - KnownBits Known(VTy->getPrimitiveSizeInBits()); - computeKnownBits(ResultOp, Known, 0, &RI); + KnownBits Known = computeKnownBits(ResultOp, 0, &RI); if (Known.isConstant()) RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant())); @@ -2242,9 +2243,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } - unsigned BitWidth = cast(Cond->getType())->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Cond, Known, 0, &SI); + KnownBits Known = computeKnownBits(Cond, 0, &SI); unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); @@ -2257,12 +2256,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes()); } - unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes); + unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); // Shrink the condition operand if the new type is smaller than the old type. // This may produce a non-standard type for the switch, but that's ok because // the backend should extend back to a legal type for the target. - if (NewWidth > 0 && NewWidth < BitWidth) { + if (NewWidth > 0 && NewWidth < Known.getBitWidth()) { IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); Builder->SetInsertPoint(&SI); Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc"); @@ -2841,9 +2840,7 @@ bool InstCombiner::run() { // a value even when the operands are not all constants. 
Type *Ty = I->getType(); if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { - unsigned BitWidth = Ty->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(I, Known, /*Depth*/0, I); + KnownBits Known = computeKnownBits(I, /*Depth*/0, I); if (Known.isConstant()) { Constant *C = ConstantInt::get(Ty, Known.getConstant()); DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 990bcec109de..1e30dbf6b55a 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -180,7 +180,7 @@ static cl::opt static cl::opt PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden, cl::desc("Use this option to turn on/off " - "memory instrinsic size profiling.")); + "memory intrinsic size profiling.")); // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 4bc0a7133118..300085eccb0c 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -401,7 +401,10 @@ static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, if (Options.NoPrune || &F.getEntryBlock() == BB) return true; - return !(isFullDominator(BB, DT) || isFullPostDominator(BB, PDT)); + // Do not instrument full dominators, or full post-dominators with multiple + // predecessors. + return !isFullDominator(BB, DT) + && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor()); } bool SanitizerCoverageModule::runOnFunction(Function &F) { diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 523390758769..f5196cc46181 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMScalarOpts GuardWidening.cpp GVN.cpp GVNHoist.cpp + GVNSink.cpp IVUsersPrinter.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index f62e111460ca..c3810366bf22 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -164,9 +164,9 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// \brief Given \p BBs as input, find another set of BBs which collectively /// dominates \p BBs and have the minimal sum of frequencies. Return the BB /// set found in \p BBs. -void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, - BasicBlock *Entry, - SmallPtrSet &BBs) { +static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. 
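A small sketch of the pruning rule introduced in shouldInstrumentBlock above, with the dominator queries reduced to hypothetical boolean inputs: a block is skipped when it is a full dominator, or when it is a full post-dominator that has more than one predecessor; full post-dominators with a single predecessor keep their counter so edge coverage is not lost.

static bool shouldInstrument(bool NoPrune, bool IsEntry, bool IsFullDominator,
                             bool IsFullPostDominator, bool HasSinglePred) {
  if (NoPrune || IsEntry)
    return true;
  // Same shape as the patched return statement, spelled out on booleans.
  return !IsFullDominator && !(IsFullPostDominator && !HasSinglePred);
}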
SmallPtrSet Path; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f6455..0d6e0538261d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative; SmallVector varargs; - Expression(uint32_t o = ~2U) : opcode(o) {} + Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? 
VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +559,29 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); + BlockRPONumber.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is PHINode, V <--> value number is an one-to-one mapping. + if (isa(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -1451,6 +1469,104 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +void GVN::ValueTable::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. +uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + if (BlockRPONumber[Pred] >= BlockRPONumber[PhiBlock]) + return Num; + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If there is any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. 
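A standalone sketch of two pieces of the GVN ValueTable changes above, using toy types and std::map rather than GVN::Expression and DenseMap: commutative operands are kept in sorted order (the property the new `commutative` flag records so phiTranslateImpl can restore it after rewriting operands), and assignExpNewValueNum hands out a number for an expression while reporting whether the number was newly created.

#include <algorithm>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

struct ToyExpression {
  uint32_t Opcode = 0;
  bool Commutative = false;
  std::vector<uint32_t> VarArgs; // operand value numbers

  // Keep commutative operands sorted so `a + b` and `b + a` compare equal.
  void canonicalizeCommutative() {
    Commutative = true;
    if (VarArgs.size() == 2 && VarArgs[0] > VarArgs[1])
      std::swap(VarArgs[0], VarArgs[1]);
  }

  bool operator<(const ToyExpression &O) const {
    return std::tie(Opcode, Commutative, VarArgs) <
           std::tie(O.Opcode, O.Commutative, O.VarArgs);
  }
};

struct ToyValueTable {
  std::map<ToyExpression, uint32_t> ExpressionNumbering;
  uint32_t NextValueNumber = 1;

  // Mirrors assignExpNewValueNum: {value number, was a new number created?}.
  std::pair<uint32_t, bool> assignExpNewValueNum(const ToyExpression &Exp) {
    uint32_t &Num = ExpressionNumbering[Exp];
    bool CreatedNew = (Num == 0);
    if (CreatedNew)
      Num = NextValueNumber++;
    return {Num, CreatedNew};
  }
};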
+ if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (ExprIdx[Num] == 0 || Num >= ExprIdx.size()) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1856,6 +1972,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); + VN.assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1945,7 +2062,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Instr->getParent(), VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1962,10 +2081,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2014,7 +2135,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast(nullptr), P)); PREPred = P; diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp new file mode 100644 index 000000000000..5c75f39e381d --- /dev/null +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -0,0 +1,872 @@ +//===- GVNSink.cpp - sink expressions into successors -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file GVNSink.cpp +/// This pass attempts to sink instructions into successors, reducing static +/// instruction count and enabling if-conversion. +/// +/// We use a variant of global value numbering to decide what can be sunk. 
+/// Consider: +/// +/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ] +/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ] +/// \ / +/// [ %e = phi i32 %a2, %c2 ] +/// [ add i32 %e, 4 ] +/// +/// +/// GVN would number %a1 and %c1 differently because they compute different +/// results - the VN of an instruction is a function of its opcode and the +/// transitive closure of its operands. This is the key property for hoisting +/// and CSE. +/// +/// What we want when sinking however is for a numbering that is a function of +/// the *uses* of an instruction, which allows us to answer the question "if I +/// replace %a1 with %c1, will it contribute in an equivalent way to all +/// successive instructions?". The PostValueTable class in GVN provides this +/// mapping. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include +using namespace llvm; + +#define DEBUG_TYPE "gvn-sink" + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace { + +static bool isMemoryInst(const Instruction *I) { + return isa(I) || isa(I) || + (isa(I) && !cast(I)->doesNotAccessMemory()) || + (isa(I) && !cast(I)->doesNotAccessMemory()); +} + +/// Iterates through instructions in a set of blocks in reverse order from the +/// first non-terminator. For example (assume all blocks have size n): +/// LockstepReverseIterator I([B1, B2, B3]); +/// *I-- = [B1[n], B2[n], B3[n]]; +/// *I-- = [B1[n-1], B2[n-1], B3[n-1]]; +/// *I-- = [B1[n-2], B2[n-2], B3[n-2]]; +/// ... +/// +/// It continues until all blocks have been exhausted. Use \c getActiveBlocks() +/// to +/// determine which blocks are still going and the order they appear in the +/// list returned by operator*. +class LockstepReverseIterator { + ArrayRef Blocks; + SmallPtrSet ActiveBlocks; + SmallVector Insts; + bool Fail; + +public: + LockstepReverseIterator(ArrayRef Blocks) : Blocks(Blocks) { + reset(); + } + + void reset() { + Fail = false; + ActiveBlocks.clear(); + for (BasicBlock *BB : Blocks) + ActiveBlocks.insert(BB); + Insts.clear(); + for (BasicBlock *BB : Blocks) { + if (BB->size() <= 1) { + // Block wasn't big enough - only contained a terminator. 
+ ActiveBlocks.erase(BB); + continue; + } + Insts.push_back(BB->getTerminator()->getPrevNode()); + } + if (Insts.empty()) + Fail = true; + } + + bool isValid() const { return !Fail; } + ArrayRef operator*() const { return Insts; } + SmallPtrSet &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallPtrSetImpl &Blocks) { + for (auto II = Insts.begin(); II != Insts.end();) { + if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == + Blocks.end()) { + ActiveBlocks.erase((*II)->getParent()); + II = Insts.erase(II); + } else { + ++II; + } + } + } + + void operator--() { + if (Fail) + return; + SmallVector NewInsts; + for (auto *Inst : Insts) { + if (Inst == &Inst->getParent()->front()) + ActiveBlocks.erase(Inst->getParent()); + else + NewInsts.push_back(Inst->getPrevNode()); + } + if (NewInsts.empty()) { + Fail = true; + return; + } + Insts = NewInsts; + } +}; + +//===----------------------------------------------------------------------===// + +/// Candidate solution for sinking. There may be different ways to +/// sink instructions, differing in the number of instructions sunk, +/// the number of predecessors sunk from and the number of PHIs +/// required. +struct SinkingInstructionCandidate { + unsigned NumBlocks; + unsigned NumInstructions; + unsigned NumPHIs; + unsigned NumMemoryInsts; + int Cost = -1; + SmallVector Blocks; + + void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) { + unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs; + unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0; + Cost = (NumInstructions * (NumBlocks - 1)) - + (NumExtraPHIs * + NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. + - SplitEdgeCost; + } + bool operator>=(const SinkingInstructionCandidate &Other) const { + return Cost >= Other.Cost; + } +}; + +#ifndef NDEBUG +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SinkingInstructionCandidate &C) { + OS << ""; + return OS; +} +#endif + +//===----------------------------------------------------------------------===// + +/// Describes a PHI node that may or may not exist. These track the PHIs +/// that must be created if we sunk a sequence of instructions. It provides +/// a hash function for efficient equality comparisons. +class ModelledPHI { + SmallVector Values; + SmallVector Blocks; + +public: + ModelledPHI() {} + ModelledPHI(const PHINode *PN) { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) + Blocks.push_back(PN->getIncomingBlock(I)); + std::sort(Blocks.begin(), Blocks.end()); + + // This assumes the PHI is already well-formed and there aren't conflicting + // incoming values for the same block. + for (auto *B : Blocks) + Values.push_back(PN->getIncomingValueForBlock(B)); + } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI + /// without the same ID. + /// \note This is specifically for DenseMapInfo - do not use this! + static ModelledPHI createDummy(size_t ID) { + ModelledPHI M; + M.Values.push_back(reinterpret_cast(ID)); + return M; + } + + /// Create a PHI from an array of incoming values and incoming blocks. + template + ModelledPHI(const VArray &V, const BArray &B) { + std::copy(V.begin(), V.end(), std::back_inserter(Values)); + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + } + + /// Create a PHI from [I[OpNum] for I in Insts]. 
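The cost model from SinkingInstructionCandidate::calculateCost above, restated as a free function with a small worked example; the formula and weights come from the code in this patch, while the function name and the sample numbers are only for illustration.

#include <cassert>

static int sinkingCost(unsigned NumInstructions, unsigned NumBlocks,
                       unsigned NumPHIs, unsigned NumOrigPHIs,
                       unsigned NumOrigBlocks) {
  unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
  // Splitting predecessor edges is only needed when not all original
  // predecessors participate in the sink.
  unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
  return static_cast<int>(NumInstructions * (NumBlocks - 1)) -
         static_cast<int>(NumExtraPHIs * NumExtraPHIs) -
         static_cast<int>(SplitEdgeCost);
}

int main() {
  // Sinking 3 instructions from 2 predecessors while introducing 1 extra PHI
  // and no edge split: 3 * (2 - 1) - 1 * 1 - 0 = 2, so the candidate is kept.
  assert(sinkingCost(3, 2, 1, 0, 2) == 2);
  return 0;
}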
+ template + ModelledPHI(ArrayRef Insts, unsigned OpNum, const BArray &B) { + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + for (auto *I : Insts) + Values.push_back(I->getOperand(OpNum)); + } + + /// Restrict the PHI's contents down to only \c NewBlocks. + /// \c NewBlocks must be a subset of \c this->Blocks. + void restrictToBlocks(const SmallPtrSetImpl &NewBlocks) { + auto BI = Blocks.begin(); + auto VI = Values.begin(); + while (BI != Blocks.end()) { + assert(VI != Values.end()); + if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == + NewBlocks.end()) { + BI = Blocks.erase(BI); + VI = Values.erase(VI); + } else { + ++BI; + ++VI; + } + } + assert(Blocks.size() == NewBlocks.size()); + } + + ArrayRef getValues() const { return Values; } + + bool areAllIncomingValuesSame() const { + return all_of(Values, [&](Value *V) { return V == Values[0]; }); + } + bool areAllIncomingValuesSameType() const { + return all_of( + Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); + } + bool areAnyIncomingValuesConstant() const { + return any_of(Values, [&](Value *V) { return isa(V); }); + } + // Hash functor + unsigned hash() const { + return (unsigned)hash_combine_range(Values.begin(), Values.end()); + } + bool operator==(const ModelledPHI &Other) const { + return Values == Other.Values && Blocks == Other.Blocks; + } +}; + +template struct DenseMapInfo { + static inline ModelledPHI &getEmptyKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(0); + return Dummy; + } + static inline ModelledPHI &getTombstoneKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(1); + return Dummy; + } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { + return LHS == RHS; + } +}; + +typedef DenseSet> ModelledPHISet; + +//===----------------------------------------------------------------------===// +// ValueTable +//===----------------------------------------------------------------------===// +// This is a value number table where the value number is a function of the +// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know +// that the program would be equivalent if we replaced A with PHI(A, B). +//===----------------------------------------------------------------------===// + +/// A GVN expression describing how an instruction is used. The operands +/// field of BasicExpression is used to store uses, not operands. +/// +/// This class also contains fields for discriminators used when determining +/// equivalence of instructions with sideeffects. 
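A toy rendering of the use-based numbering idea described above: the hash of an instruction is built from its opcode, type, memory ordering discriminators, and the value numbers of its *users*, so two instructions that feed equivalent consumers collide even if their operands differ. The mixing constant is the common boost-style combine, not llvm::hash_combine, and all names here are hypothetical.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static size_t hashByUses(unsigned Opcode, unsigned TypeID,
                         unsigned MemoryUseOrder, bool Volatile,
                         const std::vector<uint32_t> &UserValueNumbers) {
  size_t H = std::hash<unsigned>()(Opcode);
  auto Mix = [&H](size_t V) {
    H ^= V + 0x9e3779b97f4a7c15ULL + (H << 6) + (H >> 2);
  };
  Mix(std::hash<unsigned>()(TypeID));
  Mix(std::hash<unsigned>()(MemoryUseOrder));
  Mix(std::hash<bool>()(Volatile));
  // Users, not operands: this is what distinguishes the sinking value table
  // from the hoisting/CSE one.
  for (uint32_t VN : UserValueNumbers)
    Mix(std::hash<uint32_t>()(VN));
  return H;
}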
+class InstructionUseExpr : public GVNExpression::BasicExpression { + unsigned MemoryUseOrder = -1; + bool Volatile = false; + +public: + InstructionUseExpr(Instruction *I, ArrayRecycler &R, + BumpPtrAllocator &A) + : GVNExpression::BasicExpression(I->getNumUses()) { + allocateOperands(R, A); + setOpcode(I->getOpcode()); + setType(I->getType()); + + for (auto &U : I->uses()) + op_push_back(U.getUser()); + std::sort(op_begin(), op_end()); + } + void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } + void setVolatile(bool V) { Volatile = V; } + + virtual hash_code getHashValue() const { + return hash_combine(GVNExpression::BasicExpression::getHashValue(), + MemoryUseOrder, Volatile); + } + + template hash_code getHashValue(Function MapFn) { + hash_code H = + hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile); + for (auto *V : operands()) + H = hash_combine(H, MapFn(V)); + return H; + } +}; + +class ValueTable { + DenseMap ValueNumbering; + DenseMap ExpressionNumbering; + DenseMap HashNumbering; + BumpPtrAllocator Allocator; + ArrayRecycler Recycler; + uint32_t nextValueNumber; + + /// Create an expression for I based on its opcode and its uses. If I + /// touches or reads memory, the expression is also based upon its memory + /// order - see \c getMemoryUseOrder(). + InstructionUseExpr *createExpr(Instruction *I) { + InstructionUseExpr *E = + new (Allocator) InstructionUseExpr(I, Recycler, Allocator); + if (isMemoryInst(I)) + E->setMemoryUseOrder(getMemoryUseOrder(I)); + + if (CmpInst *C = dyn_cast(I)) { + CmpInst::Predicate Predicate = C->getPredicate(); + E->setOpcode((C->getOpcode() << 8) | Predicate); + } + return E; + } + + /// Helper to compute the value number for a memory instruction + /// (LoadInst/StoreInst), including checking the memory ordering and + /// volatility. + template InstructionUseExpr *createMemoryExpr(Inst *I) { + if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic()) + return nullptr; + InstructionUseExpr *E = createExpr(I); + E->setVolatile(I->isVolatile()); + return E; + } + +public: + /// Returns the value number for the specified value, assigning + /// it a new number if it did not have one before. 
+ uint32_t lookupOrAdd(Value *V) { + auto VI = ValueNumbering.find(V); + if (VI != ValueNumbering.end()) + return VI->second; + + if (!isa(V)) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + Instruction *I = cast(V); + InstructionUseExpr *exp = nullptr; + switch (I->getOpcode()) { + case Instruction::Load: + exp = createMemoryExpr(cast(I)); + break; + case Instruction::Store: + exp = createMemoryExpr(cast(I)); + break; + case Instruction::Call: + case Instruction::Invoke: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::ShuffleVector: + case Instruction::InsertValue: + case Instruction::GetElementPtr: + exp = createExpr(I); + break; + default: + break; + } + + if (!exp) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + uint32_t e = ExpressionNumbering[exp]; + if (!e) { + hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); }); + auto I = HashNumbering.find(H); + if (I != HashNumbering.end()) { + e = I->second; + } else { + e = nextValueNumber++; + HashNumbering[H] = e; + ExpressionNumbering[exp] = e; + } + } + ValueNumbering[V] = e; + return e; + } + + /// Returns the value number of the specified value. Fails if the value has + /// not yet been numbered. + uint32_t lookup(Value *V) const { + auto VI = ValueNumbering.find(V); + assert(VI != ValueNumbering.end() && "Value not numbered?"); + return VI->second; + } + + /// Removes all value numberings and resets the value table. + void clear() { + ValueNumbering.clear(); + ExpressionNumbering.clear(); + HashNumbering.clear(); + Recycler.clear(Allocator); + nextValueNumber = 1; + } + + ValueTable() : nextValueNumber(1) {} + + /// \c Inst uses or touches memory. Return an ID describing the memory state + /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2), + /// the exact same memory operations happen after I1 and I2. + /// + /// This is a very hard problem in general, so we use domain-specific + /// knowledge that we only ever check for equivalence between blocks sharing a + /// single immediate successor that is common, and when determining if I1 == + /// I2 we will have already determined that next(I1) == next(I2). This + /// inductive property allows us to simply return the value number of the next + /// instruction that defines memory. 
+ uint32_t getMemoryUseOrder(Instruction *Inst) { + auto *BB = Inst->getParent(); + for (auto I = std::next(Inst->getIterator()), E = BB->end(); + I != E && !I->isTerminator(); ++I) { + if (!isMemoryInst(&*I)) + continue; + if (isa(&*I)) + continue; + CallInst *CI = dyn_cast(&*I); + if (CI && CI->onlyReadsMemory()) + continue; + InvokeInst *II = dyn_cast(&*I); + if (II && II->onlyReadsMemory()) + continue; + return lookupOrAdd(&*I); + } + return 0; + } +}; + +//===----------------------------------------------------------------------===// + +class GVNSink { +public: + GVNSink() : VN() {} + bool run(Function &F) { + DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); + + unsigned NumSunk = 0; + ReversePostOrderTraversal RPOT(&F); + for (auto *N : RPOT) + NumSunk += sinkBB(N); + + return NumSunk > 0; + } + +private: + ValueTable VN; + + bool isInstructionBlacklisted(Instruction *I) { + // These instructions may change or break semantics if moved. + if (isa(I) || I->isEHPad() || isa(I) || + I->getType()->isTokenTy()) + return true; + return false; + } + + /// The main heuristic function. Analyze the set of instructions pointed to by + /// LRI and return a candidate solution if these instructions can be sunk, or + /// None otherwise. + Optional analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl &PHIContents); + + /// Create a ModelledPHI for each PHI in BB, adding to PHIs. + void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, + SmallPtrSetImpl &PHIContents) { + for (auto &I : *BB) { + auto *PN = dyn_cast(&I); + if (!PN) + return; + + auto MPHI = ModelledPHI(PN); + PHIs.insert(MPHI); + for (auto *V : MPHI.getValues()) + PHIContents.insert(V); + } + } + + /// The main instruction sinking driver. Set up state and try and sink + /// instructions into BBEnd from its predecessors. + unsigned sinkBB(BasicBlock *BBEnd); + + /// Perform the actual mechanics of sinking an instruction from Blocks into + /// BBEnd, which is their only successor. + void sinkLastInstruction(ArrayRef Blocks, BasicBlock *BBEnd); + + /// Remove PHIs that all have the same incoming value. + void foldPointlessPHINodes(BasicBlock *BB) { + auto I = BB->begin(); + while (PHINode *PN = dyn_cast(I++)) { + if (!all_of(PN->incoming_values(), + [&](const Value *V) { return V == PN->getIncomingValue(0); })) + continue; + if (PN->getIncomingValue(0) != PN) + PN->replaceAllUsesWith(PN->getIncomingValue(0)); + else + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); + } + } +}; + +Optional GVNSink::analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl &PHIContents) { + auto Insts = *LRI; + DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I + : Insts) { + I->dump(); + } dbgs() << " ]\n";); + + DenseMap VNums; + for (auto *I : Insts) { + uint32_t N = VN.lookupOrAdd(I); + DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n"); + if (N == ~0U) + return None; + VNums[N]++; + } + unsigned VNumToSink = + std::max_element(VNums.begin(), VNums.end(), + [](const std::pair &I, + const std::pair &J) { + return I.second < J.second; + }) + ->first; + + if (VNums[VNumToSink] == 1) + // Can't sink anything! + return None; + + // Now restrict the number of incoming blocks down to only those with + // VNumToSink. 
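The VNums/max_element step of analyzeInstructionForSinking above, mirrored with standard containers and hypothetical names: count how many candidate instructions share each value number and pick the most frequent one; sinking is only worthwhile when at least two instructions agree.

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

static uint32_t pickValueNumberToSink(const std::vector<uint32_t> &VNs,
                                      bool &Profitable) {
  std::unordered_map<uint32_t, unsigned> Counts;
  for (uint32_t VN : VNs)
    ++Counts[VN];
  auto Best = std::max_element(
      Counts.begin(), Counts.end(),
      [](const std::pair<const uint32_t, unsigned> &A,
         const std::pair<const uint32_t, unsigned> &B) {
        return A.second < B.second;
      });
  Profitable = Best != Counts.end() && Best->second > 1;
  return Profitable ? Best->first : 0;
}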
+ auto &ActivePreds = LRI.getActiveBlocks(); + unsigned InitialActivePredSize = ActivePreds.size(); + SmallVector NewInsts; + for (auto *I : Insts) { + if (VN.lookup(I) != VNumToSink) + ActivePreds.erase(I->getParent()); + else + NewInsts.push_back(I); + } + for (auto *I : NewInsts) + if (isInstructionBlacklisted(I)) + return None; + + // If we've restricted the incoming blocks, restrict all needed PHIs also + // to that set. + bool RecomputePHIContents = false; + if (ActivePreds.size() != InitialActivePredSize) { + ModelledPHISet NewNeededPHIs; + for (auto P : NeededPHIs) { + P.restrictToBlocks(ActivePreds); + NewNeededPHIs.insert(P); + } + NeededPHIs = NewNeededPHIs; + LRI.restrictToBlocks(ActivePreds); + RecomputePHIContents = true; + } + + // The sunk instruction's results. + ModelledPHI NewPHI(NewInsts, ActivePreds); + + // Does sinking this instruction render previous PHIs redundant? + if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) { + NeededPHIs.erase(NewPHI); + RecomputePHIContents = true; + } + + if (RecomputePHIContents) { + // The needed PHIs have changed, so recompute the set of all needed + // values. + PHIContents.clear(); + for (auto &PHI : NeededPHIs) + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + // Is this instruction required by a later PHI that doesn't match this PHI? + // if so, we can't sink this instruction. + for (auto *V : NewPHI.getValues()) + if (PHIContents.count(V)) + // V exists in this PHI, but the whole PHI is different to NewPHI + // (else it would have been removed earlier). We cannot continue + // because this isn't representable. + return None; + + // Which operands need PHIs? + // FIXME: If any of these fail, we should partition up the candidates to + // try and continue making progress. + Instruction *I0 = NewInsts[0]; + for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) { + ModelledPHI PHI(NewInsts, OpNum, ActivePreds); + if (PHI.areAllIncomingValuesSame()) + continue; + if (!canReplaceOperandWithVariable(I0, OpNum)) + // We can 't create a PHI from this instruction! + return None; + if (NeededPHIs.count(PHI)) + continue; + if (!PHI.areAllIncomingValuesSameType()) + return None; + // Don't create indirect calls! The called value is the final operand. + if ((isa(I0) || isa(I0)) && OpNum == E - 1 && + PHI.areAnyIncomingValuesConstant()) + return None; + + NeededPHIs.reserve(NeededPHIs.size()); + NeededPHIs.insert(PHI); + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + if (isMemoryInst(NewInsts[0])) + ++MemoryInstNum; + + SinkingInstructionCandidate Cand; + Cand.NumInstructions = ++InstNum; + Cand.NumMemoryInsts = MemoryInstNum; + Cand.NumBlocks = ActivePreds.size(); + Cand.NumPHIs = NeededPHIs.size(); + for (auto *C : ActivePreds) + Cand.Blocks.push_back(C); + + return Cand; +} + +unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { + DEBUG(dbgs() << "GVNSink: running on basic block "; + BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); + SmallVector Preds; + for (auto *B : predecessors(BBEnd)) { + auto *T = B->getTerminator(); + if (isa(T) || isa(T)) + Preds.push_back(B); + else + return 0; + } + if (Preds.size() < 2) + return 0; + std::sort(Preds.begin(), Preds.end()); + + unsigned NumOrigPreds = Preds.size(); + // We can only sink instructions through unconditional branches. 
+ for (auto I = Preds.begin(); I != Preds.end();) { + if ((*I)->getTerminator()->getNumSuccessors() != 1) + I = Preds.erase(I); + else + ++I; + } + + LockstepReverseIterator LRI(Preds); + SmallVector Candidates; + unsigned InstNum = 0, MemoryInstNum = 0; + ModelledPHISet NeededPHIs; + SmallPtrSet PHIContents; + analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents); + unsigned NumOrigPHIs = NeededPHIs.size(); + + while (LRI.isValid()) { + auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum, + NeededPHIs, PHIContents); + if (!Cand) + break; + Cand->calculateCost(NumOrigPHIs, Preds.size()); + Candidates.emplace_back(*Cand); + --LRI; + } + + std::stable_sort( + Candidates.begin(), Candidates.end(), + [](const SinkingInstructionCandidate &A, + const SinkingInstructionCandidate &B) { return A >= B; }); + DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C + : Candidates) dbgs() + << " " << C << "\n";); + + // Pick the top candidate, as long it is positive! + if (Candidates.empty() || Candidates.front().Cost <= 0) + return 0; + auto C = Candidates.front(); + + DEBUG(dbgs() << " -- Sinking: " << C << "\n"); + BasicBlock *InsertBB = BBEnd; + if (C.Blocks.size() < NumOrigPreds) { + DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs()); + dbgs() << "\n"); + InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split"); + if (!InsertBB) { + DEBUG(dbgs() << " -- FAILED to split edge!\n"); + // Edge couldn't be split. + return 0; + } + } + + for (unsigned I = 0; I < C.NumInstructions; ++I) + sinkLastInstruction(C.Blocks, InsertBB); + + return C.NumInstructions; +} + +void GVNSink::sinkLastInstruction(ArrayRef Blocks, + BasicBlock *BBEnd) { + SmallVector Insts; + for (BasicBlock *BB : Blocks) + Insts.push_back(BB->getTerminator()->getPrevNode()); + Instruction *I0 = Insts.front(); + + SmallVector NewOperands; + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { + bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + return I->getOperand(O) != I0->getOperand(O); + }); + if (!NeedPHI) { + NewOperands.push_back(I0->getOperand(O)); + continue; + } + + // Create a new PHI in the successor block and populate it. + auto *Op = I0->getOperand(O); + assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!"); + auto *PN = PHINode::Create(Op->getType(), Insts.size(), + Op->getName() + ".sink", &BBEnd->front()); + for (auto *I : Insts) + PN->addIncoming(I->getOperand(O), I->getParent()); + NewOperands.push_back(PN); + } + + // Arbitrarily use I0 as the new "common" instruction; remap its operands + // and move it to the start of the successor block. + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) + I0->getOperandUse(O).set(NewOperands[O]); + I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + + // Update metadata and IR flags. + for (auto *I : Insts) + if (I != I0) { + combineMetadataForCSE(I0, I); + I0->andIRFlags(I); + } + + for (auto *I : Insts) + if (I != I0) + I->replaceAllUsesWith(I0); + foldPointlessPHINodes(BBEnd); + + // Finally nuke all instructions apart from the common instruction. 
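The per-operand decision in sinkLastInstruction above, reduced to a toy predicate over plain operand vectors (one vector per instruction being merged, values standing in for operand identities): an operand position needs a PHI exactly when the blocks being merged disagree on that operand.

#include <vector>

static bool operandNeedsPHI(const std::vector<std::vector<int>> &InstOperands,
                            unsigned OpIdx) {
  for (const auto &Ops : InstOperands)
    if (Ops[OpIdx] != InstOperands.front()[OpIdx])
      return true;
  return false;
}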
+ for (auto *I : Insts) + if (I != I0) + I->eraseFromParent(); + + NumRemoved += Insts.size() - 1; +} + +//////////////////////////////////////////////////////////////////////////////// +// Pass machinery / boilerplate + +class GVNSinkLegacyPass : public FunctionPass { +public: + static char ID; + + GVNSinkLegacyPass() : FunctionPass(ID) { + initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + GVNSink G; + return G.run(F); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + } +}; +} // namespace + +PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { + GVNSink G; + if (!G.run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +char GVNSinkLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) + +FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index 198d2b2b024f..65a2cd955672 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -537,9 +537,7 @@ bool GuardWideningImpl::parseRangeChecks( Changed = true; } else if (match(Check.getBase(), m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { - unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(OpLHS, Known, DL); + KnownBits Known = computeKnownBits(OpLHS, DL); if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) { Check.setBase(OpLHS); APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 85db6e5e1105..e21b0feb7c5a 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1228,7 +1228,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef BBs) { Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent, ValueToValueMapTy &VM) { - Loop &New = LPM.addLoop(Parent); + Loop &New = *new Loop(); + if (Parent) + Parent->addChildLoop(&New); + else + LI.addTopLevelLoop(&New); + LPM.addLoop(New); // Add all of the blocks in Original to the new loop. for (auto *BB : Original->blocks()) diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index ada22ae38eb8..2ef8f8563bb9 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -253,6 +253,35 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, return EverChanged; } +// Replace uses of Cond with ToVal when safe to do so. If all uses are +// replaced, we can remove Cond. We cannot blindly replace all uses of Cond +// because we may incorrectly replace uses when guards/assumes are uses of +// of `Cond` and we used the guards/assume to reason about the `Cond` value +// at the end of block. RAUW unconditionally replaces all uses +// including the guards/assumes themselves and the uses before the +// guard/assume. 
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) { + assert(Cond->getType() == ToVal->getType()); + auto *BB = Cond->getParent(); + // We can unconditionally replace all uses in non-local blocks (i.e. uses + // strictly dominated by BB), since LVI information is true from the + // terminator of BB. + replaceNonLocalUsesWith(Cond, ToVal); + for (Instruction &I : reverse(*BB)) { + // Reached the Cond whose uses we are trying to replace, so there are no + // more uses. + if (&I == Cond) + break; + // We only replace uses in instructions that are guaranteed to reach the end + // of BB, where we know Cond is ToVal. + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + I.replaceUsesOfWith(Cond, ToVal); + } + if (Cond->use_empty() && !Cond->mayHaveSideEffects()) + Cond->eraseFromParent(); +} + /// Return the cost of duplicating a piece of this block from first non-phi /// and before StopAt instruction to thread across it. Stop scanning the block /// when exceeding the threshold. If duplication is impossible, returns ~0U. @@ -833,13 +862,19 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { CondBr->eraseFromParent(); if (CondCmp->use_empty()) CondCmp->eraseFromParent(); - // TODO: We can safely replace *some* uses of the CondInst if it has + // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. + else if (CondCmp->getParent() == BB) { + auto *CI = Ret == LazyValueInfo::True ? + ConstantInt::getTrue(CondCmp->getType()) : + ConstantInt::getFalse(CondCmp->getType()); + ReplaceFoldableUses(CondCmp, CI); + } return true; } @@ -1325,13 +1360,16 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (auto *CondInst = dyn_cast(Cond)) { if (CondInst->use_empty() && !CondInst->mayHaveSideEffects()) CondInst->eraseFromParent(); - // TODO: We can safely replace *some* uses of the CondInst if it has + // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. + else if (OnlyVal && OnlyVal != MultipleVal && + CondInst->getParent() == BB) + ReplaceFoldableUses(CondInst, OnlyVal); } return true; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 97337ea5ba62..c6a05ecbd0b1 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1035,6 +1035,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { return nullptr; } +// Check if the recurrence variable `VarX` is in the right form to create +// the idiom. Returns the value coerced to a PHINode if so. 
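A conceptual rendering of ReplaceFoldableUses above on a toy block, with the LLVM instructions replaced by a hypothetical struct: walking from the terminator backwards, uses of Cond may be rewritten only in instructions that are guaranteed to execute all the way to the end of the block; the walk stops at Cond itself or at the first instruction that might not transfer control to its successor (such as a guard or a call that may throw).

#include <vector>

struct ToyInst {
  bool IsCond = false;
  bool GuaranteedToReachBlockEnd = true;
  bool UsesCond = false;
  bool Rewritten = false;
};

static void replaceFoldableUses(std::vector<ToyInst> &Block) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (It->IsCond)
      break; // Reached Cond: no earlier uses can exist.
    if (!It->GuaranteedToReachBlockEnd)
      break; // Past this point we can no longer assume Cond's final value.
    if (It->UsesCond)
      It->Rewritten = true;
  }
}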
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, + BasicBlock *LoopEntry) { + auto *PhiX = dyn_cast(VarX); + if (PhiX && PhiX->getParent() == LoopEntry && + (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX)) + return PhiX; + return nullptr; +} + /// Return true iff the idiom is detected in the loop. /// /// Additionally: @@ -1110,13 +1121,9 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, } // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { - return false; - } - } + PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry); + if (!PhiX) + return false; // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { @@ -1132,8 +1139,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = dyn_cast(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) continue; // Check if the result of the instruction is live of the loop. @@ -1227,8 +1234,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X - PhiX = dyn_cast(VarX); - if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX)) + PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); + if (!PhiX) return false; // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 @@ -1248,8 +1255,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = dyn_cast(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) continue; CntInst = Inst; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 6ef1464e9338..19daebd0613a 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -831,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop &New = LPM->addLoop(PL); + Loop &New = *new Loop(); + if (PL) + PL->addChildLoop(&New); + else + LI->addTopLevelLoop(&New); + LPM->addLoop(New); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 5cfbf6baeaa9..67abc3116988 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -858,7 +858,14 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, // Filter out unreachable phi operands. auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { - return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}); + if (*U == PN) + return false; + if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock})) + return false; + // Things in TOPClass are equivalent to everything. 
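For reference, the C-level shape of the population-count idiom that detectPopcountIdiom (and the closely related CTLZ detection) above looks for: the recurrence variable is a PHI between its initial value and `x & (x - 1)`, and the counter is a PHI incremented by one per iteration. This is an illustration of the input pattern, not LLVM code.

static int popcountIdiom(unsigned X) {
  int Count = 0;
  while (X) {
    X &= X - 1; // clears the lowest set bit each iteration
    ++Count;
  }
  return Count;
}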
+ if (ValueToClass.lookup(*U) == TOPClass) + return false; + return true; }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), [&](const Use *U) -> Value * { @@ -866,14 +873,6 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); OriginalOpsConstant = OriginalOpsConstant && isa(*U); - // Use nullptr to distinguish between things that were - // originally self-defined and those that have an operand - // leader that is self-defined. - if (*U == PN) - return nullptr; - // Things in TOPClass are equivalent to everything. - if (ValueToClass.lookup(*U) == TOPClass) - return nullptr; return lookupOperandLeader(*U); }); return E; @@ -955,6 +954,10 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, CongruenceClass *CC = ValueToClass.lookup(V); if (CC && CC->getDefiningExpr()) { + // If we simplified to something else, we need to communicate + // that we're users of the value we simplified to. + if (I != V) + addAdditionalUsers(V, I); if (I) DEBUG(dbgs() << "Simplified " << *I << " to " << " expression " << *CC->getDefiningExpr() << "\n"); @@ -1581,6 +1584,30 @@ bool NewGVN::isCycleFree(const Instruction *I) const { // Evaluate PHI nodes symbolically, and create an expression result. const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { + // Resolve irreducible and reducible phi cycles. + // FIXME: This is hopefully a temporary solution while we resolve the issues + // with fixpointing self-cycles. It currently should be "guaranteed" to be + // correct, but non-optimal. The SCCFinder does not, for example, take + // reachability of arguments into account, etc. + SCCFinder.Start(I); + bool CanOptimize = true; + SmallPtrSet OuterOps; + + auto &Component = SCCFinder.getComponentFor(I); + for (auto *Member : Component) { + if (!isa(Member)) { + CanOptimize = false; + break; + } + for (auto &PHIOp : cast(Member)->operands()) + if (!isa(PHIOp) || !Component.count(cast(PHIOp))) + OuterOps.insert(PHIOp); + } + if (CanOptimize && OuterOps.size() == 1) { + DEBUG(dbgs() << "Resolving cyclic phi to value " << *(*OuterOps.begin()) + << "\n"); + return createVariableOrConstant(*OuterOps.begin()); + } // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands @@ -1594,17 +1621,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // See if all arguments are the same. // We track if any were undef because they need special handling. bool HasUndef = false; - bool CycleFree = isCycleFree(I); auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) { - if (Arg == nullptr) - return false; - // Original self-operands are already eliminated during expression creation. - // We can only eliminate value-wise self-operands if it's cycle - // free. Otherwise, eliminating the operand can cause our value to change, - // which can cause us to not eliminate the operand, which changes the value - // back to what it was before, cycling forever. - if (CycleFree && Arg == I) - return false; if (isa(Arg)) { HasUndef = true; return false; @@ -1613,6 +1630,14 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { }); // If we are left with no operands, it's dead. 
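A distilled form of the cyclic-phi resolution added to performSymbolicPHIEvaluation above, assuming the strongly connected component has already been verified to contain only phi nodes (values are plain ints here, purely as a stand-in): if exactly one value enters the component from outside, the whole cycle can be valued as that single value.

#include <set>
#include <utility>
#include <vector>

static bool resolvePhiCycle(const std::set<int> &Component,
                            const std::vector<std::pair<int, int>> &PhiOperands,
                            int &ResolvedValue) {
  // PhiOperands holds (phi, operand) pairs for every phi in the component.
  std::set<int> OuterOps;
  for (const auto &Op : PhiOperands)
    if (!Component.count(Op.second))
      OuterOps.insert(Op.second);
  if (OuterOps.size() != 1)
    return false;
  ResolvedValue = *OuterOps.begin();
  return true;
}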
if (Filtered.begin() == Filtered.end()) { + // If it has undef at this point, it means there are no-non-undef arguments, + // and thus, the value of the phi node must be undef. + if (HasUndef) { + DEBUG(dbgs() << "PHI Node " << *I + << " has no non-undef arguments, valuing it as undef\n"); + return createConstantExpression(UndefValue::get(I->getType())); + } + DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n"); deleteExpression(E); return createDeadExpression(); @@ -1642,7 +1667,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // constants, or all operands are ignored but the undef, it also must be // cycle free. if (!AllConstant && HasBackedge && NumOps > 0 && - !isa(AllSameValue) && !CycleFree) + !isa(AllSameValue) && !isCycleFree(I)) return E; // Only have to check for instructions @@ -3556,6 +3581,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // Map to store the use counts DenseMap UseCounts; for (auto *CC : reverse(CongruenceClasses)) { + DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n"); // Track the equivalent store info so we can decide whether to try // dead store elimination. SmallVector PossibleDeadStores; @@ -3602,8 +3628,6 @@ bool NewGVN::eliminateInstructions(Function &F) { } CC->swap(MembersLeft); } else { - DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() - << "\n"); // If this is a singleton, we can skip it. if (CC->size() != 1 || RealToTemp.lookup(Leader)) { // This is a stack because equality replacement/etc may place @@ -3846,6 +3870,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +namespace { class NewGVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. @@ -3865,6 +3890,7 @@ private: AU.addPreserved(); } }; +} // namespace bool NewGVNLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8908dae2f545..1d0e8396f6a2 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1779,8 +1779,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F.hasLocalLinkage()) { - if (AddressIsTaken(&F)) + if (F.hasAddressTaken()) { AddressTakenFunctions.insert(&F); + } else { Solver.AddArgumentTrackedFunction(&F); continue; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 24bd0a2b7bdf..6e113bccff94 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -326,7 +326,7 @@ private: /// partition. uint64_t BeginOffset, EndOffset; - /// \brief The start end end iterators of this partition. + /// \brief The start and end iterators of this partition. iterator SI, SJ; /// \brief A collection of split slice tails overlapping the partition. 
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 52201d8f3e51..9fa43da99da9 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); + initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b32a61a7e8f8..0f170e26ce5f 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -123,11 +123,62 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, // exit block. DT.changeImmediateDominator(UnswitchedNode, OldPHNode); - // Blocks reachable from the unswitched block may need to change their IDom - // as well. + // For everything that moves up the dominator tree, we need to examine the + // dominator frontier to see if it additionally should move up the dominator + // tree. This lambda appends the dominator frontier for a node on the + // worklist. + // + // Note that we don't currently use the IDFCalculator here for two reasons: + // 1) It computes dominator tree levels for the entire function on each run + // of 'compute'. While this isn't terrible, given that we expect to update + // relatively small subtrees of the domtree, it isn't necessarily the right + // tradeoff. + // 2) The interface doesn't fit this usage well. It doesn't operate in + // append-only, and builds several sets that we don't need. + // + // FIXME: Neither of these issues are a big deal and could be addressed with + // some amount of refactoring of IDFCalculator. That would allow us to share + // the core logic here (which is solving the same core problem). SmallSetVector Worklist; - for (auto *SuccBB : successors(UnswitchedBB)) - Worklist.insert(SuccBB); + SmallVector DomNodes; + SmallPtrSet DomSet; + auto AppendDomFrontier = [&](DomTreeNode *Node) { + assert(DomNodes.empty() && "Must start with no dominator nodes."); + assert(DomSet.empty() && "Must start with an empty dominator set."); + + // First flatten this subtree into sequence of nodes by doing a pre-order + // walk. + DomNodes.push_back(Node); + // We intentionally re-evaluate the size as each node can add new children. + // Because this is a tree walk, this cannot add any duplicates. + for (int i = 0; i < (int)DomNodes.size(); ++i) + DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); + + // Now create a set of the basic blocks so we can quickly test for + // dominated successors. We could in theory use the DFS numbers of the + // dominator tree for this, but we want this to remain predictably fast + // even while we mutate the dominator tree in ways that would invalidate + // the DFS numbering. + for (DomTreeNode *InnerN : DomNodes) + DomSet.insert(InnerN->getBlock()); + + // Now re-walk the nodes, appending every successor of every node that isn't + // in the set. Note that we don't append the node itself, even though if it + // is a successor it does not strictly dominate itself and thus it would be + // part of the dominance frontier. 
The reason we don't append it is that + // the node passed in came *from* the worklist and so it has already been + // processed. + for (DomTreeNode *InnerN : DomNodes) + for (BasicBlock *SuccBB : successors(InnerN->getBlock())) + if (!DomSet.count(SuccBB)) + Worklist.insert(SuccBB); + + DomNodes.clear(); + DomSet.clear(); + }; + + // Append the initial dom frontier nodes. + AppendDomFrontier(UnswitchedNode); // Walk the worklist. We grow the list in the loop and so must recompute size. for (int i = 0; i < (int)Worklist.size(); ++i) { @@ -136,20 +187,17 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, DomTreeNode *Node = DT[BB]; assert(!DomChain.count(Node) && "Cannot be dominated by a block you can reach!"); - // If this block doesn't have an immediate dominator somewhere in the chain - // we hoisted over, then its position in the domtree hasn't changed. Either - // it is above the region hoisted and still valid, or it is below the - // hoisted block and so was trivially updated. This also applies to - // everything reachable from this block so we're completely done with the - // it. + + // If this block had an immediate dominator somewhere in the chain + // we hoisted over, then its position in the domtree needs to move as it is + // reachable from a node hoisted over this chain. if (!DomChain.count(Node->getIDom())) continue; - // We need to change the IDom for this node but also walk its successors - // which could have similar dominance position. DT.changeImmediateDominator(Node, OldPHNode); - for (auto *SuccBB : successors(BB)) - Worklist.insert(SuccBB); + + // Now add this node's dominator frontier to the worklist as well. + AppendDomFrontier(Node); } } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index bf2ab7c55be2..1ec3d0d49637 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -133,7 +133,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, auto *SP = cast(MD.second); NewMD = DISubprogram::getDistinct( NewFunc->getContext(), SP->getScope(), SP->getName(), - NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), + SP->getLinkageName(), SP->getFile(), SP->getLine(), SP->getType(), SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index 73a0b2737e95..57468be9a2a8 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -76,12 +76,14 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const { int FunctionComparator::cmpAttrs(const AttributeList L, const AttributeList R) const { - if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots())) + if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets())) return Res; - for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) { - AttributeList::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i), - RE = R.end(i); + for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) { + AttributeSet LAS = L.getAttributes(i); + AttributeSet RAS = R.getAttributes(i); + AttributeSet::iterator LI = LAS.begin(), LE = LAS.end(); + AttributeSet::iterator RI = RAS.begin(), RE = RAS.end(); for (; LI != LE && RI != RE; ++LI, ++RI) { Attribute LA = *LI; Attribute RA = *RI; diff --git 
a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 9cb4762b683c..0ca9f4c484e6 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1397,11 +1397,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const Optional &CalleeEntryCount, const Instruction *TheCall, - ProfileSummaryInfo *PSI) { + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *CallerBFI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) return; Optional CallSiteCount = - PSI ? PSI->getProfileCount(TheCall, nullptr) : None; + PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None; uint64_t CallCount = std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0, CalleeEntryCount.getValue()); @@ -1637,7 +1638,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->front()); updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, - IFI.PSI); + IFI.PSI, IFI.CallerBFI); // Update the profile count of callee. updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 1ca509472b5f..ebd528bc8ec1 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1037,17 +1037,15 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); - unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType()); - KnownBits Known(BitWidth); - computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); + KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT); unsigned TrailZ = Known.countMinTrailingZeros(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); - unsigned Align = 1u << std::min(BitWidth - 1, TrailZ); + unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ); // LLVM doesn't support alignments larger than this currently. Align = std::min(Align, +Value::MaximumAlignment); @@ -1796,6 +1794,23 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To, return Count; } +unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) { + assert(From->getType() == To->getType()); + auto *BB = From->getParent(); + unsigned Count = 0; + + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *I = cast(U.getUser()); + if (I->getParent() == BB) + continue; + U.set(To); + ++Count; + } + return Count; +} + unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root) { @@ -2094,3 +2109,48 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin( !F->doesNotAccessMemory()) CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin); } + +bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { + // We can't have a PHI with a metadata type. + if (I->getOperand(OpIdx)->getType()->isMetadataTy()) + return false; + + // Early exit. 
+ if (!isa(I->getOperand(OpIdx))) + return true; + + switch (I->getOpcode()) { + default: + return true; + case Instruction::Call: + case Instruction::Invoke: + // Many arithmetic intrinsics have no issue taking a + // variable, however it's hard to distingish these from + // specials such as @llvm.frameaddress that require a constant. + if (isa(I)) + return false; + + // Constant bundle operands may need to retain their constant-ness for + // correctness. + if (ImmutableCallSite(I).isBundleOperand(OpIdx)) + return false; + return true; + case Instruction::ShuffleVector: + // Shufflevector masks are constant. + return OpIdx != 2; + case Instruction::ExtractValue: + case Instruction::InsertValue: + // All operands apart from the first are constant. + return OpIdx == 0; + case Instruction::Alloca: + return false; + case Instruction::GetElementPtr: + if (OpIdx == 0) + return true; + gep_type_iterator It = gep_type_begin(I); + for (auto E = std::next(It, OpIdx); It != E; ++It) + if (It.isStruct()) + return false; + return true; + } +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 27f72fcd8bda..1b442a9a264d 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1376,53 +1376,6 @@ HoistTerminator: return true; } -// Is it legal to place a variable in operand \c OpIdx of \c I? -// FIXME: This should be promoted to Instruction. -static bool canReplaceOperandWithVariable(const Instruction *I, - unsigned OpIdx) { - // We can't have a PHI with a metadata type. - if (I->getOperand(OpIdx)->getType()->isMetadataTy()) - return false; - - // Early exit. - if (!isa(I->getOperand(OpIdx))) - return true; - - switch (I->getOpcode()) { - default: - return true; - case Instruction::Call: - case Instruction::Invoke: - // FIXME: many arithmetic intrinsics have no issue taking a - // variable, however it's hard to distingish these from - // specials such as @llvm.frameaddress that require a constant. - if (isa(I)) - return false; - - // Constant bundle operands may need to retain their constant-ness for - // correctness. - if (ImmutableCallSite(I).isBundleOperand(OpIdx)) - return false; - - return true; - - case Instruction::ShuffleVector: - // Shufflevector masks are constant. - return OpIdx != 2; - case Instruction::ExtractValue: - case Instruction::InsertValue: - // All operands apart from the first are constant. - return OpIdx == 0; - case Instruction::Alloca: - return false; - case Instruction::GetElementPtr: - if (OpIdx == 0) - return true; - gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1); - return It.isSequential(); - } -} - // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. 
Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -4368,8 +4321,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); - KnownBits Known(Bits); - computeKnownBits(Cond, Known, DL, 0, AC, SI); + KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 85c9464b5569..49effda5d833 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -466,9 +466,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *Offset = GEP->getOperand(2); - unsigned BitWidth = Offset->getType()->getIntegerBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Offset, Known, DL, 0, nullptr, CI, nullptr); + KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr); Known.Zero.flipAllBits(); uint64_t ArrSize = cast(GEP->getSourceElementType())->getNumElements(); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1dc554bede7e..3b036a6ac430 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2092,6 +2092,10 @@ private: /// The data is collected per VF. DenseMap> Scalars; + /// Holds the instructions (address computations) that are forced to be + /// scalarized. + DenseMap> ForcedScalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -5086,12 +5090,18 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // FIXME: The code is currently dead, since the loop gets sent to @@ -5101,21 +5111,30 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->empty()) { ORE->emit(createMissedAnalysis("NotInnermostLoop") << "loop is not the innermost loop"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single exiting block. 
if (!TheLoop->getExitingBlock()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We only handle bottom-tested loops, i.e. loop in which the condition is @@ -5124,7 +5143,10 @@ bool LoopVectorizationLegality::canVectorize() { if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We need to have a loop header. @@ -5135,28 +5157,28 @@ bool LoopVectorizationLegality::canVectorize() { unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - return false; - } - - // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = PSE.getBackedgeTakenCount(); - if (ExitCount == PSE.getSE()->getCouldNotCompute()) { - ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations") - << "could not determine number of loop iterations"); - DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Go over each instruction and look at memory deps. if (!canVectorizeMemory()) { DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop" @@ -5184,13 +5206,17 @@ bool LoopVectorizationLegality::canVectorize() { << "Too many SCEV assumptions need to be made and checked " << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } - // Okay! We can vectorize. At this point we don't have any other mem analysis + // Okay! We've done all the tests. If any have failed, return false. Otherwise + // we can vectorize, and at this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. - return true; + return Result; } static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { @@ -5554,6 +5580,13 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } + // Insert the forced scalars. + // FIXME: Currently widenPHIInstruction() often creates a dead vector + // induction variable when the PHI user is scalarized. + if (ForcedScalars.count(VF)) + for (auto *I : ForcedScalars.find(VF)->second) + Worklist.insert(I); + // Expand the worklist by looking through any bitcasts and getelementptr // instructions we've already identified as scalar. 
This is similar to the // expansion step in collectLoopUniforms(); however, here we're only @@ -7129,11 +7162,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (VF > 1 && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); + // Forced scalars do not have any scalarization overhead. + if (VF > 1 && ForcedScalars.count(VF) && + ForcedScalars.find(VF)->second.count(I)) + return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); + // Note: Even if all instructions are scalarized, return true if any memory + // accesses appear in the loop to get benefits from address folding etc. bool TypeNotScalarized = - VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; return VectorizationCostTy(C, TypeNotScalarized); } @@ -7208,6 +7248,62 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { setWideningDecision(&I, VF, Decision, Cost); } } + + // Make sure that any load of address and any other address computation + // remains scalar unless there is gather/scatter support. This avoids + // inevitable extracts into address registers, and also has the benefit of + // activating LSR more, since that pass can't optimize vectorized + // addresses. + if (TTI.prefersVectorizedAddressing()) + return; + + // Start with all scalar pointer uses. + SmallPtrSet AddrDefs; + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + Instruction *PtrDef = + dyn_cast_or_null(getPointerOperand(&I)); + if (PtrDef && TheLoop->contains(PtrDef) && + getWideningDecision(&I, VF) != CM_GatherScatter) + AddrDefs.insert(PtrDef); + } + + // Add all instructions used to generate the addresses. + SmallVector Worklist; + for (auto *I : AddrDefs) + Worklist.push_back(I); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (auto &Op : I->operands()) + if (auto *InstOp = dyn_cast(Op)) + if ((InstOp->getParent() == I->getParent()) && !isa(InstOp) && + AddrDefs.insert(InstOp).second == true) + Worklist.push_back(InstOp); + } + + for (auto *I : AddrDefs) { + if (isa(I)) { + // Setting the desired widening decision should ideally be handled in + // by cost functions, but since this involves the task of finding out + // if the loaded register is involved in an address computation, it is + // instead changed here when we know this is the case. + if (getWideningDecision(I, VF) == CM_Widen) + // Scalarize a widened load of address. + setWideningDecision(I, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(I, 1))); + else if (auto Group = Legal->getInterleavedAccessGroup(I)) { + // Scalarize an interleave group of address loads. + for (unsigned I = 0; I < Group->getFactor(); ++I) { + if (Instruction *Member = Group->getMember(I)) + setWideningDecision(Member, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(Member, 1))); + } + } + } else + // Make sure I gets scalarized and a cost estimate without + // scalarization overhead. 
+ ForcedScalars[VF].insert(I); + } } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, @@ -7216,7 +7312,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. @@ -7349,9 +7445,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - SmallVector Operands(I->operand_values()); - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + SmallVector Operands(I->operand_values()); + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7374,7 +7471,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - VectorTy = ToVectorTy(getMemInstValueType(I), VF); + unsigned Width = VF; + if (Width > 1) { + InstWidening Decision = getWideningDecision(I, Width); + assert(Decision != CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == CM_Scalarize) + Width = 1; + } + VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll deleted file mode 100644 index e9563191f077..000000000000 --- a/test/Analysis/CostModel/AArch64/falkor.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s - -target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-gnu" - -; CHECK-LABEL: vectorInstrCost -define void @vectorInstrCost() { - - ; Vector extracts - extracting the first element should have a zero cost; - ; all other elements should have a cost of two. - ; - ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 - ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 - %t1 = extractelement <2 x i64> undef, i32 0 - %t2 = extractelement <2 x i64> undef, i32 1 - - ; Vector inserts - inserting the first element should have a zero cost; all - ; other elements should have a cost of two. 
- ; - ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 - ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 - %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 - %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 - - ret void -} diff --git a/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/test/Analysis/Delinearization/constant_functions_multi_dim.ll new file mode 100644 index 000000000000..b44b900d3f52 --- /dev/null +++ b/test/Analysis/Delinearization/constant_functions_multi_dim.ll @@ -0,0 +1,80 @@ +; RUN: opt -delinearize -analyze < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK: Inst: %tmp = load float, float* %arrayidx, align 4 +; CHECK-NEXT: In Loop with Header: for.inc +; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<%for.inc> +; CHECK-NEXT: Base offset: %A +; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes. +; CHECK-NEXT: ArrayRef[%call][{0,+,1}<%for.inc>] + +; CHECK: Inst: %tmp5 = load float, float* %arrayidx4, align 4 +; CHECK-NEXT: In Loop with Header: for.inc +; CHECK-NEXT: AccessFunction: {(4 * %call1),+,(4 * %N)}<%for.inc> +; CHECK-NEXT: Base offset: %B +; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes. +; CHECK-NEXT: ArrayRef[{0,+,1}<%for.inc>][%call1] + +; Function Attrs: noinline nounwind uwtable +define void @mat_mul(float* %C, float* %A, float* %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %call = tail call i64 @_Z13get_global_idj(i32 0) #3 + %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3 + %cmp1 = icmp sgt i64 %N, 0 + %mul = mul nsw i64 %call, %N + br i1 %cmp1, label %for.inc.lr.ph, label %for.end + +for.inc.lr.ph: ; preds = %entry.split + br label %for.inc + +for.inc: ; preds = %for.inc.lr.ph, %for.inc + %acc.03 = phi float [ 0.000000e+00, %for.inc.lr.ph ], [ %tmp6, %for.inc ] + %m.02 = phi i64 [ 0, %for.inc.lr.ph ], [ %inc, %for.inc ] + %add = add nsw i64 %m.02, %mul + %arrayidx = getelementptr inbounds float, float* %A, i64 %add + %tmp = load float, float* %arrayidx, align 4 + %mul2 = mul nsw i64 %m.02, %N + %add3 = add nsw i64 %mul2, %call1 + %arrayidx4 = getelementptr inbounds float, float* %B, i64 %add3 + %tmp5 = load float, float* %arrayidx4, align 4 + %tmp6 = tail call float @llvm.fmuladd.f32(float %tmp, float %tmp5, float %acc.03) + %inc = add nuw nsw i64 %m.02, 1 + %exitcond = icmp ne i64 %inc, %N + br i1 %exitcond, label %for.inc, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.inc + %.lcssa = phi float [ %tmp6, %for.inc ] + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + %acc.0.lcssa = phi float [ %.lcssa, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry.split ] + %add7 = add nsw i64 %mul, %call1 + %arrayidx8 = getelementptr inbounds float, float* %C, i64 %add7 + store float %acc.0.lcssa, float* %arrayidx8, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @_Z13get_global_idj(i32) #1 + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.fmuladd.f32(float, float, float) #2 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303846) (llvm/trunk 303834)"} +!2 = !{i32 1, i32 1, i32 1, i32 0} +!3 = !{!"none", !"none", !"none", !"none"} +!4 = !{!"float*", !"float*", !"float*", !"long"} +!5 = !{!"", !"", !"", !""} diff --git a/test/Analysis/IVUsers/quadradic-exit-value.ll b/test/Analysis/IVUsers/quadradic-exit-value.ll index 6d4f1b039b48..afc215198218 100644 --- a/test/Analysis/IVUsers/quadradic-exit-value.ll +++ b/test/Analysis/IVUsers/quadradic-exit-value.ll @@ -30,13 +30,47 @@ exit: ret i64 %r } +; PR15470: LSR miscompile. The test1 function should return '1'. +; It is valid to fold SCEVUnknown into the recurrence because it +; was defined before the loop. +; +; SCEV does not know how to denormalize chained recurrences, so make +; sure they aren't marked as post-inc users. +; +; CHECK-LABEL: IV Users for loop %test1.loop +; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test1.loop> (post-inc with loop %test1.loop) in %f = ashr i32 %sext.us, 24 +define i32 @test1(i1 %cond) { +entry: + %sub.us = select i1 %cond, i32 0, i32 0 + br label %test1.loop + +test1.loop: + %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test1.loop ] + %inc11.us = add nsw i32 %inc1115.us, 1 + %cmp.us = icmp slt i32 %inc11.us, 2 + br i1 %cmp.us, label %test1.loop, label %for.end + +for.end: + %tobool.us = icmp eq i32 %inc1115.us, 0 + %mul.us = shl i32 %inc1115.us, 24 + %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us + %sext.us = mul i32 %mul.us, %sub.cond.us + %f = ashr i32 %sext.us, 24 + br label %exit + +exit: + ret i32 %f +} + ; PR15470: LSR miscompile. The test2 function should return '1'. +; It is illegal to fold SCEVUnknown (sext.us) into the recurrence +; because it is defined after the loop where this recurrence belongs. ; ; SCEV does not know how to denormalize chained recurrences, so make ; sure they aren't marked as post-inc users. 
; ; CHECK-LABEL: IV Users for loop %test2.loop -; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test2.loop> (post-inc with loop %test2.loop) in %f = ashr i32 %sext.us, 24 +; CHECK-NO-LCSSA: %sub.cond.us = ((-1 * %sub.us) + {0,+,1}<%test2.loop>) (post-inc with loop %test2.loop) in %sext.us = mul i32 %mul.us, %sub.cond.us define i32 @test2() { entry: br label %test2.loop diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll index ad3d1e0bd110..6b88f09e936f 100644 --- a/test/Analysis/ScalarEvolution/different-loops-recs.ll +++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -220,7 +220,8 @@ exit: ; Mix of previous use cases that demonstrates %s3 can be incorrectly treated as ; a recurrence of loop1 because of operands order if we pick recurrencies in an -; incorrect order. +; incorrect order. It also shows that we cannot safely fold v1 (SCEVUnknown) +; because we cannot prove for sure that it doesn't use Phis of loop 2. define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { @@ -228,9 +229,9 @@ define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { ; CHECK: %v1 = load i32, i32* %p ; CHECK-NEXT: --> %v1 ; CHECK: %s1 = add i32 %phi1, %v1 -; CHECK-NEXT: --> {(%a + %v1),+,1}<%loop1> +; CHECK-NEXT: --> ({%a,+,1}<%loop1> + %v1) ; CHECK: %s2 = add i32 %s1, %b -; CHECK-NEXT: --> {(%a + %b + %v1),+,1}<%loop1> +; CHECK-NEXT: --> ({(%a + %b),+,1}<%loop1> + %v1) ; CHECK: %s3 = add i32 %s2, %phi2 ; CHECK-NEXT: --> ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1) @@ -452,3 +453,60 @@ exit: %s6 = add i32 %phi3, %phi2 ret void } + +; Make sure that a complicated Phi does not get folded with rec's start value +; of a loop which is above. +define void @test_08() { + +; CHECK-LABEL: Classifying expressions for: @test_08 +; CHECK: %tmp11 = add i64 %iv.2.2, %iv.2.1 +; CHECK-NEXT: --> ({0,+,-1}<%loop_2> + %iv.2.1) +; CHECK: %tmp12 = trunc i64 %tmp11 to i32 +; CHECK-NEXT: --> (trunc i64 ({0,+,-1}<%loop_2> + %iv.2.1) to i32) +; CHECK: %tmp14 = mul i32 %tmp12, %tmp7 +; CHECK-NEXT: --> ((trunc i64 ({0,+,-1}<%loop_2> + %iv.2.1) to i32) * {-1,+,-1}<%loop_1>) +; CHECK: %tmp16 = mul i64 %iv.2.1, %iv.1.1 +; CHECK-NEXT: --> ({2,+,1}<%loop_1> * %iv.2.1) + +entry: + br label %loop_1 + +loop_1: + %iv.1.1 = phi i64 [ 2, %entry ], [ %iv.1.1.next, %loop_1_back_branch ] + %iv.1.2 = phi i32 [ -1, %entry ], [ %iv.1.2.next, %loop_1_back_branch ] + br label %loop_1_exit + +dead: + br label %loop_1_exit + +loop_1_exit: + %tmp5 = icmp sgt i64 %iv.1.1, 2 + br i1 %tmp5, label %loop_2_preheader, label %loop_1_back_branch + +loop_1_back_branch: + %iv.1.1.next = add nuw nsw i64 %iv.1.1, 1 + %iv.1.2.next = add nsw i32 %iv.1.2, 1 + br label %loop_1 + +loop_2_preheader: + %tmp6 = sub i64 1, %iv.1.1 + %tmp7 = trunc i64 %tmp6 to i32 + br label %loop_2 + +loop_2: + %iv.2.1 = phi i64 [ 0, %loop_2_preheader ], [ %tmp16, %loop_2 ] + %iv.2.2 = phi i64 [ 0, %loop_2_preheader ], [ %iv.2.2.next, %loop_2 ] + %iv.2.3 = phi i64 [ 2, %loop_2_preheader ], [ %iv.2.3.next, %loop_2 ] + %tmp11 = add i64 %iv.2.2, %iv.2.1 + %tmp12 = trunc i64 %tmp11 to i32 + %tmp14 = mul i32 %tmp12, %tmp7 + %tmp16 = mul i64 %iv.2.1, %iv.1.1 + %iv.2.3.next = add nuw nsw i64 %iv.2.3, 1 + %iv.2.2.next = add nsw i64 %iv.2.2, -1 + %tmp17 = icmp slt i64 %iv.2.3.next, %iv.1.1 + br i1 %tmp17, label %loop_2, label %exit + +exit: + %tmp10 = add i32 %iv.1.2, 3 + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll 
b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 71ea9d54f647..0298315a5510 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -43,7 +43,7 @@ define [1 x double] @constant() { ; The key problem here is that we may fail to create an MBB referenced by a ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things ; happen. -; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg4, %vreg2; mem:ST4[%addr] GPR:%vreg4,%vreg2 (in function: pending_phis) +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg5, %vreg2; mem:ST4[%addr] GPR:%vreg5,%vreg2 (in function: pending_phis) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis: define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) { diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll index 3ecdb7bbedfb..0972840de47b 100644 --- a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll +++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll @@ -1,10 +1,10 @@ ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -O0 -aarch64-enable-global-isel-at-O=0 \ -; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix NOFALLBACK ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=2 \ -; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix FALLBACK ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -global-isel \ @@ -32,6 +32,7 @@ ; ENABLED: IRTranslator ; ENABLED-NEXT: Legalizer ; ENABLED-NEXT: RegBankSelect +; ENABLED-O0-NEXT: Localizer ; ENABLED-NEXT: InstructionSelect ; ENABLED-NEXT: ResetMachineFunction diff --git a/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir new file mode 100644 index 000000000000..ea8a77ca3917 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir @@ -0,0 +1,96 @@ +# RUN: llc -O0 -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \ +# RUN: -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPTNONE %s +# RUN: llc -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \ +# RUN: -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +# +# Check that we are only running the localizer at O0 and that it runs +# between the regbankselect pass and the instruction-select. +# Moreover, check that it does what we expect. +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-apple-ios" + + define float @foo(float %arg, i1 %cond) { + br i1 %cond, label %true, label %false + + true: ; preds = %0 + br label %end + + false: ; preds = %0 + br label %end + + end: ; preds = %false, %true + %val = phi float [ 1.000000e+00, %true ], [ 2.000000e+00, %false ] + %res = fadd float %arg, %val + ret float %res + } + +... 
+--- +# CHECK-LABEL: name: foo +name: foo +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +# CHECK-NEXT: - { id: 5, class: fpr } +# The localizer will create two new values to materialize the constants. +# OPTNONE-NEXT: - { id: 6, class: fpr } +# OPTNONE-NEXT: - { id: 7, class: fpr } + - { id: 0, class: fpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } + +# First block remains untouched +# CHECK: body +# CHECK: %4(s32) = G_FCONSTANT float 1.000000e+00 +# CHECK: %5(s32) = G_FCONSTANT float 2.000000e+00 + +# Second block will get the constant 1.0 when the localizer is enabled. +# CHECK: bb.1.true: +# OPT-NOT: G_FCONSTANT +# OPTNONE: [[FONE:%[0-9]+]](s32) = G_FCONSTANT float 1.000000e+00 +# CHECK: G_BR %bb.3.end + +# Thrid block will get the constant 2.0 when the localizer is enabled. +# CHECK: bb.2.false: +# OPT-NOT: G_FCONSTANT +# OPTNONE: [[FTWO:%[0-9]+]](s32) = G_FCONSTANT float 2.000000e+00 + +# CHECK: bb.3.end +# OPTNONE: %2(s32) = PHI [[FONE]](s32), %bb.1.true, [[FTWO]](s32), %bb.2.false +# OPT: %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false +# CHECK-NEXT: G_FADD %0, %2 +body: | + bb.0 (%ir-block.0): + liveins: %s0, %w0 + + %0(s32) = COPY %s0 + %1(s1) = COPY %w0 + %4(s32) = G_FCONSTANT float 1.000000e+00 + %5(s32) = G_FCONSTANT float 2.000000e+00 + G_BRCOND %1(s1), %bb.1.true + G_BR %bb.2.false + + bb.1.true: + G_BR %bb.3.end + + bb.2.false: + + bb.3.end: + %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false + %3(s32) = G_FADD %0, %2 + %s0 = COPY %3(s32) + RET_ReallyLR implicit %s0 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir new file mode 100644 index 000000000000..8fbb2040157e --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -0,0 +1,312 @@ +# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=localizer -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK + +# Test the localizer. + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @local_use() { ret void } + define void @non_local_1use() { ret void } + define void @non_local_2uses() { ret void } + define void @non_local_phi_use() { ret void } + define void @non_local_phi_use_followed_by_use() { ret void } + define void @non_local_phi_use_followed_by_use_fi() { ret void } + define void @float_non_local_phi_use_followed_by_use_fi() { ret void } +... + +--- +# CHECK-LABEL: name: local_use +name: local_use +legalized: true +regBankSelected: true + +# CHECK: registers: +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 +body: | + bb.0: + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 +... + +--- +# CHECK-LABEL: name: non_local_1use +name: non_local_1use +legalized: true +regBankSelected: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. 
+#CHECK-NEXT: - { id: 3, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + %2(s32) = G_ADD %0, %1 +... + + +--- +# CHECK-LABEL: name: non_local_2uses +name: non_local_2uses +legalized: true +regBankSelected: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 3, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %3 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + %2(s32) = G_ADD %0, %0 +... + +--- +# CHECK-LABEL: name: non_local_phi_use +name: non_local_phi_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_CONSTANT 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %3 +... + +--- +# CHECK-LABEL: name: non_local_phi_use_followed_by_use +name: non_local_phi_use_followed_by_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created regs should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } +#CHECK-NEXT: - { id: 6, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_CONSTANT 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %0 +... 
+ +--- +# CHECK-LABEL: name: non_local_phi_use_followed_by_use_fi +name: non_local_phi_use_followed_by_use_fi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } +#CHECK-NEXT: - { id: 6, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FRAME_INDEX 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_FRAME_INDEX 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_FRAME_INDEX 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FRAME_INDEX 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %0 +... + +--- +# CHECK-LABEL: name: float_non_local_phi_use_followed_by_use_fi +name: float_non_local_phi_use_followed_by_use_fi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: fpr } +#CHECK-NEXT: - { id: 1, class: fpr } +#CHECK-NEXT: - { id: 2, class: fpr } +#CHECK-NEXT: - { id: 3, class: fpr } +#CHECK-NEXT: - { id: 4, class: fpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: fpr } +#CHECK-NEXT: - { id: 6, class: fpr } + +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %1(s32) = G_FADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_FCONSTANT float 1.0 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %2(s32) = G_FADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_FADD %3, %0 +... 
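The new localizer.mir tests above all exercise the same rewrite: a cheap definition (G_CONSTANT, G_FCONSTANT or G_FRAME_INDEX) created in bb.0 is used from a later block, and the pass is expected to materialize one local copy per using block and rewrite those uses (the %3/%5/%6 registers the CHECK lines look for). A rough, self-contained toy model of that rewrite follows; the Inst struct, the isRematerializable helper and the register numbering are invented for the sketch and are not the actual GlobalISel data structures.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy instruction: Def is the value it defines, Ops the values it reads.
struct Inst {
  std::string Block, Def, Opcode;
  std::vector<std::string> Ops;
};

// Only cheap, side-effect-free definitions are worth duplicating per block.
static bool isRematerializable(const Inst &I) {
  return I.Opcode == "G_CONSTANT" || I.Opcode == "G_FCONSTANT" ||
         I.Opcode == "G_FRAME_INDEX";
}

int main() {
  // %0 is defined in bb.0 but used twice in bb.1; after localization bb.1
  // gets a single fresh copy (%3) and both uses are rewritten to it.
  std::vector<Inst> Prog = {
      {"bb.0", "%0", "G_CONSTANT", {}},
      {"bb.0", "%1", "G_ADD", {"%0", "%0"}},
      {"bb.1", "%2", "G_ADD", {"%0", "%0"}},
  };

  std::map<std::string, Inst> DefOf; // value -> defining instruction
  for (const Inst &I : Prog)
    DefOf[I.Def] = I;

  // (block, original value) -> block-local copy created for it, if any.
  std::map<std::pair<std::string, std::string>, std::string> CopyIn;
  int NextReg = static_cast<int>(Prog.size());
  std::vector<Inst> Out;

  for (Inst I : Prog) {
    for (std::string &Op : I.Ops) {
      const Inst &D = DefOf[Op];
      if (D.Block == I.Block || !isRematerializable(D))
        continue; // local use, or too expensive to duplicate: leave it alone
      std::string &Copy = CopyIn[{I.Block, Op}];
      if (Copy.empty()) {
        // First non-local use in this block: materialize one local copy.
        Inst C = D;
        C.Block = I.Block;
        C.Def = "%" + std::to_string(NextReg++);
        Out.push_back(C);
        Copy = C.Def;
      }
      Op = Copy; // rewrite the use to the block-local copy
    }
    Out.push_back(I);
  }

  for (const Inst &I : Out) {
    std::cout << I.Block << ": " << I.Def << " = " << I.Opcode;
    for (const std::string &Op : I.Ops)
      std::cout << " " << Op;
    std::cout << "\n";
  }
}

Run on this non_local_2uses-style input, the sketch prints a single new copy (%3) in bb.1 with both uses rewritten to it, which is the shape the CHECK lines in that test expect; the PHI-use tests additionally keep the PHI operand pointing at the per-predecessor copy rather than at a copy in the PHI's own block.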
diff --git a/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/test/CodeGen/AArch64/aarch64-stp-cluster.ll index fe5abbf15eff..25cf313b81e7 100644 --- a/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: stp_i64_scale:BB#0 diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll index 4930c493d62c..cfb8e3a38c49 100644 --- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll +++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; REQUIRES: asserts @G = external global [0 x i32], align 4 diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index f28d0ab07c5a..f849df2a51ec 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -254,3 +254,10 @@ define void @test_constraint_w(i32 %a) { tail call void asm sideeffect "sqxtn h0, ${0:s}\0A", "w"(i32 %a) ret void } + +define void @test_inline_modifier_a(i8* %ptr) nounwind { + ; CHECK-LABEL: test_inline_modifier_a: + tail call void asm sideeffect "prfm pldl1keep, ${0:a}\0A", "r"(i8* %ptr) + ; CHECK: prfm pldl1keep, [x0] + ret void +} diff --git a/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 0cfbe5958f4d..64e535ca7499 100644 --- a/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s ; Test ldr clustering. 
; CHECK: ********** MI Scheduling ********** diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll index 41287a17da86..307d1ec1aa8c 100644 --- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s ; ; The Cortex-A53 machine model will cause the MADD instruction to be scheduled ; much higher than the ADD instructions in order to hide latency. When not diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll index fac5f8ad2e9f..82ba18ce72ca 100644 --- a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll @@ -6,7 +6,7 @@ ; the loads to avoid unnecessary stalls. The generic machine model schedules 4 ; loads consecutively for this case and will cause stalls. ; -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; CHECK: ********** MI Scheduling ********** ; CHECK: main:BB#2 ; CHECK: LDR diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll index 0ee74d1f782e..cde62fcb3f95 100644 --- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; For Cortex-A53, shiftable operands that are not actually shifted ; are not needed for an additional two cycles. diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll index 0ec754f97ec7..748a4762d82f 100644 --- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll +++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; Test for bug in misched memory dependency calculation. 
; diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll index 3593668e0156..75f45da0e48f 100644 --- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s @G1 = common global [100 x i32] zeroinitializer, align 4 diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll index ff7a0a8300e2..6b754b0a169e 100644 --- a/test/CodeGen/AArch64/arm64-vabs.ll +++ b/test/CodeGen/AArch64/arm64-vabs.ll @@ -33,7 +33,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: sabdl2_8h: -;CHECK: sabdl2.8h +;CHECK: sabdl.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> @@ -45,7 +45,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sabdl2_4s: -;CHECK: sabdl2.4s +;CHECK: sabdl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -57,7 +57,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sabdl2_2d: -;CHECK: sabdl2.2d +;CHECK: sabdl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -99,7 +99,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: uabdl2_8h: -;CHECK: uabdl2.8h +;CHECK: uabdl.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> @@ -112,7 +112,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: uabdl2_4s: -;CHECK: uabdl2.4s +;CHECK: uabdl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -124,7 +124,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: uabdl2_2d: -;CHECK: uabdl2.2d +;CHECK: uabdl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -561,7 +561,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ;CHECK-LABEL: sabal2_8h: -;CHECK: sabal2.8h +;CHECK: sabal.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp3 = load <8 x i16>, <8 x i16>* %C @@ -575,7 +575,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) 
nounwind { ;CHECK-LABEL: sabal2_4s: -;CHECK: sabal2.4s +;CHECK: sabal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -589,7 +589,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sabal2_2d: -;CHECK: sabal2.2d +;CHECK: sabal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -639,7 +639,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ;CHECK-LABEL: uabal2_8h: -;CHECK: uabal2.8h +;CHECK: uabal.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp3 = load <8 x i16>, <8 x i16>* %C @@ -653,7 +653,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: uabal2_4s: -;CHECK: uabal2.4s +;CHECK: uabal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -667,7 +667,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: uabal2_2d: -;CHECK: uabal2.2d +;CHECK: uabal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll index 9d09251524ea..2a25538250e4 100644 --- a/test/CodeGen/AArch64/arm64-vadd.ll +++ b/test/CodeGen/AArch64/arm64-vadd.ll @@ -318,7 +318,7 @@ define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: uaddw2_8h: -;CHECK: uaddw2.8h +;CHECK: uaddw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -331,7 +331,7 @@ define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: uaddw2_4s: -;CHECK: uaddw2.4s +;CHECK: uaddw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -344,7 +344,7 @@ define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: uaddw2_2d: -;CHECK: uaddw2.2d +;CHECK: uaddw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -387,7 +387,7 @@ define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: saddw2_8h: -;CHECK: saddw2.8h +;CHECK: saddw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -400,7 +400,7 @@ define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: saddw2_4s: -;CHECK: saddw2.4s +;CHECK: saddw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -413,7 +413,7 @@ define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: saddw2_2d: -;CHECK: saddw2.2d +;CHECK: saddw.2d %tmp1 = 
load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll index a7668ec97979..f70ed9a43427 100644 --- a/test/CodeGen/AArch64/arm64-vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -83,7 +83,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmull2_4s: -;CHECK: sqdmull2.4s +;CHECK: sqdmull.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -94,7 +94,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sqdmull2_2d: -;CHECK: sqdmull2.2d +;CHECK: sqdmull.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -324,7 +324,7 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_4s: -;CHECK: sqdmlal2.4s +;CHECK: sqdmlal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -337,7 +337,7 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_2d: -;CHECK: sqdmlal2.2d +;CHECK: sqdmlal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -372,7 +372,7 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_4s: -;CHECK: sqdmlsl2.4s +;CHECK: sqdmlsl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -385,7 +385,7 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_2d: -;CHECK: sqdmlsl2.2d +;CHECK: sqdmlsl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -874,7 +874,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmull2_lane_4s: ;CHECK-NOT: dup -;CHECK: sqdmull2.4s +;CHECK: sqdmull.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -886,7 +886,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sqdmull2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmull2.2d +;CHECK: sqdmull.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -994,7 +994,7 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_lane_4s: ;CHECK-NOT: dup 
-;CHECK: sqdmlal2.4s +;CHECK: sqdmlal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1008,7 +1008,7 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmlal2.2d +;CHECK: sqdmlal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1147,7 +1147,7 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_lane_4s: ;CHECK-NOT: dup -;CHECK: sqdmlsl2.4s +;CHECK: sqdmlsl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1161,7 +1161,7 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmlsl2.2d +;CHECK: sqdmlsl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll index c1c4649bd6a4..6b0fe40b5a09 100644 --- a/test/CodeGen/AArch64/arm64-vshift.ll +++ b/test/CodeGen/AArch64/arm64-vshift.ll @@ -1164,7 +1164,7 @@ define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind { define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: ushll2_8h: -;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1 +;CHECK: ushll.8h v0, {{v[0-9]+}}, #1 %load1 = load <16 x i8>, <16 x i8>* %A %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = zext <8 x i8> %tmp1 to <8 x i16> @@ -1174,7 +1174,7 @@ define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind { define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: ushll2_4s: -;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1 +;CHECK: ushll.4s v0, {{v[0-9]+}}, #1 %load1 = load <8 x i16>, <8 x i16>* %A %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = zext <4 x i16> %tmp1 to <4 x i32> @@ -1184,7 +1184,7 @@ define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind { define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: ushll2_2d: -;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1 +;CHECK: ushll.2d v0, {{v[0-9]+}}, #1 %load1 = load <4 x i32>, <4 x i32>* %A %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = zext <2 x i32> %tmp1 to <2 x i64> @@ -1221,7 +1221,7 @@ define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind { define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sshll2_8h: -;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1 +;CHECK: sshll.8h v0, {{v[0-9]+}}, #1 %load1 = load <16 x i8>, <16 x i8>* %A %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = sext <8 x i8> %tmp1 to <8 x i16> @@ -1231,7 +1231,7 @@ define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind { define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sshll2_4s: -;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1 +;CHECK: sshll.4s v0, {{v[0-9]+}}, #1 %load1 = load <8 x i16>, <8 x i16>* %A %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = sext <4 x i16> %tmp1 to <4 x i32> @@ -1241,7 +1241,7 @@ define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind { define <2 x i64> 
@sshll2_2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sshll2_2d: -;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1 +;CHECK: sshll.2d v0, {{v[0-9]+}}, #1 %load1 = load <4 x i32>, <4 x i32>* %A %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = sext <2 x i32> %tmp1 to <2 x i64> diff --git a/test/CodeGen/AArch64/arm64-vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll index 7af69118347e..6746e49989cb 100644 --- a/test/CodeGen/AArch64/arm64-vsub.ll +++ b/test/CodeGen/AArch64/arm64-vsub.ll @@ -157,7 +157,7 @@ define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: ssubl2_8h: -;CHECK: ssubl2.8h +;CHECK: ssubl.8h %tmp1 = load <16 x i8>, <16 x i8>* %A %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> %ext1 = sext <8 x i8> %high1 to <8 x i16> @@ -172,7 +172,7 @@ define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: ssubl2_4s: -;CHECK: ssubl2.4s +;CHECK: ssubl.4s %tmp1 = load <8 x i16>, <8 x i16>* %A %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> %ext1 = sext <4 x i16> %high1 to <4 x i32> @@ -187,7 +187,7 @@ define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: ssubl2_2d: -;CHECK: ssubl2.2d +;CHECK: ssubl.2d %tmp1 = load <4 x i32>, <4 x i32>* %A %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> %ext1 = sext <2 x i32> %high1 to <2 x i64> @@ -235,7 +235,7 @@ define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: usubl2_8h: -;CHECK: usubl2.8h +;CHECK: usubl.8h %tmp1 = load <16 x i8>, <16 x i8>* %A %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> %ext1 = zext <8 x i8> %high1 to <8 x i16> @@ -250,7 +250,7 @@ define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: usubl2_4s: -;CHECK: usubl2.4s +;CHECK: usubl.4s %tmp1 = load <8 x i16>, <8 x i16>* %A %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> %ext1 = zext <4 x i16> %high1 to <4 x i32> @@ -265,7 +265,7 @@ define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: usubl2_2d: -;CHECK: usubl2.2d +;CHECK: usubl.2d %tmp1 = load <4 x i32>, <4 x i32>* %A %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> %ext1 = zext <2 x i32> %high1 to <2 x i64> @@ -310,7 +310,7 @@ define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: ssubw2_8h: -;CHECK: ssubw2.8h +;CHECK: ssubw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -323,7 +323,7 @@ define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: ssubw2_4s: -;CHECK: ssubw2.4s +;CHECK: ssubw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -336,7 +336,7 @@ define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: ssubw2_2d: -;CHECK: ssubw2.2d +;CHECK: ssubw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = 
load <4 x i32>, <4 x i32>* %B @@ -379,7 +379,7 @@ define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: usubw2_8h: -;CHECK: usubw2.8h +;CHECK: usubw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -392,7 +392,7 @@ define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: usubw2_4s: -;CHECK: usubw2.4s +;CHECK: usubw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -405,7 +405,7 @@ define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: usubw2_2d: -;CHECK: usubw2.2d +;CHECK: usubw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B diff --git a/test/CodeGen/AArch64/asm-print-comments.ll b/test/CodeGen/AArch64/asm-print-comments.ll new file mode 100644 index 000000000000..e997dce23583 --- /dev/null +++ b/test/CodeGen/AArch64/asm-print-comments.ll @@ -0,0 +1,17 @@ +; RUN: llc %s -mtriple=arm64-apple-darwin -o - | FileCheck %s + +; CHECK-LABEL: ; -- Begin function foo +; CHECK: foo: +define hidden i32 @foo() { + entry: + ret i32 30 +} +; CHECK: ; -- End function + +; CHECK-LABEL: ; -- Begin function bar +; CHECK: bar: +define i32 @bar() { + entry: + ret i32 30 +} +; CHECK: ; -- End function diff --git a/test/CodeGen/AArch64/cmpxchg-O0.ll b/test/CodeGen/AArch64/cmpxchg-O0.ll index 8432b15ea523..1bfbcf851c0e 100644 --- a/test/CodeGen/AArch64/cmpxchg-O0.ll +++ b/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -3,10 +3,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_8: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxrb [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1, uxtb ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxrb [[STATUS:w[3-9]]], w2, [x0] +; CHECK: stlxrb [[STATUS]], w2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -18,6 +19,7 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_16: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxrh [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1, uxth ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] @@ -33,10 +35,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_32: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr [[STATUS:w[3-9]]], w2, [x0] +; CHECK: stlxr [[STATUS]], w2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -48,10 +51,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_64: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxr [[OLD:x[0-9]+]], [x0] ; CHECK: cmp [[OLD]], x1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr 
[[STATUS:w[3-9]]], x2, [x0] +; CHECK: stlxr [[STATUS]], x2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{x[0-9]+}}, [[OLD]], x1 diff --git a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll index aa78210fae74..7ef625abab20 100644 --- a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll +++ b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll @@ -2,11 +2,12 @@ ; CHECK-LABEL: cmpxchg_monotonic_32: ; CHECK: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w2, [x0] +; CHECK-NEXT: stlxr [[STATUS]], w2, [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], w1 @@ -27,11 +28,12 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 { ; CHECK: // BB#0: ; CHECK: ldr [[NEW:w[0-9]+]], [x2] ; CHECK-NEXT: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x0] +; CHECK-NEXT: stlxr [[STATUS]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], w1 @@ -51,11 +53,12 @@ define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0 ; CHECK-LABEL: cmpxchg_seq_cst_64: ; CHECK: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:x[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], x1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], x2, [x0] +; CHECK-NEXT: stlxr [[STATUS]], x2, [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], x1 diff --git a/test/CodeGen/AArch64/live-interval-analysis.mir b/test/CodeGen/AArch64/live-interval-analysis.mir index d44300973566..93dfcf507fff 100644 --- a/test/CodeGen/AArch64/live-interval-analysis.mir +++ b/test/CodeGen/AArch64/live-interval-analysis.mir @@ -6,7 +6,7 @@ --- # CHECK-LABEL: ********** INTERVALS ********** # W29 is reserved, so we should only see dead defs -# CHECK-DAG: W29 [0B,0d:{{[0-9]+}})[32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}}) +# CHECK-DAG: W29 [32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}}) # For normal registers like x28 we should see the full intervals # CHECK-DAG: W28 [0B,16r:{{[0-9]+}})[32r,48r:{{[0-9]+}})[48r,48d:{{[0-9]+}}) # CHECK: # End machine code for function reserved_reg_liveness. 
@@ -14,7 +14,7 @@ name: reserved_reg_liveness tracksRegLiveness: true body: | bb.0: - liveins: %x28_fp + liveins: %x28 %6 : xseqpairsclass = COPY %x28_fp %x28_fp = COPY %6 %x28 = COPY %x28 diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index 4c682e594e66..1d8787212579 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -72,55 +72,40 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesea: -; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] -; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc 
{{v[0-7].16b}}, [[VG]] +; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] + ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]] ; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}} ; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]] ; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]] ; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]] ; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]] } define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) { @@ -188,53 +173,65 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesda: -; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] -; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc 
{{v[0-7].16b}}, [[VF]] +; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] + ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]] ; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}} ; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]] ; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]] ; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]] ; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +} + +define void @aes_load_store(<16 x i8> *%p1, <16 x i8> *%p2 , <16 x i8> *%p3) { +entry: + %x1 = alloca <16 x i8>, align 16 + %x2 = alloca <16 x i8>, align 16 + %x3 = alloca <16 x i8>, align 16 + %x4 = alloca <16 x i8>, align 16 + %x5 = alloca <16 x i8>, align 16 + %in1 = load <16 x i8>, <16 x i8>* %p1, align 16 + store <16 x i8> %in1, <16 x i8>* %x1, align 16 + %aese1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in1) #2 + store <16 x i8> %aese1, <16 x i8>* %x2, align 16 + %in2 = load <16 x i8>, <16 x i8>* %p2, align 16 + %aesmc1= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese1) #2 + store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16 + %aese2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in2) #2 + store <16 x i8> %aese2, <16 x i8>* %x4, align 16 + %aesmc2= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese2) #2 + store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16 + ret void + +; CHECK-LABEL: aes_load_store: +; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]] } diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll index a4725c65aa26..f960a3a95fc9 100644 --- a/test/CodeGen/AArch64/optimize-imm.ll +++ b/test/CodeGen/AArch64/optimize-imm.ll @@ -62,3 +62,22 @@ entry: %and = xor i32 %xor, 56 ret i32 %and } + +; Check that, when (and %t1, 129) is transformed to (and %t0, 0), +; (xor %arg, 129) doesn't get transformed to (xor %arg, 0). 
+; +; CHECK-LABEL: PR33100: +; CHECK: mov w[[R0:[0-9]+]], #129 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, x[[R0]] + +define i64 @PR33100(i64 %arg) { +entry: + %alloca0 = alloca i64 + store i64 8, i64* %alloca0, align 4 + %t0 = load i64, i64* %alloca0, align 4 + %t1 = shl i64 %arg, %t0 + %and0 = and i64 %t1, 129 + %xor0 = xor i64 %arg, 129 + %t2 = add i64 %and0, %xor0 + ret i64 %t2 +} diff --git a/test/CodeGen/AArch64/scheduledag-constreg.mir b/test/CodeGen/AArch64/scheduledag-constreg.mir index 23c785504f01..6b83dc715e0a 100644 --- a/test/CodeGen/AArch64/scheduledag-constreg.mir +++ b/test/CodeGen/AArch64/scheduledag-constreg.mir @@ -1,4 +1,4 @@ -# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=misched 2>&1 | FileCheck %s +# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts --- | define void @func() { ret void } diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll index 0e4eb2b5fad9..4fbd8944f032 100644 --- a/test/CodeGen/AArch64/tailcall_misched_graph.ll +++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cyclone -debug-only=misched < %s 2>&1 | FileCheck %s +; RUN: llc -mcpu=cyclone -debug-only=machine-scheduler < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir index 8839ba8e0ab2..0557008ceb4f 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir @@ -5,6 +5,11 @@ entry: ret void } + + define void @test_fconstant() { + entry: + ret void + } ... --- @@ -18,3 +23,18 @@ body: | %0(s32) = G_CONSTANT i32 5 ... + +--- +name: test_fconstant +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fconstant + ; CHECK: %0(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK: %1(s32) = G_FCONSTANT float 7.5 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FCONSTANT float 7.5 +... 
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll new file mode 100644 index 000000000000..791b49f0e143 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfe-combine.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s + +; GCN-LABEL: {{^}}bfe_combine8: +; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8 +; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]] +; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}} +; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]] +; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] +; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx = add i32 %x, %id + %srl = lshr i32 %idx, 8 + %and = and i32 %srl, 255 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_combine16: +; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16 +; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]] +; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}} +; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]] +; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2 +; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] +; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx = add i32 %x, %id + %srl = lshr i32 %idx, 1 + %and = and i32 %srl, 2147450880 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll index 4644800421d8..12cf27b918af 100644 --- a/test/CodeGen/AMDGPU/extload-align.ll +++ b/test/CodeGen/AMDGPU/extload-align.ll @@ -1,4 +1,4 @@ -; RUN: llc -debug-only=misched -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s +; RUN: llc -debug-only=machine-scheduler -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s ; REQUIRES: asserts ; Verify that the extload generated from %eval has the default @@ -20,4 +20,4 @@ define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 { %eval = sext i16 %val to i32 store i32 %eval, i32* %out ret void -} \ No newline at end of file +} diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 6fa26cb38793..9441bf208829 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC -; 
RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_alignment = 4 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll new file mode 100644 index 000000000000..22e15e216805 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i64 @llvm.amdgcn.s.getpc() #0 + +; GCN-LABEL: {{^}}test_s_getpc: +; GCN: s_load_dwordx2 +; GCN-DAG: s_getpc_b64 s{{\[[0-9]+:[0-9]+\]}} +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @test_s_getpc(i64 addrspace(1)* %out) #0 { + %tmp = call i64 @llvm.amdgcn.s.getpc() #1 + store volatile i64 %tmp, i64 addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index 5dd2efdf6382..72fde04ba391 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i16: ; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll index 6e56b9f9b6d6..bdfc3caf9d01 100644 --- a/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i8: diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index dcdd1a947cd4..e3415b9c47de 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GCN-HSA,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll index 71adf090532f..fc0cbf916b52 100644 --- a/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i8: diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll index bbbb34e8d333..7de3f3b28c6d 100644 --- a/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: ; GCN: ds_read_u16 v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll index 731996ec6c45..16eb366a4b15 100644 --- a/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i8: diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index e85a724c1567..60e43f8fb2a7 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn 
-mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index a90f200f79e3..190d2b72ebaf 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll index acf7a66a357f..67431e6a4825 100644 --- a/test/CodeGen/AMDGPU/r600.bitcast.ll +++ b/test/CodeGen/AMDGPU/r600.bitcast.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; This test just checks that the compiler doesn't crash. diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir index c71de87eeece..3a20ec732e5b 100644 --- a/test/CodeGen/AMDGPU/schedule-regpressure.mir +++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s +# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # Check there is no SReg_32 pressure created by DS_* instructions because of M0 use diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll index add90e9c2f3a..f63719d62a84 100644 --- a/test/CodeGen/AMDGPU/setcc.ll +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll new file mode 100644 index 000000000000..1cdfec9fdb59 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s + +; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) +; Only one shift if expected, GEP shall not produce a separate shift + +; CHECK-LABEL: {{^}}add_const_offset: +; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0 +; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 0xc80, v[[SHL]] +; CHECK-NOT: v_lshl +; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], 
vcc, s{{[0-9]+}}, v[[ADD]] +; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %id, 200 + %shl = shl i32 %add, 2 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +; CHECK-LABEL: {{^}}or_const_offset: +; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0 +; CHECK: v_or_b32_e32 v[[OR:[0-9]+]], 0x1000, v[[SHL]] +; CHECK-NOT: v_lshl +; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]] +; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %add = or i32 %id, 256 + %shl = shl i32 %add, 2 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index ff666cc3653b..edc313ee323b 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index b4355b76016a..44cfdf6398ae 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll index 160e921fc075..f61e524ee2e5 100644 --- a/test/CodeGen/AMDGPU/store-global.ll +++ b/test/CodeGen/AMDGPU/store-global.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}store_i1: ; 
EG: MEM_RAT MSKOR diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll index ab73ada370ea..ce7656adc0b4 100644 --- a/test/CodeGen/AMDGPU/store-private.ll +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}store_i1: ; EG: MOVA_INT diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index f366029fdea2..e7655df15520 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll index 25a700a943d2..e25f2235993f 100644 --- a/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/test/CodeGen/AMDGPU/unknown-processor.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s -; RUN: llc -march=r600 -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s +; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s +; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s ; Should not crash when the processor is not recognized and the ; wavefront size feature not set. diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll index 03cf725601b7..a0aac8c1d9ba 100644 --- a/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -138,3 +138,25 @@ entry: store float %tmp2, float addrspace(1)* %out ret void } + +; The pointer arguments in local address space should not affect promotion to vector. 
+ +; OPT-LABEL: @vector_read_with_local_arg( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 + store i32 0, i32* %x + store i32 1, i32* %y + store i32 2, i32* %z + store i32 3, i32* %w + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 1a0c7fd8e1d6..f4aba880ff76 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -620,6 +620,360 @@ entry: ret float %r } +declare arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32]) + +define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) { +; CHECK-LABEL: name: test_tiny_int_arrays +; CHECK: liveins: %r0, %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR:%[0-9]+]](s64) = COPY [[ARG_ARR2]] +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BLX @tiny_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[RES_ARR0:%[0-9]+]](s96) = IMPLICIT_DEF +; CHECK: [[RES_ARR1:%[0-9]+]](s96) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0 +; CHECK: [[RES_ARR2:%[0-9]+]](s96) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32 +; CHECK: [[RES_ARR3:%[0-9]+]](s96) = G_INSERT [[RES_ARR2]], [[R2]](s32), 64 +; CHECK: [[RES_ARR:%[0-9]+]](s96) = COPY [[RES_ARR3]] +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 64 +; FIXME: This doesn't seem correct with regard to the AAPCS docs (which say +; that composite types larger than 4 bytes should be passed through memory), +; but it's what DAGISel does. We should fix it in the common code for both. 
+; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1, implicit %r2 +entry: + %r = notail call arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32] %arr) + ret [3 x i32] %r +} + +declare arm_aapcscc void @multiple_int_arrays_target([2 x i32], [2 x i32]) + +define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %arr1) { +; CHECK-LABEL: name: test_multiple_int_arrays +; CHECK: liveins: %r0, %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[ARG_ARR0_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR0_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR0_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = COPY [[ARG_ARR0_2]] +; CHECK: [[ARG_ARR1_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_0]], [[R2]](s32), 0 +; CHECK: [[ARG_ARR1_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_1]], [[R3]](s32), 32 +; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = COPY [[ARG_ARR1_2]] +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 0 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: BLX @multiple_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3 +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BX_RET 14, _ +entry: + notail call arm_aapcscc void @multiple_int_arrays_target([2 x i32] %arr0, [2 x i32] %arr1) + ret void +} + +declare arm_aapcscc void @large_int_arrays_target([20 x i32]) + +define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) { +; CHECK-LABEL: name: test_large_int_arrays +; CHECK: fixedStack: +; The parameters live in separate stack locations, one for each element that +; doesn't fit in the registers. 
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4 +; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 60, size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[ARG_ARR0:%[0-9]+]](s640) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s640) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s640) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR3:%[0-9]+]](s640) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64 +; CHECK: [[ARG_ARR4:%[0-9]+]](s640) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96 +; CHECK: [[ARG_ARR5:%[0-9]+]](s640) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128 +; CHECK: [[ARG_ARR6:%[0-9]+]](s640) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 608 +; CHECK: [[ARG_ARR:%[0-9]+]](s640) = COPY [[ARG_ARR6]] +; CHECK: ADJCALLSTACKDOWN 64, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 96 +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 128 +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 608 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32) +; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4 +; Match the second-to-last offset, so we can get the correct SP for the last element +; CHECK: G_CONSTANT i32 56 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 60 +; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) +; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 +; CHECK: BLX @large_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3 +; CHECK: ADJCALLSTACKUP 64, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BX_RET 14, _ +entry: + notail call arm_aapcscc void @large_int_arrays_target([20 x i32] %arr) + ret void +} + +declare arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double]) + +define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) { +; CHECK-LABEL: name: test_fp_arrays_aapcs +; CHECK: fixedStack: +; CHECK: id: [[ARR2_ID:[0-9]+]], offset: 0, size: 8 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[ARR0_0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[ARR0_1:%[0-9]+]](s32) = COPY %r1 +; LITTLE: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_0]](s32), 0, [[ARR0_1]](s32), 32 +; BIG: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_1]](s32), 0, 
[[ARR0_0]](s32), 32 +; CHECK: [[ARR1_0:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[ARR1_1:%[0-9]+]](s32) = COPY %r3 +; LITTLE: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_0]](s32), 0, [[ARR1_1]](s32), 32 +; BIG: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_1]](s32), 0, [[ARR1_0]](s32), 32 +; CHECK: [[ARR2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[ARR2_ID]] +; CHECK: [[ARR2:%[0-9]+]](s64) = G_LOAD [[ARR2_FI]]{{.*}}load 8 from %fixed-stack.[[ARR2_ID]] +; CHECK: [[ARR_MERGED_0:%[0-9]+]](s192) = IMPLICIT_DEF +; CHECK: [[ARR_MERGED_1:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_0]], [[ARR0]](s64), 0 +; CHECK: [[ARR_MERGED_2:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_1]], [[ARR1]](s64), 64 +; CHECK: [[ARR_MERGED_3:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_2]], [[ARR2]](s64), 128 +; CHECK: [[ARR_MERGED:%[0-9]+]](s192) = COPY [[ARR_MERGED_3]] +; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[ARR0:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 0 +; CHECK: [[ARR1:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 64 +; CHECK: [[ARR2:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 128 +; CHECK: [[ARR0_0:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 0 +; CHECK: [[ARR0_1:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 32 +; LITTLE: %r0 = COPY [[ARR0_0]](s32) +; LITTLE: %r1 = COPY [[ARR0_1]](s32) +; BIG: %r0 = COPY [[ARR0_1]](s32) +; BIG: %r1 = COPY [[ARR0_0]](s32) +; CHECK: [[ARR1_0:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 0 +; CHECK: [[ARR1_1:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 32 +; LITTLE: %r2 = COPY [[ARR1_0]](s32) +; LITTLE: %r3 = COPY [[ARR1_1]](s32) +; BIG: %r2 = COPY [[ARR1_1]](s32) +; BIG: %r3 = COPY [[ARR1_0]](s32) +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[ARR2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[ARR2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[ARR2_OFFSET]](s32) +; CHECK: G_STORE [[ARR2]](s64), [[ARR2_ADDR]](p0){{.*}}store 8 +; CHECK: BLX @fp_arrays_aapcs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R_MERGED_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[R_MERGED_1:%[0-9]+]](s64) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0 +; CHECK: [[R_MERGED_2:%[0-9]+]](s64) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32 +; CHECK: [[R_MERGED:%[0-9]+]](s64) = COPY [[R_MERGED_2]] +; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %r = notail call arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double] %arr) + ret [2 x float] %r +} + +declare arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double], [3 x float], [4 x double]) + +define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3 x float] %y, [4 x double] %z) { +; CHECK-LABEL: name: test_fp_arrays_aapcs_vfp +; CHECK: fixedStack: +; CHECK-DAG: id: [[Z0_ID:[0-9]+]], offset: 0, size: 8 +; CHECK-DAG: id: [[Z1_ID:[0-9]+]], offset: 8, size: 8 +; CHECK-DAG: id: [[Z2_ID:[0-9]+]], offset: 16, size: 8 +; CHECK-DAG: id: [[Z3_ID:[0-9]+]], offset: 24, size: 8 +; CHECK: liveins: %d0, %d1, %d2, %s6, %s7, %s8 +; CHECK: [[X0:%[0-9]+]](s64) = COPY %d0 +; CHECK: [[X1:%[0-9]+]](s64) = COPY %d1 +; CHECK: [[X2:%[0-9]+]](s64) = COPY %d2 +; CHECK: [[Y0:%[0-9]+]](s32) 
= COPY %s6 +; CHECK: [[Y1:%[0-9]+]](s32) = COPY %s7 +; CHECK: [[Y2:%[0-9]+]](s32) = COPY %s8 +; CHECK: [[Z0_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z0_ID]] +; CHECK: [[Z0:%[0-9]+]](s64) = G_LOAD [[Z0_FI]]{{.*}}load 8 +; CHECK: [[Z1_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z1_ID]] +; CHECK: [[Z1:%[0-9]+]](s64) = G_LOAD [[Z1_FI]]{{.*}}load 8 +; CHECK: [[Z2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z2_ID]] +; CHECK: [[Z2:%[0-9]+]](s64) = G_LOAD [[Z2_FI]]{{.*}}load 8 +; CHECK: [[Z3_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z3_ID]] +; CHECK: [[Z3:%[0-9]+]](s64) = G_LOAD [[Z3_FI]]{{.*}}load 8 +; CHECK: [[X_ARR_0:%[0-9]+]](s192) = IMPLICIT_DEF +; CHECK: [[X_ARR_1:%[0-9]+]](s192) = G_INSERT [[X_ARR_0]], [[X0]](s64), 0 +; CHECK: [[X_ARR_2:%[0-9]+]](s192) = G_INSERT [[X_ARR_1]], [[X1]](s64), 64 +; CHECK: [[X_ARR_3:%[0-9]+]](s192) = G_INSERT [[X_ARR_2]], [[X2]](s64), 128 +; CHECK: [[X_ARR:%[0-9]+]](s192) = COPY [[X_ARR_3]](s192) +; CHECK: [[Y_ARR_0:%[0-9]+]](s96) = IMPLICIT_DEF +; CHECK: [[Y_ARR_1:%[0-9]+]](s96) = G_INSERT [[Y_ARR_0]], [[Y0]](s32), 0 +; CHECK: [[Y_ARR_2:%[0-9]+]](s96) = G_INSERT [[Y_ARR_1]], [[Y1]](s32), 32 +; CHECK: [[Y_ARR_3:%[0-9]+]](s96) = G_INSERT [[Y_ARR_2]], [[Y2]](s32), 64 +; CHECK: [[Y_ARR:%[0-9]+]](s96) = COPY [[Y_ARR_3]](s96) +; CHECK: [[Z_ARR_0:%[0-9]+]](s256) = IMPLICIT_DEF +; CHECK: [[Z_ARR_1:%[0-9]+]](s256) = G_INSERT [[Z_ARR_0]], [[Z0]](s64), 0 +; CHECK: [[Z_ARR_2:%[0-9]+]](s256) = G_INSERT [[Z_ARR_1]], [[Z1]](s64), 64 +; CHECK: [[Z_ARR_3:%[0-9]+]](s256) = G_INSERT [[Z_ARR_2]], [[Z2]](s64), 128 +; CHECK: [[Z_ARR_4:%[0-9]+]](s256) = G_INSERT [[Z_ARR_3]], [[Z3]](s64), 192 +; CHECK: [[Z_ARR:%[0-9]+]](s256) = COPY [[Z_ARR_4]](s256) +; CHECK: ADJCALLSTACKDOWN 32, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[X0:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 0 +; CHECK: [[X1:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 64 +; CHECK: [[X2:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 128 +; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 0 +; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 32 +; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 64 +; CHECK: [[Z0:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 0 +; CHECK: [[Z1:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 64 +; CHECK: [[Z2:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 128 +; CHECK: [[Z3:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 192 +; CHECK: %d0 = COPY [[X0]](s64) +; CHECK: %d1 = COPY [[X1]](s64) +; CHECK: %d2 = COPY [[X2]](s64) +; CHECK: %s6 = COPY [[Y0]](s32) +; CHECK: %s7 = COPY [[Y1]](s32) +; CHECK: %s8 = COPY [[Y2]](s32) +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z0_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[Z0_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z0_OFFSET]](s32) +; CHECK: G_STORE [[Z0]](s64), [[Z0_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z1_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 8 +; CHECK: [[Z1_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z1_OFFSET]](s32) +; CHECK: G_STORE [[Z1]](s64), [[Z1_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 16 +; CHECK: [[Z2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z2_OFFSET]](s32) +; CHECK: G_STORE [[Z2]](s64), [[Z2_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z3_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 24 +; CHECK: [[Z3_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z3_OFFSET]](s32) +; CHECK: G_STORE [[Z3]](s64), [[Z3_ADDR]](p0){{.*}}store 8 +; CHECK: BLX @fp_arrays_aapcs_vfp_target, csr_aapcs, 
implicit-def %lr, implicit %sp, implicit %d0, implicit %d1, implicit %d2, implicit %s6, implicit %s7, implicit %s8, implicit-def %s0, implicit-def %s1, implicit-def %s2, implicit-def %s3 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %s2 +; CHECK: [[R3:%[0-9]+]](s32) = COPY %s3 +; CHECK: [[R_MERGED_0:%[0-9]+]](s128) = IMPLICIT_DEF +; CHECK: [[R_MERGED_1:%[0-9]+]](s128) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0 +; CHECK: [[R_MERGED_2:%[0-9]+]](s128) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32 +; CHECK: [[R_MERGED_3:%[0-9]+]](s128) = G_INSERT [[R_MERGED_2]], [[R2]](s32), 64 +; CHECK: [[R_MERGED_4:%[0-9]+]](s128) = G_INSERT [[R_MERGED_3]], [[R3]](s32), 96 +; CHECK: [[R_MERGED:%[0-9]+]](s128) = COPY [[R_MERGED_4]] +; CHECK: ADJCALLSTACKUP 32, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 96 +; CHECK: %s0 = COPY [[R0]] +; CHECK: %s1 = COPY [[R1]] +; CHECK: %s2 = COPY [[R2]] +; CHECK: %s3 = COPY [[R3]] +; CHECK: BX_RET 14, _, implicit %s0, implicit %s1, implicit %s2, implicit %s3 +entry: + %r = notail call arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double] %x, [3 x float] %y, [4 x double] %z) + ret [4 x float] %r +} + +declare arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr) + +define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) { +; CHECK-LABEL: name: test_tough_arrays +; CHECK: fixedStack: +; The parameters live in separate stack locations, one for each element that +; doesn't fit in the registers. 
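+; Here the [6 x [4 x i32]] argument flattens to 24 i32 values: the first four
+; are passed in r0-r3 and the remaining 20 get 4-byte stack slots at offsets 0
+; through 76, so the call below reserves 80 bytes with ADJCALLSTACKDOWN.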
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4 +; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 76, size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[ARG_ARR0:%[0-9]+]](s768) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s768) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s768) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR3:%[0-9]+]](s768) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64 +; CHECK: [[ARG_ARR4:%[0-9]+]](s768) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96 +; CHECK: [[ARG_ARR5:%[0-9]+]](s768) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128 +; CHECK: [[ARG_ARR6:%[0-9]+]](s768) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 736 +; CHECK: [[ARG_ARR:%[0-9]+]](s768) = COPY [[ARG_ARR6]] +; CHECK: ADJCALLSTACKDOWN 80, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 96 +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 128 +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 736 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32) +; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4 +; Match the second-to-last offset, so we can get the correct SP for the last element +; CHECK: G_CONSTANT i32 72 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 76 +; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) +; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 +; CHECK: BLX @tough_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[RES_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[RES_ARR1:%[0-9]+]](s64) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0 +; CHECK: [[RES_ARR2:%[0-9]+]](s64) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32 +; CHECK: [[RES_ARR:%[0-9]+]](s64) = COPY [[RES_ARR2]] +; CHECK: ADJCALLSTACKUP 80, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %r = notail call arm_aapcscc [2 x i32*] 
@tough_arrays_target([6 x [4 x i32]] %arr) + ret [2 x i32*] %r +} + define i32 @test_shufflevector_s32_v2s32(i32 %arg) { ; CHECK-LABEL: name: test_shufflevector_s32_v2s32 ; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll index e3680ed2b929..ef30cb1063f8 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -45,11 +45,13 @@ define half @test_half(half %a, half %b) { ret half %res } -; On ARM, clang lowers structs to arrays. -define void @test_arrays([2 x i32] %this.could.come.from.a.struct) { -; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])* -; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays - ret void +declare [16 x i32] @ret_demotion_target() + +define [16 x i32] @test_ret_demotion() { +; CHECK: remark: {{.*}} unable to translate instruction: call{{.*}} @ret_demotion_target +; CHECK-LABEL: warning: Instruction selection used fallback path for test_ret_demotion + %res = call [16 x i32] @ret_demotion_target() + ret [16 x i32] %res } define void @test_structs({i32, i32} %struct) { diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll index 9cce19417047..1985ff9b4a27 100644 --- a/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -656,6 +656,9 @@ declare double @llvm.pow.f64(double, double) ; ; DISABLE: pop ; +; FIXME: This is flaky: it passes by finding 'bl' somewhere amongst the debug +; info (like labels named 'line_table'), not because it has found a bl instruction. +; ; CHECK: bl define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %tmp) "no-frame-pointer-elim"="true" { bb: @@ -681,7 +684,9 @@ bb13: ; preds = %bb3, %bb !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !4, globals: !2, imports: !2) !1 = !DIFile(filename: "a.cpp", directory: "b") !2 = !{} !3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{!5} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/test/CodeGen/ARM/constantpool-promote-dbg.ll b/test/CodeGen/ARM/constantpool-promote-dbg.ll index ae765d26dcac..84386d2975f0 100644 --- a/test/CodeGen/ARM/constantpool-promote-dbg.ll +++ b/test/CodeGen/ARM/constantpool-promote-dbg.ll @@ -1,4 +1,4 @@ -; RUN: llc -relocation-model=static < %s | FileCheck %s +; RUN: llc -relocation-model=static -arm-promote-constant < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv7m--linux-gnu" diff --git a/test/CodeGen/ARM/constantpool-promote-ldrh.ll b/test/CodeGen/ARM/constantpool-promote-ldrh.ll index 9e369dc08c4b..59970495874b 100644 --- a/test/CodeGen/ARM/constantpool-promote-ldrh.ll +++ b/test/CodeGen/ARM/constantpool-promote-ldrh.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -O0 -fast-isel=false | FileCheck %s -; RUN: llc < %s -O0 -fast-isel=false -filetype=obj +; RUN: llc < %s -O0 -fast-isel=false -arm-promote-constant | FileCheck %s +; RUN: llc < %s -O0 -fast-isel=false -filetype=obj -arm-promote-constant target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv6m-arm-linux-gnueabi" diff --git a/test/CodeGen/ARM/constantpool-promote.ll b/test/CodeGen/ARM/constantpool-promote.ll index 8df7e100c051..d5361f33a98b 100644 --- a/test/CodeGen/ARM/constantpool-promote.ll +++ b/test/CodeGen/ARM/constantpool-promote.ll @@ -1,15 +1,15 @@ -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M @.str = private unnamed_addr constant [2 x i8] c"s\00", align 1 @.str1 = private unnamed_addr constant [69 x i8] c"this string is far too long to fit in a literal pool by far and away\00", align 1 diff --git a/test/CodeGen/ARM/cortexr52-misched-basic.ll b/test/CodeGen/ARM/cortexr52-misched-basic.ll index 3ccb34d9fc90..eb2c29a3a5d1 100644 --- a/test/CodeGen/ARM/cortexr52-misched-basic.ll +++ b/test/CodeGen/ARM/cortexr52-misched-basic.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED -; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC ; ; Check the latency for instructions for both generic and cortex-r52. ; Cortex-r52 machine model will cause the div to be sceduled before eor diff --git a/test/CodeGen/ARM/fastisel-thumb-litpool.ll b/test/CodeGen/ARM/fastisel-thumb-litpool.ll index aa9e7260fb2e..53653a5a4f57 100644 --- a/test/CodeGen/ARM/fastisel-thumb-litpool.ll +++ b/test/CodeGen/ARM/fastisel-thumb-litpool.ll @@ -5,6 +5,7 @@ ; hence the CHECK-NOT. define i32 @test_thumb_ldrlit() minsize { +; CHECK-LABEL: test_thumb_ldrlit: ; CHECK: ldr r0, LCPI0_0 ; CHECK-NOT: ldr ret i32 12345678 diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index fb204debf612..b447497b270a 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -35,6 +35,8 @@ entry: ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK-T1-LABEL: t1: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false) ret void } @@ -51,6 +53,8 @@ entry: ; CHECK: str [[REG2]], [r0] ; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] +; CHECK-T1-LABEL: t2: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) ret void } @@ -62,6 +66,8 @@ entry: ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: vldr d{{[0-9]+}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] +; CHECK-T1-LABEL: t3: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) ret void } @@ -72,6 +78,8 @@ entry: ; CHECK: vld1.64 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] ; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]! 
; CHECK: strh [[REG5:r[0-9]+]], [r0] +; CHECK-T1-LABEL: t4: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) ret void } @@ -87,10 +95,7 @@ entry: ; CHECK: movt [[REG7:r[0-9]+]], #22866 ; CHECK: str [[REG7]] ; CHECK-T1-LABEL: t5: -; CHECK-T1: movs [[TREG3:r[0-9]]], -; CHECK-T1: strb [[TREG3]], -; CHECK-T1: movs [[TREG4:r[0-9]]], -; CHECK-T1: strb [[TREG4]], +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) ret void } diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index b86874692aca..b2bd257701d3 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -1,22 +1,36 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s -check-prefix=CHECK-7A +; RUN: llc < %s -mtriple=thumbv6m -pre-RA-sched=source -disable-post-ra -mattr=+strict-align | FileCheck %s -check-prefix=CHECK-6M define void @t1(i8* nocapture %c) nounwind optsize { entry: -; CHECK-LABEL: t1: -; CHECK: movs r1, #0 -; CHECK: strd r1, r1, [r0] -; CHECK: str r1, [r0, #8] +; CHECK-7A-LABEL: t1: +; CHECK-7A: movs r1, #0 +; CHECK-7A: strd r1, r1, [r0] +; CHECK-7A: str r1, [r0, #8] +; CHECK-6M-LABEL: t1: +; CHECK-6M: movs r1, #0 +; CHECK-6M: str r1, [r0] +; CHECK-6M: str r1, [r0, #4] +; CHECK-6M: str r1, [r0, #8] call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) ret void } define void @t2() nounwind ssp { entry: -; CHECK-LABEL: t2: -; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 -; CHECK: movs r1, #10 -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1 -; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK-7A-LABEL: t2: +; CHECK-7A: vmov.i32 {{q[0-9]+}}, #0x0 +; CHECK-7A: movs r1, #10 +; CHECK-7A: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1 +; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK-6M-LABEL: t2: +; CHECK-6M: movs [[REG:r[0-9]+]], #0 +; CHECK-6M: str [[REG]], [sp, #20] +; CHECK-6M: str [[REG]], [sp, #16] +; CHECK-6M: str [[REG]], [sp, #12] +; CHECK-6M: str [[REG]], [sp, #8] +; CHECK-6M: str [[REG]], [sp, #4] +; CHECK-6M: str [[REG]], [sp] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) @@ -24,6 +38,56 @@ entry: ret void } +define void @t3(i8* %p) { +entry: +; CHECK-7A-LABEL: t3: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t3: +; CHECK-6M-NOT: muls +; CHECK-6M: strb [[REG:r[0-9]+]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @t4(i8* %p) { +entry: +; CHECK-7A-LABEL: t4: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t4: +; CHECK-6M: muls [[REG:r[0-9]+]], +; CHECK-6M: strh [[REG]], +; CHECK-6M: strh [[REG]], + br label %for.body + 
+for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + declare void @something(i8*) nounwind declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll index 330252a90d7c..53f8b8d15042 100644 --- a/test/CodeGen/ARM/misched-copy-arm.ll +++ b/test/CodeGen/ARM/misched-copy-arm.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s +; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=machine-scheduler -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s ; ; Loop counter copies should be eliminated. ; There is also a MUL here, but we don't care where it is scheduled. diff --git a/test/CodeGen/ARM/misched-fp-basic.ll b/test/CodeGen/ARM/misched-fp-basic.ll index 27ad2cec34fd..2f672b0cb540 100644 --- a/test/CodeGen/ARM/misched-fp-basic.ll +++ b/test/CodeGen/ARM/misched-fp-basic.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 -; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT -; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 ; ; Check the latency of instructions for processors with sched-models diff --git a/test/CodeGen/ARM/misched-int-basic-thumb2.mir b/test/CodeGen/ARM/misched-int-basic-thumb2.mir index 86ef1e26f636..32d1e03d9a1b 100644 --- a/test/CodeGen/ARM/misched-int-basic-thumb2.mir +++ b/test/CodeGen/ARM/misched-int-basic-thumb2.mir @@ -1,10 +1,10 @@ # Basic machine sched model test for Thumb2 int instructions # RUN: llc -o /dev/null %s -mtriple=thumbv7-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT # RUN: llc -o /dev/null %s -mtriple=thumbv7--eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 # RUN: llc -o 
/dev/null %s -mtriple=thumbv8r-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/misched-int-basic.mir b/test/CodeGen/ARM/misched-int-basic.mir index f237c0a07b2e..d5231269d732 100644 --- a/test/CodeGen/ARM/misched-int-basic.mir +++ b/test/CodeGen/ARM/misched-int-basic.mir @@ -1,9 +1,9 @@ # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/single-issue-r52.mir b/test/CodeGen/ARM/single-issue-r52.mir index 6c95f7603e6e..1eba074dafb3 100644 --- a/test/CodeGen/ARM/single-issue-r52.mir +++ b/test/CodeGen/ARM/single-issue-r52.mir @@ -1,5 +1,5 @@ -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll index 81b22ee12cdd..c08ed81d042a 100644 --- a/test/CodeGen/ARM/vcombine.ll +++ b/test/CodeGen/ARM/vcombine.ll @@ -99,7 +99,9 @@ define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind { define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind { ; CHECK: vget_high8 ; CHECK-NOT: vst -; CHECK-LE: vmov r0, r1, d17 +; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0] +; CHECK-LE: vldr d16, [r0, #8] +; CHECK-LE: vmov r0, r1, d16 ; CHECK-BE: vmov r1, r0, d16 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll index e44e757a3169..5742dc314978 100644 --- a/test/CodeGen/ARM/vext.ll +++ b/test/CodeGen/ARM/vext.ll @@ -199,10 +199,10 @@ 
define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: test_undef: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vzip.16 d19, d16 -; CHECK-NEXT: vmov r0, r1, d19 +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0, #8] +; CHECK-NEXT: vzip.16 d17, d16 +; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B diff --git a/test/CodeGen/Hexagon/post-ra-kill-update.mir b/test/CodeGen/Hexagon/post-ra-kill-update.mir index c43624d7a8d3..ac46a70a68a5 100644 --- a/test/CodeGen/Hexagon/post-ra-kill-update.mir +++ b/test/CodeGen/Hexagon/post-ra-kill-update.mir @@ -6,7 +6,7 @@ # CHECK-LABEL: name: foo # Check for no-kill of r9 in the first instruction, after reordering: -# CHECK: %d7 = S2_lsr_r_p_or %d7, killed %d1, %r9 +# CHECK: %d7 = S2_lsr_r_p_or killed %d7, killed %d1, %r9 # CHECK: %d13 = S2_lsr_r_p killed %d0, killed %r9 --- | diff --git a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll index 18cca5c356e3..242ee53f19f2 100644 --- a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll +++ b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=misched -o /dev/null 2>&1 | FileCheck %s +; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=machine-scheduler -o /dev/null 2>&1 | FileCheck %s ; Make sure there are no control dependencies between memory operations that ; are trivially disjoint. diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir deleted file mode 100644 index 96801f5b0a37..000000000000 --- a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir +++ /dev/null @@ -1,24 +0,0 @@ -# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s ---- -name: foo -body: | - bb.0: - B %bb.2 - - bb.1: - BX_RET 14, 0 - - bb.2: - Bcc %bb.1, 1, %cpsr - - bb.3: - B %bb.1 - -... 
- -# We should get a single block containing the BX_RET, with no successors at all - -# CHECK: body: -# CHECK-NEXT: bb.0: -# CHECK-NEXT: BX_RET - diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll index b23f1ad37d81..87b6a7aeacf5 100644 --- a/test/CodeGen/MSP430/hwmult16.ll +++ b/test/CodeGen/MSP430/hwmult16.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmult16 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/hwmult32.ll b/test/CodeGen/MSP430/hwmult32.ll index 6ffeb9698862..10c831e77ffb 100644 --- a/test/CodeGen/MSP430/hwmult32.ll +++ b/test/CodeGen/MSP430/hwmult32.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=32bit < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmult32 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll index 51ca4be4a654..c57922ece7d0 100644 --- a/test/CodeGen/MSP430/hwmultf5.ll +++ b/test/CodeGen/MSP430/hwmultf5.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmultf5 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll index a708b89cbd8f..4baf499848fd 100644 --- a/test/CodeGen/MSP430/vararg.ll +++ b/test/CodeGen/MSP430/vararg.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16" target triple = "msp430---elf" diff --git a/test/CodeGen/Nios2/lit.local.cfg b/test/CodeGen/Nios2/lit.local.cfg new file mode 100644 index 000000000000..84c8b039391b --- /dev/null +++ b/test/CodeGen/Nios2/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'Nios2' in config.root.targets: + config.unsupported = True + diff --git a/test/CodeGen/Nios2/target_support.ll b/test/CodeGen/Nios2/target_support.ll new file mode 100644 index 000000000000..90e7020b2fcc --- /dev/null +++ b/test/CodeGen/Nios2/target_support.ll @@ -0,0 +1,11 @@ +; This tests that llc accepts Nios2 target. 
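+; The checks below only verify that the backend is recognized (no 'invalid
+; target' or 'unable to get target' error); the emitted code is not inspected.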
+ +; RUN: not not llc < %s -asm-verbose=false -march=nios2 2>&1 | FileCheck %s --check-prefix=ARCH +; RUN: not not llc < %s -asm-verbose=false -mtriple=nios2 2>&1 | FileCheck %s --check-prefix=TRIPLE + +; ARCH-NOT: invalid target +; TRIPLE-NOT: unable to get target + +define i32 @f(i32 %i) { + ret i32 %i +} diff --git a/test/CodeGen/PowerPC/atomics-constant.ll b/test/CodeGen/PowerPC/atomics-constant.ll new file mode 100644 index 000000000000..a92ca813af85 --- /dev/null +++ b/test/CodeGen/PowerPC/atomics-constant.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "powerpc64le-unknown-linux-gnu" + +@a = constant i64 zeroinitializer + +define i64 @foo() { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis 3, 2, .LC0@toc@ha +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: ld 3, .LC0@toc@l(3) +; CHECK-NEXT: cmpw 7, 4, 4 +; CHECK-NEXT: ld 3, 0(3) +; CHECK-NEXT: bne- 7, .+4 +; CHECK-NEXT: isync +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr +entry: + %value = load atomic i64, i64* @a acquire, align 8 + ret i64 %value +} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index 1bce9d4cb439..c42f677d17ab 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -869,9 +869,9 @@ entry: ; P9LE-LABEL: fromDiffConstsi ; P8BE-LABEL: fromDiffConstsi ; P8LE-LABEL: fromDiffConstsi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -899,9 +899,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAi ; P8BE-LABEL: fromDiffMemConsAi ; P8LE-LABEL: fromDiffMemConsAi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -929,12 +929,12 @@ entry: ; P9LE-LABEL: fromDiffMemConsDi ; P8BE-LABEL: fromDiffMemConsDi ; P8LE-LABEL: fromDiffMemConsDi -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: lxvw4x @@ -1018,13 +1018,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxvx {{v[0-9]+}}, r3, -; P9BE-DAG: lxvx +; P9BE-DAG: lxv {{v[0-9]+}} +; P9BE-DAG: lxv ; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxvx {{v[0-9]+}}, r3, -; P9LE-DAG: lxvx +; P9LE-DAG: lxv {{v[0-9]+}} +; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 @@ -1281,9 +1281,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoi ; P8BE-LABEL: fromDiffConstsConvftoi ; P8LE-LABEL: fromDiffConstsConvftoi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -1303,10 +1303,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvftoi ; P8BE-LABEL: fromDiffMemConsAConvftoi ; P8LE-LABEL: fromDiffMemConsAConvftoi -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE: xvcvspsxws v2, [[REG1]] ; P9BE: blr -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE: xvcvspsxws v2, [[REG1]] ; P9LE: blr ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3 @@ -1341,13 +1341,13 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvftoi ; P8BE-LABEL: fromDiffMemConsDConvftoi ; P8LE-LABEL: fromDiffMemConsDConvftoi -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspsxws ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; 
P9LE: xvcvspsxws ; P9LE: blr @@ -1557,9 +1557,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoi ; P8BE-LABEL: fromDiffConstsConvdtoi ; P8LE-LABEL: fromDiffConstsConvdtoi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -1584,16 +1584,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspsxws v2, v2 -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -2027,9 +2027,9 @@ entry: ; P9LE-LABEL: fromDiffConstsui ; P8BE-LABEL: fromDiffConstsui ; P8LE-LABEL: fromDiffConstsui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2057,9 +2057,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAui ; P8BE-LABEL: fromDiffMemConsAui ; P8LE-LABEL: fromDiffMemConsAui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2087,12 +2087,12 @@ entry: ; P9LE-LABEL: fromDiffMemConsDui ; P8BE-LABEL: fromDiffMemConsDui ; P8LE-LABEL: fromDiffMemConsDui -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: lxvw4x @@ -2177,13 +2177,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxvx {{v[0-9]+}}, r3 -; P9BE-DAG: lxvx +; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9BE-DAG: lxv ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxvx {{v[0-9]+}}, r3 -; P9LE-DAG: lxvx +; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr ; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2 @@ -2439,9 +2439,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoui ; P8BE-LABEL: fromDiffConstsConvftoui ; P8LE-LABEL: fromDiffConstsConvftoui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2461,10 +2461,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvftoui ; P8BE-LABEL: fromDiffMemConsAConvftoui ; P8LE-LABEL: fromDiffMemConsAConvftoui -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE: xvcvspuxws v2, [[REG1]] ; P9BE: blr -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE: xvcvspuxws v2, [[REG1]] ; P9LE: blr ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3 @@ -2499,13 +2499,13 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvftoui ; P8BE-LABEL: fromDiffMemConsDConvftoui ; P8LE-LABEL: fromDiffMemConsDConvftoui -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspuxws ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: xvcvspuxws ; P9LE: blr @@ -2715,9 +2715,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoui ; P8BE-LABEL: fromDiffConstsConvdtoui ; 
P8LE-LABEL: fromDiffConstsConvdtoui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2742,16 +2742,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspuxws v2, v2 -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -3087,9 +3087,9 @@ entry: ; P9LE-LABEL: spltConst1ll ; P8BE-LABEL: spltConst1ll ; P8LE-LABEL: spltConst1ll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3105,9 +3105,9 @@ entry: ; P9LE-LABEL: spltConst16kll ; P8BE-LABEL: spltConst16kll ; P8LE-LABEL: spltConst16kll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3123,9 +3123,9 @@ entry: ; P9LE-LABEL: spltConst32kll ; P8BE-LABEL: spltConst32kll ; P8LE-LABEL: spltConst32kll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3165,9 +3165,9 @@ entry: ; P9LE-LABEL: fromDiffConstsll ; P8BE-LABEL: fromDiffConstsll ; P8LE-LABEL: fromDiffConstsll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3188,9 +3188,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAll ; P8BE-LABEL: fromDiffMemConsAll ; P8LE-LABEL: fromDiffMemConsAll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -3213,9 +3213,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsDll ; P8BE-LABEL: fromDiffMemConsDll ; P8LE-LABEL: fromDiffMemConsDll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x @@ -3275,11 +3275,11 @@ entry: ; P8BE-LABEL: fromDiffMemVarDll ; P8LE-LABEL: fromDiffMemVarDll ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi @@ -3422,9 +3422,9 @@ entry: ; P9LE-LABEL: spltCnstConvftoll ; P8BE-LABEL: spltCnstConvftoll ; P8LE-LABEL: spltCnstConvftoll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3466,9 +3466,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -3705,9 +3705,9 @@ entry: ; P9LE-LABEL: spltCnstConvdtoll ; P8BE-LABEL: spltCnstConvdtoll ; P8LE-LABEL: spltCnstConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3749,9 +3749,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoll ; P8BE-LABEL: fromDiffConstsConvdtoll ; P8LE-LABEL: fromDiffConstsConvdtoll -; 
P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3770,10 +3770,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoll ; P8BE-LABEL: fromDiffMemConsAConvdtoll ; P8LE-LABEL: fromDiffMemConsAConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x @@ -3801,11 +3801,11 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvdtoll ; P8BE-LABEL: fromDiffMemConsDConvdtoll ; P8LE-LABEL: fromDiffMemConsDConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr @@ -3876,12 +3876,12 @@ entry: ; P8BE-LABEL: fromDiffMemVarDConvdtoll ; P8LE-LABEL: fromDiffMemVarDConvdtoll ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr @@ -3991,9 +3991,9 @@ entry: ; P9LE-LABEL: spltConst1ull ; P8BE-LABEL: spltConst1ull ; P8LE-LABEL: spltConst1ull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4009,9 +4009,9 @@ entry: ; P9LE-LABEL: spltConst16kull ; P8BE-LABEL: spltConst16kull ; P8LE-LABEL: spltConst16kull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4027,9 +4027,9 @@ entry: ; P9LE-LABEL: spltConst32kull ; P8BE-LABEL: spltConst32kull ; P8LE-LABEL: spltConst32kull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4069,9 +4069,9 @@ entry: ; P9LE-LABEL: fromDiffConstsull ; P8BE-LABEL: fromDiffConstsull ; P8LE-LABEL: fromDiffConstsull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4092,9 +4092,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAull ; P8BE-LABEL: fromDiffMemConsAull ; P8LE-LABEL: fromDiffMemConsAull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4117,9 +4117,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsDull ; P8BE-LABEL: fromDiffMemConsDull ; P8LE-LABEL: fromDiffMemConsDull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x @@ -4179,11 +4179,11 @@ entry: ; P8BE-LABEL: fromDiffMemVarDull ; P8LE-LABEL: fromDiffMemVarDull ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi @@ -4326,9 +4326,9 @@ entry: ; P9LE-LABEL: spltCnstConvftoull ; P8BE-LABEL: spltCnstConvftoull ; P8LE-LABEL: spltCnstConvftoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4370,9 +4370,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4609,9 +4609,9 @@ entry: ; P9LE-LABEL: spltCnstConvdtoull ; P8BE-LABEL: spltCnstConvdtoull ; P8LE-LABEL: spltCnstConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4653,9 +4653,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoull ; P8BE-LABEL: 
fromDiffConstsConvdtoull ; P8LE-LABEL: fromDiffConstsConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4674,10 +4674,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoull ; P8BE-LABEL: fromDiffMemConsAConvdtoull ; P8LE-LABEL: fromDiffMemConsAConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x @@ -4705,11 +4705,11 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvdtoull ; P8BE-LABEL: fromDiffMemConsDConvdtoull ; P8LE-LABEL: fromDiffMemConsDConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr @@ -4780,12 +4780,12 @@ entry: ; P8BE-LABEL: fromDiffMemVarDConvdtoull ; P8LE-LABEL: fromDiffMemVarDConvdtoull ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr diff --git a/test/CodeGen/PowerPC/livephysregs.mir b/test/CodeGen/PowerPC/livephysregs.mir new file mode 100644 index 000000000000..6b6268778e99 --- /dev/null +++ b/test/CodeGen/PowerPC/livephysregs.mir @@ -0,0 +1,52 @@ +# RUN: llc -o - %s -mtriple=powerpc64le--linux-gnu -run-pass=branch-folder | FileCheck %s +# The branch-folder should merge bb.1 and bb.5 below and therefore recalculate +# the liveins list of the merged block. This test is checking whether this +# recalculated list if okay and contains all the non-saved and saved CSRs. +# CHECK-LABEL: name: func +# CHECK: bb.3: +# CHECK-NEXT: liveins: %x30, %x29, %x3, %x6 +# CHECK: %x4 = RLDICR killed %x6, 16, 47 +# CHECK: %x3 = OR8 killed %x4, killed %x3 +# CHECK: BLR8 implicit %lr8, implicit %rm, implicit %x3 +--- +name: func +tracksRegLiveness: true +fixedStack: + - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' } + - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' } + - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false } +body: | + bb.0: + liveins: %x3, %x5, %x29, %x30 + + %x6 = RLWINM8 %x3, 16, 16, 31 + %x3 = RLDICL killed %x3, 0, 48 + BC undef %cr5lt, %bb.3 + + bb.1: + liveins: %x3, %x6, %x29, %x30 + + %x4 = RLDICR killed %x6, 16, 47 + %x3 = OR8 killed %x4, killed %x3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + + bb.3: + liveins: %x3, %x5, %x6, %x29, %x30 + + dead %x5 = ADD8 %x5, %x6 + BC undef %cr5lt, %bb.1 + + bb.6: + liveins: %x3, %x6, %x29, %x30 + STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1) + STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16) + NOP implicit-def dead %x29 + NOP implicit-def dead %x30 + + %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16) + %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1) + + %x4 = RLDICR killed %x6, 16, 47 + %x3 = OR8 killed %x4, killed %x3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 +... 
diff --git a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll index 329f5bb59cb1..de930af75b2d 100644 --- a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll +++ b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll @@ -21,7 +21,7 @@ entry: ret <16 x i8> %strided.vec ; CHECK-LABEL: @test2 -; CHECK: vsldoi 2, 2, 2, 12 +; CHECK: xxsldwi 34, 34, 34, 3 ; CHECK: blr } diff --git a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll index e3326595d132..fe34bcb85637 100644 --- a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll +++ b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll @@ -6,7 +6,7 @@ define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -45,7 +45,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -54,7 +54,7 @@ entry: define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -93,7 +93,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -102,7 +102,7 @@ entry: define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -141,7 +141,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -150,7 +150,7 @@ entry: define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -189,7 +189,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -198,7 +198,7 @@ entry: define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 
12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -237,7 +237,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -246,7 +246,7 @@ entry: define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -285,7 +285,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -294,7 +294,7 @@ entry: define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -333,7 +333,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -342,7 +342,7 @@ entry: define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -381,7 +381,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -546,7 +546,7 @@ entry: define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -585,7 +585,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -594,7 +594,7 @@ entry: define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -633,7 +633,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: 
xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -642,7 +642,7 @@ entry: define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -681,7 +681,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -690,7 +690,7 @@ entry: define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -729,7 +729,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -738,7 +738,7 @@ entry: define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -777,7 +777,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -786,7 +786,7 @@ entry: define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -825,7 +825,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -834,7 +834,7 @@ entry: define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -873,7 +873,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -882,7 +882,7 @@ entry: define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: 
_Z7testInsILj3ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -921,7 +921,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -972,10 +972,10 @@ define <4 x float> @insertVarF(<4 x float> %a, float %f, i32 %el) { entry: ; CHECK-LABEL: insertVarF ; CHECK: stxsspx 1, -; CHECK: lxvx +; CHECK: lxv ; CHECK-BE-LABEL: insertVarF ; CHECK-BE: stxsspx 1, -; CHECK-BE: lxvx +; CHECK-BE: lxv %vecins = insertelement <4 x float> %a, float %f, i32 %el ret <4 x float> %vecins } @@ -983,10 +983,10 @@ define <4 x i32> @insertVarI(<4 x i32> %a, i32 %i, i32 %el) { entry: ; CHECK-LABEL: insertVarI ; CHECK: stwx -; CHECK: lxvx +; CHECK: lxv ; CHECK-BE-LABEL: insertVarI ; CHECK-BE: stwx -; CHECK-BE: lxvx +; CHECK-BE: lxv %vecins = insertelement <4 x i32> %a, i32 %i, i32 %el ret <4 x i32> %vecins } diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 4a8fd90db3eb..90dd1d84fc23 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; FIXME: li [[R1:r[0-9]+]], 1 ; FIXME: li [[R2:r[0-9]+]], 0 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] -; CHECK-P9: lxvx [[V1:v[0-9]+]] +; CHECK-P9: lxv [[V1:v[0-9]+]] ; CHECK-P9: vadduqm v2, v2, [[V1]] ; CHECK-P9: blr @@ -207,7 +207,7 @@ define <1 x i128> @call_v1i128_increment_by_one() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_one -; CHECK-P9: lxvx +; CHECK-P9: lxv ; CHECK-P9: bl v1i128_increment_by_one ; CHECK-P9: blr @@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxvx v2 -; CHECK-P9-DAG: lxvx v3 +; CHECK-P9-DAG: lxv v2 +; CHECK-P9-DAG: lxv v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr diff --git a/test/CodeGen/PowerPC/pr25157-peephole.ll b/test/CodeGen/PowerPC/pr25157-peephole.ll index 7f959add00f6..aacd64e401f5 100644 --- a/test/CodeGen/PowerPC/pr25157-peephole.ll +++ b/test/CodeGen/PowerPC/pr25157-peephole.ll @@ -65,5 +65,5 @@ L.LB38_2452: ; CHECK-P9-LABEL: @aercalc_ ; CHECK-P9: lfs ; CHECK-P9: xxspltd -; CHECK-P9: stxvx +; CHECK-P9: stxv ; CHECK-P9-NOT: xxswapd diff --git a/test/CodeGen/PowerPC/pr27078.ll b/test/CodeGen/PowerPC/pr27078.ll index b100e3a5ba53..d97008ee5578 100644 --- a/test/CodeGen/PowerPC/pr27078.ll +++ b/test/CodeGen/PowerPC/pr27078.ll @@ -9,11 +9,11 @@ define <4 x float> @bar(float* %p, float* %q) { %6 = shufflevector <12 x float> %5, <12 x float> undef, <4 x i32> ret <4 x float> %6 -; CHECK: vsldoi +; CHECK: xxsldwi ; CHECK-NEXT: vmrghw ; CHECK-NEXT: vmrglw -; CHECK-NEXT: vsldoi -; CHECK-NEXT: vsldoi -; CHECK-NEXT: vsldoi +; CHECK-NEXT: xxsldwi +; CHECK-NEXT: xxsldwi +; CHECK-NEXT: xxsldwi ; CHECK-NEXT: blr } diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index d573441f2cc9..e7640cab6aef 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -33,11 +33,11 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] +; CHECK-P9-DAG: 
lxv [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxvx [[REG5]] +; CHECK-P9: stxv [[REG5]] define void @bar1() { entry: @@ -56,9 +56,9 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxvx [[REG5]] +; CHECK-P9: stxv [[REG5]] diff --git a/test/CodeGen/PowerPC/vec_sldwi.ll b/test/CodeGen/PowerPC/vec_sldwi.ll new file mode 100644 index 000000000000..01537d1f5927 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_sldwi.ll @@ -0,0 +1,307 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-BE + +; Possible LE ShuffleVector masks (Case 1): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2) +; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1) +; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0) +; which targets at: +; xxsldwi a, b, 0 +; xxsldwi a, b, 1 +; xxsldwi a, b, 2 +; xxsldwi a, b, 3 +; Possible LE Swap ShuffleVector masks (Case 2): +; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7) +; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6) +; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5) +; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi b, a, 2 +; xxsldwi b, a, 3 +; Possible LE ShuffleVector masks when a == b, b is undef (Case 3): +; ShuffleVector((vector int)a, vector(int)a, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2) +; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1) +; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0) +; which targets at: +; xxsldwi a, a, 0 +; xxsldwi a, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 + +; Possible BE ShuffleVector masks (Case 4): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4) +; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5) +; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 +; Possible BE Swap ShuffleVector masks (Case 5): +; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7) +; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0) +; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1) +; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi b, a, 2 +; xxsldwi b, a, 3 +; Possible BE ShuffleVector masks when a == b, b is undef (Case 6): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0) +; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1) +; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2) +; which targets at: +; xxsldwi a, a, 0 +; xxsldwi a, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 + +define <4 x i32> @check_le_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x 
i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_0 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_1 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_2 +; CHECK-LE: xxsldwi 34, 34, 35, 2 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_3 +; CHECK-LE: xxsldwi 34, 34, 35, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_0 +; CHECK-LE; vmr 2, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_1 +; CHECK-LE: xxsldwi 34, 35, 34, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_2 +; CHECK-LE: xxsldwi 34, 35, 34, 2 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_3 +; CHECK-LE: xxsldwi 34, 35, 34, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_0(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_0 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_1(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_le_vec_sldwi_va_undef_1 +; CHECK-LE: xxsldwi 34, 34, 34, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_2(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_2 +; CHECK-LE: xxswapd 34, 34 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_3(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_3 +; CHECK-LE: xxsldwi 34, 34, 34, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_0 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_1 +; CHECK-BE: xxsldwi 34, 34, 35, 1 +; CHECK-BE: blr +} + +define <4 x i32> 
@check_be_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_2 +; CHECK-BE: xxsldwi 34, 34, 35, 2 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_3 +; CHECK-BE: xxsldwi 34, 34, 35, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_0 +; CHECK-LE; vmr 2, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_1 +; CHECK-BE: xxsldwi 34, 35, 34, 1 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_2 +; CHECK-BE: xxsldwi 34, 35, 34, 2 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_3 +; CHECK-BE: xxsldwi 34, 35, 34, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_0(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_be_vec_sldwi_va_undef_0 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_1(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_1 +; CHECK-BE: xxsldwi 34, 34, 34, 1 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_2(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_2 +; CHECK-BE: xxswapd 34, 34 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_3(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_3 +; CHECK-BE: xxsldwi 34, 34, 34, 3 +; CHECK-BE: blr +} + +; More test cases to test different types of vector inputs +define <16 x i8> @test_le_vec_sldwi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) { + entry: + %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> + ret <16 x i8> %0 +; CHECK-LE-LABEL: @test_le_vec_sldwi_v16i8_v16i8 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <8 x i16> @test_le_vec_sldwi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) { + entry: + %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> + ret <8 x i16> %0 +; CHECK-LE-LABEL: @test_le_vec_sldwi_v8i16_v8i16 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +; Note here xxpermdi 34, 34, 35, 2 <=> xxsldwi 34, 34, 35, 2 +define <2 x i64> @test_be_vec_sldwi_v2i64_v2i64(<2 x i64> %VA, <2 x i64> %VB) { + entry: + %0 = shufflevector <2 x i64> %VA, <2 x i64> %VB,<2 x i32> + ret <2 x i64> %0 +; CHECK-LE-LABEL: @test_be_vec_sldwi_v2i64_v2i64 +; CHECK-LE: xxpermdi 
34, 34, 35, 2 +; CHECK-LE: blr +} diff --git a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll index acedc2606331..0f0426526cc1 100644 --- a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll +++ b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll @@ -26,82 +26,82 @@ entry: ; CHECK-LABEL: test1 ; CHECK-P9-LABEL: test1 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %0 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vsi to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x i32> %0, <4 x i32>* @res_vsi, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %1 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vui to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x i32> %1, <4 x i32>* @res_vui, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x float>* @vf to i8*)) %3 = bitcast <4 x i32> %2 to <4 x float> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x float> %3, <4 x float>* @res_vf, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %4 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vsll to i8*)) %5 = bitcast <2 x double> %4 to <2 x i64> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x i64> %5, <2 x i64>* @res_vsll, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %6 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vull to i8*)) %7 = bitcast <2 x double> %6 to <2 x i64> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x i64> %7, <2 x i64>* @res_vull, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %8 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x double>* @vd to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x double> %8, <2 x double>* @res_vd, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %9 = load <4 x i32>, <4 x i32>* @vsi, align 16 ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %9, i8* bitcast (<4 x i32>* @res_vsi to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %10 = load <4 x i32>, <4 x i32>* @vui, align 16 ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %10, i8* bitcast (<4 x i32>* @res_vui to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %11 = load <4 x float>, <4 x float>* @vf, align 16 %12 = bitcast <4 x float> %11 to <4 x i32> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %12, i8* bitcast (<4 x float>* @res_vf to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %13 = load <2 x i64>, <2 x i64>* @vsll, align 16 %14 = bitcast <2 x i64> %13 to <2 x double> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %14, i8* bitcast (<2 x i64>* @res_vsll to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %15 = load <2 x i64>, <2 x i64>* @vull, align 16 %16 = bitcast <2 x i64> %15 to <2 x double> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %16, i8* bitcast (<2 x i64>* @res_vull to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %17 = load <2 x double>, <2 x double>* @vd, align 16 ; CHECK: 
stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %17, i8* bitcast (<2 x double>* @res_vd to i8*)) ret void } diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll index d8dd635aab5f..0bbc633363a7 100644 --- a/test/CodeGen/PowerPC/vsx-ldst.ll +++ b/test/CodeGen/PowerPC/vsx-ldst.ll @@ -21,8 +21,8 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O2 \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t -; RUN: grep lxvx < %t | count 6 -; RUN: grep stxvx < %t | count 6 +; RUN: grep lxv < %t | count 6 +; RUN: grep stxv < %t | count 6 @vsi = global <4 x i32> , align 16 diff --git a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll index ba359501ccc5..0c29b6adad77 100644 --- a/test/CodeGen/PowerPC/vsx-p9.ll +++ b/test/CodeGen/PowerPC/vsx-p9.ll @@ -36,109 +36,109 @@ entry: %1 = load <16 x i8>, <16 x i8>* @ucb, align 16 %add.i = add <16 x i8> %1, %0 tail call void (...) @sink(<16 x i8> %add.i) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddubm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %2 = load <16 x i8>, <16 x i8>* @sca, align 16 %3 = load <16 x i8>, <16 x i8>* @scb, align 16 %add.i22 = add <16 x i8> %3, %2 tail call void (...) @sink(<16 x i8> %add.i22) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddubm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %4 = load <8 x i16>, <8 x i16>* @usa, align 16 %5 = load <8 x i16>, <8 x i16>* @usb, align 16 %add.i21 = add <8 x i16> %5, %4 tail call void (...) @sink(<8 x i16> %add.i21) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduhm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %6 = load <8 x i16>, <8 x i16>* @ssa, align 16 %7 = load <8 x i16>, <8 x i16>* @ssb, align 16 %add.i20 = add <8 x i16> %7, %6 tail call void (...) @sink(<8 x i16> %add.i20) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduhm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %8 = load <4 x i32>, <4 x i32>* @uia, align 16 %9 = load <4 x i32>, <4 x i32>* @uib, align 16 %add.i19 = add <4 x i32> %9, %8 tail call void (...) @sink(<4 x i32> %add.i19) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduwm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %10 = load <4 x i32>, <4 x i32>* @sia, align 16 %11 = load <4 x i32>, <4 x i32>* @sib, align 16 %add.i18 = add <4 x i32> %11, %10 tail call void (...) @sink(<4 x i32> %add.i18) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduwm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %12 = load <2 x i64>, <2 x i64>* @ulla, align 16 %13 = load <2 x i64>, <2 x i64>* @ullb, align 16 %add.i17 = add <2 x i64> %13, %12 tail call void (...) @sink(<2 x i64> %add.i17) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddudm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %14 = load <2 x i64>, <2 x i64>* @slla, align 16 %15 = load <2 x i64>, <2 x i64>* @sllb, align 16 %add.i16 = add <2 x i64> %15, %14 tail call void (...) 
@sink(<2 x i64> %add.i16) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddudm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %16 = load <1 x i128>, <1 x i128>* @uxa, align 16 %17 = load <1 x i128>, <1 x i128>* @uxb, align 16 %add.i15 = add <1 x i128> %17, %16 tail call void (...) @sink(<1 x i128> %add.i15) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduqm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %18 = load <1 x i128>, <1 x i128>* @sxa, align 16 %19 = load <1 x i128>, <1 x i128>* @sxb, align 16 %add.i14 = add <1 x i128> %19, %18 tail call void (...) @sink(<1 x i128> %add.i14) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduqm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %20 = load <4 x float>, <4 x float>* @vfa, align 16 %21 = load <4 x float>, <4 x float>* @vfb, align 16 %add.i13 = fadd <4 x float> %20, %21 tail call void (...) @sink(<4 x float> %add.i13) -; CHECK: lxvx 0, 0, 3 -; CHECK: lxvx 1, 0, 4 +; CHECK: lxv 0, 0(3) +; CHECK: lxv 1, 0(4) ; CHECK: xvaddsp 34, 0, 1 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %22 = load <2 x double>, <2 x double>* @vda, align 16 %23 = load <2 x double>, <2 x double>* @vdb, align 16 %add.i12 = fadd <2 x double> %22, %23 tail call void (...) @sink(<2 x double> %add.i12) -; CHECK: lxvx 0, 0, 3 -; CHECK: lxvx 1, 0, 4 +; CHECK: lxv 0, 0(3) +; CHECK: lxv 1, 0(4) ; CHECK: xvadddp 0, 0, 1 -; CHECK: stxvx 0, +; CHECK: stxv 0, ; CHECK: bl sink ret void } diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll index 09bf6830416f..98fe3a813cb7 100644 --- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -23,7 +23,7 @@ define <2 x double> @testi0(<2 x double>* %p1, double* %p2) { ; CHECK-P9-LABEL: testi0 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3 +; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 ; CHECK-P9: xxpermdi 34, [[REG2]], [[REG3]], 1 } @@ -43,7 +43,7 @@ define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { ; CHECK-P9-LABEL: testi1 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3 +; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 ; CHECK-P9: xxmrgld 34, [[REG3]], [[REG2]] } diff --git a/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/test/CodeGen/PowerPC/vsx_shuffle_le.ll index 3bf24adfdd91..cfe201999282 100644 --- a/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ b/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -19,7 +19,7 @@ define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 0 ; CHECK-P9-LABEL: test00 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxspltd 34, 0, 1 } @@ -34,7 +34,7 @@ define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxswapd 34, 0 ; CHECK-P9-LABEL: test01 -; CHECK-P9: lxvx 34, 0, 3 +; CHECK-P9: lxv 34, 0(3) } define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) { @@ -51,8 +51,8 @@ define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrgld 34, 1, 0 ; CHECK-P9-LABEL: @test02 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrgld 34, 1, 0 } @@ -70,8 +70,8 @@ define <2 x double> 
@test03(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 1, 0, 1 ; CHECK-P9-LABEL: @test03 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 1, 0, 1 } @@ -85,7 +85,7 @@ define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: lxvd2x 34, 0, 3 ; CHECK-P9-LABEL: @test10 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxswapd 34, 0 } @@ -100,7 +100,7 @@ define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 1 ; CHECK-P9-LABEL: @test11 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxspltd 34, 0, 0 } @@ -118,8 +118,8 @@ define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 1, 0, 2 ; CHECK-P9-LABEL: @test12 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 1, 0, 2 } @@ -137,8 +137,8 @@ define <2 x double> @test13(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrghd 34, 1, 0 ; CHECK-P9-LABEL: @test13 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrghd 34, 1, 0 } @@ -156,8 +156,8 @@ define <2 x double> @test20(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrgld 34, 0, 1 ; CHECK-P9-LABEL: @test20 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrgld 34, 0, 1 } @@ -175,8 +175,8 @@ define <2 x double> @test21(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 0, 1, 1 ; CHECK-P9-LABEL: @test21 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 0, 1, 1 } @@ -191,7 +191,7 @@ define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 0 ; CHECK-P9-LABEL: @test22 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxspltd 34, 0, 1 } @@ -206,7 +206,7 @@ define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxswapd 34, 0 ; CHECK-P9-LABEL: @test23 -; CHECK-P9: lxvx 34, 0, 4 +; CHECK-P9: lxv 34, 0(4) } define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) { @@ -223,8 +223,8 @@ define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 0, 1, 2 ; CHECK-P9-LABEL: @test30 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 0, 1, 2 } @@ -242,8 +242,8 @@ define <2 x double> @test31(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrghd 34, 0, 1 ; CHECK-P9-LABEL: @test31 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrghd 34, 0, 1 } @@ -257,7 +257,7 @@ define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: lxvd2x 34, 0, 4 ; CHECK-P9-LABEL: @test32 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxswapd 34, 0 } @@ -272,6 +272,6 @@ define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 1 ; CHECK-P9-LABEL: @test33 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxspltd 34, 0, 0 } diff --git a/test/CodeGen/Thumb/machine-cse-physreg.mir b/test/CodeGen/Thumb/machine-cse-physreg.mir new file mode 100644 index 000000000000..5206e89cf779 --- /dev/null +++ b/test/CodeGen/Thumb/machine-cse-physreg.mir @@ -0,0 +1,35 @@ +# RUN: llc -mtriple thumbv5e -run-pass=machine-cse -o - 
%s | FileCheck %s + +# This is a contrived example made to expose a bug in +# MachineCSE, see PR32538. + +# MachineCSE must not remove this def of %cpsr: +# CHECK-LABEL: bb.1: +# CHECK: , %cpsr = tLSLri + +... +--- +name: spam +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } +body: | + bb.0: + liveins: %r0 + %0 = COPY %r0 + %1, %cpsr = tLSLri %0, 2, 14, _ + tCMPi8 %0, 5, 14, _, implicit-def %cpsr + tBcc %bb.8, 8, %cpsr + + bb.1: + %2, %cpsr = tLSLri %0, 2, 14, _ + + bb.8: + liveins: %cpsr + %3 = COPY %cpsr + tSTRi killed %3, %0, 0, 14, _ +... diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index 172a00a7c86f..89cb71a52c04 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm" +; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm" ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s ; rdar://6627786 ; rdar://7792037 diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll index f1ffc15f4d03..870e812bbb69 100644 --- a/test/CodeGen/X86/GlobalISel/memop-vec.ll +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -1,39 +1,116 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 1 ret <4 x i32> %r } define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 16 ret <4 x i32> %r } +define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r +} + +define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r +} + +define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* 
%p1, align 1 + ret <16 x i32> %r +} + +define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r +} + define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 1 ret void } define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 16 ret void } + +define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void +} + +define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 64 + ret void +} + diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir index f925c836f3d1..cc03f3a57f0b 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir @@ -14,7 +14,16 @@ ret void } -... + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + --- name: test_mul_vec256 alignment: 4 @@ -84,3 +93,47 @@ body: | RET 0 ... +--- +name: test_load_v8i32_noalign +# CHECK-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... 
+--- +name: test_store_v8i32_noalign +# CHECK-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir index e0c12ff44a2f..278413ad38ef 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir @@ -15,22 +15,29 @@ ret void } + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + ... --- name: test_mul_vec512 +# CHECK-LABEL: name: test_mul_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_mul_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -41,19 +48,16 @@ body: | ... --- name: test_add_vec512 +# CHECK-LABEL: name: test_add_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_add_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -64,24 +68,65 @@ body: | ... --- name: test_sub_vec512 +# CHECK-LABEL: name: test_sub_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_sub_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): %0(<16 x s32>) = IMPLICIT_DEF %1(<16 x s32>) = G_SUB %0, %0 RET 0 +... +--- + +name: test_load_v16i32_noalign +# CHECK-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... 
+--- +name: test_store_v16i32_noalign +# CHECK-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 ... diff --git a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir new file mode 100644 index 000000000000..539520c0b8f5 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir @@ -0,0 +1,96 @@ +# RUN: llc -mtriple=i586-linux-gnu -mcpu=haswell -mattr=-slow-incdec -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK +# +# This is necessary to test that attribute-based rule predicates work and that +# they properly reset between functions. + +--- | + define i32 @const_i32_1() { + ret i32 1 + } + + define i32 @const_i32_1_optsize() #0 { + ret i32 1 + } + + define i32 @const_i32_1b() { + ret i32 1 + } + + define i32 @const_i32_1_optsizeb() #0 { + ret i32 1 + } + + attributes #0 = { optsize } +... +--- +name: const_i32_1 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsize +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsize +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1b +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1b +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsizeb +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsizeb +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir new file mode 100644 index 000000000000..b9a7e4a8cc4a --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir @@ -0,0 +1,188 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + + +--- | + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define <8 x i32> @test_load_v8i32_align(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void + } + + +... +--- +name: test_load_v8i32_noalign +# ALL-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVUPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVUPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... 
+--- +name: test_load_v8i32_align +# ALL-LABEL: name: test_load_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVAPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVAPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVAPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... +--- +name: test_store_v8i32_noalign +# ALL-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVUPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVUPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... +--- +name: test_store_v8i32_align +# ALL-LABEL: name: test_store_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVAPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVAPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVAPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1) + RET 0 + +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir new file mode 100644 index 000000000000..87978a684d4c --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir @@ -0,0 +1,127 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512F +--- | + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define <16 x i32> @test_load_v16i32_align(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 32 + ret void + } + +... +--- +name: test_load_v16i32_noalign +# AVX512F-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 1) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_load_v16i32_align +# AVX512F-LABEL: name: test_load_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 32) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 32) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_store_v16i32_noalign +# AVX512F-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 + +... 
+--- +name: test_store_v16i32_align +# AVX512F-LABEL: name: test_store_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 32) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 32) + RET 0 + +... diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll index cf514d7aeb31..016ddb9c5e78 100644 --- a/test/CodeGen/X86/avx-vzeroupper.ll +++ b/test/CodeGen/X86/avx-vzeroupper.ll @@ -1,10 +1,8 @@ -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s - -; FAST-YMM-ZMM-NOT: vzeroupper -; BTVER2-NOT: vzeroupper +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512 +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2 declare i32 @foo() declare <4 x float> @do_sse(<4 x float>) @@ -15,43 +13,86 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind ;; Basic checking - don't emit any vzeroupper instruction -; CHECK: _test00 -define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: - ; CHECK-NOT: vzeroupper +define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind { +; ALL-LABEL: test00: +; ALL: # BB#0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL-NEXT: callq do_sse +; ALL-NEXT: popq %rax +; ALL-NEXT: retq %add.i = fadd <4 x float> %a, %b %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind - ; CHECK: ret ret <4 x float> %call3 } ;; Check parameter 256-bit parameter passing -; CHECK: _test01 -define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp { -entry: +define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind { +; VZ-LABEL: test01: +; VZ: # BB#0: +; VZ-NEXT: subq $56, %rsp +; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; VZ-NEXT: addq $56, %rsp +; VZ-NEXT: retq +; +; 
FAST-YMM-ZMM-LABEL: test01: +; FAST-YMM-ZMM: # BB#0: +; FAST-YMM-ZMM-NEXT: subq $56, %rsp +; FAST-YMM-ZMM-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; FAST-YMM-ZMM-NEXT: addq $56, %rsp +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test01: +; BTVER2: # BB#0: +; BTVER2-NEXT: subq $56, %rsp +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; BTVER2-NEXT: addq $56, %rsp +; BTVER2-NEXT: retq %tmp = load <4 x float>, <4 x float>* @x, align 16 - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind store <4 x float> %call, <4 x float>* @x, align 16 - ; CHECK-NOT: vzeroupper - ; CHECK: callq _do_sse %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind store <4 x float> %call2, <4 x float>* @x, align 16 - ; CHECK: ret ret <8 x float> %c } ;; Check that vzeroupper is emitted for tail calls. -; CHECK: _test02 -define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind { +; VZ-LABEL: test02: +; VZ: # BB#0: +; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: jmp do_sse # TAILCALL +; +; NO-VZ-LABEL: test02: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: jmp do_sse # TAILCALL %add.i = fadd <8 x float> %a, %b %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0) - ; CHECK: vzeroupper - ; CHECK: jmp _do_sse %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind ret <4 x float> %call3 } @@ -59,30 +100,113 @@ entry: ;; Test the pass convergence and also that vzeroupper is only issued when necessary, ;; for this function it should be only once -; CHECK: _test03 -define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { +define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test03: +; VZ: # BB#0: # %entry +; VZ-NEXT: pushq %rbx +; VZ-NEXT: subq $16, %rsp +; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_1: # %while.cond +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq foo +; VZ-NEXT: testl %eax, %eax +; VZ-NEXT: jne .LBB3_1 +; VZ-NEXT: # BB#2: # %for.body.preheader +; VZ-NEXT: movl $4, %ebx +; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_3: # %for.body +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq do_sse +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; VZ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: decl %ebx +; VZ-NEXT: jne .LBB3_3 +; VZ-NEXT: # BB#4: # %for.end +; VZ-NEXT: addq $16, %rsp +; VZ-NEXT: popq %rbx +; VZ-NEXT: retq +; +; FAST-YMM-ZMM-LABEL: test03: +; FAST-YMM-ZMM: # BB#0: # %entry +; FAST-YMM-ZMM-NEXT: pushq %rbx +; FAST-YMM-ZMM-NEXT: 
subq $16, %rsp +; FAST-YMM-ZMM-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_1: # %while.cond +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq foo +; FAST-YMM-ZMM-NEXT: testl %eax, %eax +; FAST-YMM-ZMM-NEXT: jne .LBB3_1 +; FAST-YMM-ZMM-NEXT: # BB#2: # %for.body.preheader +; FAST-YMM-ZMM-NEXT: movl $4, %ebx +; FAST-YMM-ZMM-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_3: # %for.body +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; FAST-YMM-ZMM-NEXT: vextractf128 $1, %ymm0, %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: decl %ebx +; FAST-YMM-ZMM-NEXT: jne .LBB3_3 +; FAST-YMM-ZMM-NEXT: # BB#4: # %for.end +; FAST-YMM-ZMM-NEXT: addq $16, %rsp +; FAST-YMM-ZMM-NEXT: popq %rbx +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test03: +; BTVER2: # BB#0: # %entry +; BTVER2-NEXT: pushq %rbx +; BTVER2-NEXT: subq $16, %rsp +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_1: # %while.cond +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq foo +; BTVER2-NEXT: testl %eax, %eax +; BTVER2-NEXT: jne .LBB3_1 +; BTVER2-NEXT: # BB#2: # %for.body.preheader +; BTVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BTVER2-NEXT: movl $4, %ebx +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_3: # %for.body +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: decl %ebx +; BTVER2-NEXT: jne .LBB3_3 +; BTVER2-NEXT: # BB#4: # %for.end +; BTVER2-NEXT: addq $16, %rsp +; BTVER2-NEXT: popq %rbx +; BTVER2-NEXT: retq entry: %add.i = fadd <4 x float> %a, %b br label %while.cond -while.cond: +while.cond: %call = tail call i32 @foo() %tobool = icmp eq i32 %call, 0 br i1 %tobool, label %for.body, label %while.cond for.body: - ; CHECK: LBB - ; CHECK-NOT: vzeroupper %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ] %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ] - ; CHECK: callq _do_sse %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind - ; CHECK-NEXT: callq _do_sse %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind %tmp11 = load <8 x float>, <8 x float>* @g, align 32 %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind %1 = add nsw i32 %i.018, 1 %exitcond = icmp eq i32 %1, 4 @@ -94,15 +218,30 @@ for.end: ;; Check that we also perform vzeroupper when we return from a function. 
-; CHECK: _test04 -define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test04: +; VZ: # BB#0: +; VZ-NEXT: pushq %rax +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VZ-NEXT: callq do_avx +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: popq %rax +; VZ-NEXT: vzeroupper +; VZ-NEXT: retq +; +; NO-VZ-LABEL: test04: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: pushq %rax +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NO-VZ-NEXT: callq do_avx +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: popq %rax +; NO-VZ-NEXT: retq %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> - ; CHECK-NOT: vzeroupper - ; CHECK: call %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> - ; CHECK: vzeroupper - ; CHECK: ret ret <4 x float> %shuf2 } + diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 3337f42eb142..51f9a382ccbf 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2216,9 +2216,9 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { ; ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) diff --git a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll new file mode 100644 index 000000000000..019c5282f63b --- /dev/null +++ b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86_64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following tests check that patterns that includes ;; +;; ctpop intrinsic + select are translated to the vpopcntd/q ;; +;; instruction in a correct way. 
;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) { +; X86_64-LABEL: test_mask_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a + ret <16 x i32> %3 +} + +define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) { +; X86_64-LABEL: test_maskz_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; X86_64-LABEL: test_mask_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86_64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} + +define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) { +; X86_64-LABEL: test_maskz_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) 
diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll b/test/CodeGen/X86/fast-isel-select-cmp.ll index 1af30e9f32fe..4a8e8792f98d 100644 --- a/test/CodeGen/X86/fast-isel-select-cmp.ll +++ b/test/CodeGen/X86/fast-isel-select-cmp.ll @@ -4,9 +4,9 @@ ; different basic blocks. define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) { -; CHECK-LABEL: select_cmp_cmov_i32 +; CHECK-LABEL: select_cmp_cmov_i32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = icmp ult i32 %a, %b br i1 %1, label %continue, label %exit @@ -19,9 +19,9 @@ exit: } define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_oeq_f32 +; CHECK-LABEL: select_fcmp_oeq_f32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = fcmp oeq float %a, %b br i1 %1, label %continue, label %exit @@ -34,7 +34,7 @@ exit: } define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_one_f32 +; CHECK-LABEL: select_fcmp_one_f32: ; CHECK-LABEL: continue ; CHECK-NOT: ucomi %1 = fcmp one float %a, %b diff --git a/test/CodeGen/X86/fp-intrinsics.ll b/test/CodeGen/X86/fp-intrinsics.ll index 88aef6bb0659..0f8d730d7535 100644 --- a/test/CodeGen/X86/fp-intrinsics.ll +++ b/test/CodeGen/X86/fp-intrinsics.ll @@ -103,9 +103,156 @@ if.end: ret double %a.0 } +; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f5 +; CHECK: sqrtsd +define double @f5() { +entry: + %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f6 +; CHECK: pow +define double @f6() { +entry: + %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, + double 3.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f7 +; CHECK: powi +define double @f7() { +entry: + %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, + i32 3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f8 +; CHECK: sin +define double @f8() { +entry: + %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f9 +; CHECK: cos +define double @f9() { +entry: + %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f10 +; CHECK: exp +define double @f10() { +entry: + %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. 
+; CHECK-LABEL: f11 +; CHECK: exp2 +define double @f11() { +entry: + %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f12 +; CHECK: log +define double @f12() { +entry: + %result = call double @llvm.experimental.constrained.log.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f13 +; CHECK: log10 +define double @f13() { +entry: + %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f14 +; CHECK: log2 +define double @f14() { +entry: + %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that rint(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f15 +; CHECK: rint +define double @f15() { +entry: + %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that nearbyint(42.1) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f16 +; CHECK: nearbyint +define double @f16() { +entry: + %result = call double @llvm.experimental.constrained.nearbyint.f64( + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) +declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll index 5ade5b470b54..e7929c9cecdc 100644 --- a/test/CodeGen/X86/hoist-invariant-load.ll +++ b/test/CodeGen/X86/hoist-invariant-load.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mcpu=haswell < %s 
-stats -O2 2>&1 | grep "4 machine-licm.*hoisted" +; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machinelicm.*hoisted" ; For test: ; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_ ; and 1 for objc_msgSend from the GOT diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll index 3e3729285d27..7abd157f147a 100644 --- a/test/CodeGen/X86/misched-copy.ll +++ b/test/CodeGen/X86/misched-copy.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; Test scheduling of copy instructions. ; diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll index 4899a0fc7e88..71d7746642e9 100644 --- a/test/CodeGen/X86/or-branch.ll +++ b/test/CodeGen/X86/or-branch.ll @@ -1,16 +1,34 @@ -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 --check-prefix=CHECK -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind { ; JUMP2-LABEL: foo: -; JUMP2-DAG: jl -; JUMP2-DAG: je +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: jl .LBB0_3 +; JUMP2-NEXT: # BB#1: # %entry +; JUMP2-NEXT: movl {{[0-9]+}}(%esp), %eax +; JUMP2-NEXT: testl %eax, %eax +; JUMP2-NEXT: je .LBB0_3 +; JUMP2-NEXT: # BB#2: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; JUMP2-NEXT: .LBB0_3: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL ; ; JUMP1-LABEL: foo: -; JUMP1-DAG: sete -; JUMP1-DAG: setl -; JUMP1: orb -; JUMP1: jne +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB0_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB0_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 @@ -29,11 +47,33 @@ UnifiedReturnBlock: ; regardless of whether they are expensive or not. 
define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind { -; CHECK-LABEL: unpredictable: -; CHECK-DAG: sete -; CHECK-DAG: setl -; CHECK: orb -; CHECK: jne +; JUMP2-LABEL: unpredictable: +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP2-NEXT: sete %al +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: setl %cl +; JUMP2-NEXT: orb %al, %cl +; JUMP2-NEXT: cmpb $1, %cl +; JUMP2-NEXT: jne .LBB1_1 +; JUMP2-NEXT: # BB#2: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL +; JUMP2-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; +; JUMP1-LABEL: unpredictable: +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB1_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir index 002761bc1e68..956df172b253 100644 --- a/test/CodeGen/X86/pr27681.mir +++ b/test/CodeGen/X86/pr27681.mir @@ -57,7 +57,7 @@ body: | %cl = SETNEr implicit %eflags ; Verify that removal of the %bl antidependence does not use %ch ; as a replacement register. - ; CHECK: %cl = AND8rr %cl, killed %b + ; CHECK: %cl = AND8rr killed %cl, killed %b %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags CMP32ri8 %ebp, -1, implicit-def %eflags %edx = MOV32ri 0 diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll index 2e31154068fc..8570fe7fe7ba 100644 --- a/test/CodeGen/X86/sandybridge-loads.ll +++ b/test/CodeGen/X86/sandybridge-loads.ll @@ -1,13 +1,20 @@ -; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s - -;CHECK-LABEL: wideloads: -;CHECK: vmovaps -;CHECK: vinsertf128 -;CHECK: vmovaps -;CHECK-NOT: vinsertf128 -;CHECK: ret +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: wideloads: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 ; <---- unaligned! %v1 = load <8 x float>, <8 x float>* %b, align 32 ; <---- aligned! 
%m0 = fcmp olt <8 x float> %v1, %v0 @@ -19,17 +26,16 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ret void } -; CHECK: widestores -; loads: -; CHECK: vmovaps -; CHECK: vmovaps -; stores: -; CHECK: vmovaps -; CHECK: vextractf128 -; CHECK: vmovaps -;CHECK: ret - define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: widestores: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 32 %v1 = load <8 x float>, <8 x float>* %b, align 32 store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll index 383ab21bd404..19305d0dad62 100644 --- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll @@ -354,9 +354,8 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) { ; X32-LABEL: test_mm_crc32_u8: ; X32: # BB#0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %cl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32b %cl, %eax +; X32-NEXT: crc32b {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u8: @@ -372,9 +371,8 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) { ; X32-LABEL: test_mm_crc32_u16: ; X32: # BB#0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32w %cx, %eax +; X32-NEXT: crc32w {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u16: diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll index 72542f499087..a00d47bb13e9 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -1651,26 +1651,9 @@ define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) { } declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone -define double @stack_fold_sqrtsd(double %a0) { - ;CHECK-LABEL: stack_fold_sqrtsd - ;CHECK: vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call double @llvm.sqrt.f64(double %a0) - ret double %2 -} -declare double @llvm.sqrt.f64(double) nounwind readnone - +; TODO stack_fold_sqrtsd ; TODO stack_fold_sqrtsd_int - -define float @stack_fold_sqrtss(float %a0) { - ;CHECK-LABEL: stack_fold_sqrtss - ;CHECK: vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call float @llvm.sqrt.f32(float %a0) - ret float %2 -} -declare float @llvm.sqrt.f32(float) nounwind readnone - +; TODO stack_fold_sqrtss ; TODO stack_fold_sqrtss_int define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll 
b/test/CodeGen/X86/twoaddr-coalesce-2.ll index cbcde0655597..9da071f7ede6 100644 --- a/test/CodeGen/X86/twoaddr-coalesce-2.ll +++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \ -; RUN: grep "twoaddrinstr" | grep "Number of instructions aggressively commuted" +; RUN: grep "twoaddressinstruction" | grep "Number of instructions aggressively commuted" ; rdar://6480363 target triple = "i386-apple-darwin9.6" diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll index f737ea2b7fba..4d183f3172b3 100644 --- a/test/CodeGen/X86/vector-narrow-binop.ll +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -22,17 +22,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d ; ; AVX1-LABEL: PR32790: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR32790: @@ -60,46 +60,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; SSE-LABEL: do_not_use_256bit_op: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: do_not_use_256bit_op: -; AVX1: # BB#0: -; AVX1-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: do_not_use_256bit_op: -; AVX2: # BB#0: -; AVX2-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: do_not_use_256bit_op: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: do_not_use_256bit_op: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; 
AVX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %and = and <8 x i32> %concat1, %concat2 diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index 27909c6bb4a0..adda108bdc77 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -81,19 +82,41 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } @@ -193,23 +216,49 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index 7a675619d720..accbad35e9d7 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -39,6 +40,13 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; 
AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out } @@ -92,6 +100,13 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } @@ -137,6 +152,21 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out } @@ -173,6 +203,18 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index cf4f21e62b61..aa50206e7a5e 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> 
%in) nounwind { ; AVX512F-LABEL: testv8i64: @@ -39,6 +40,11 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in) ret <8 x i64> %out } @@ -92,6 +98,11 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) ret <16 x i32> %out } @@ -135,6 +146,30 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out } @@ -169,6 +204,24 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; 
AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index fa3471c2fe40..2e65bd8c75c7 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -282,8 +282,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # BB#0: -; ALL-NEXT: vmovups (%rsi), %zmm0 -; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovups 32(%rsi), %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 5aab21749d14..706edd27a3f1 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -511,11 +511,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL64-LABEL: expand14: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; @@ -529,11 +528,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL32-LABEL: expand14: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , @@ -545,39 +543,35 @@ define <8 x float> @expand14(<4 x float> %a) { define <8 x float> @expand15(<4 x float> %a) { ; SKX64-LABEL: expand15: ; SKX64: # BB#0: -; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX64-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX64-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX64-NEXT: retq ; ; KNL64-LABEL: expand15: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; ; SKX32-LABEL: expand15: ; SKX32: # BB#0: -; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; 
SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX32-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX32-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX32-NEXT: retl ; ; KNL32-LABEL: expand15: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..13088b7fa5f2 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -5,8 +5,10 @@ define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtd2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1 +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq entry: @@ -27,10 +29,14 @@ declare double @sqrt(double) local_unnamed_addr #1 define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtf4: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index a5fac9ac6a41..d4fbb72bbe6d 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -3030,10 +3030,10 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_and_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -3786,10 +3786,10 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_xor_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor 
%xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -4542,10 +4542,10 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_or_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 22d0065b264f..a22a60756264 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -7,6 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 @@ -117,6 +118,17 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -284,6 +296,17 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -501,6 +524,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; 
AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -700,6 +735,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -843,6 +890,25 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -984,6 +1050,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1106,6 +1191,22 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; 
AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1224,6 +1325,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1258,6 +1375,12 @@ define <2 x i64> @foldv2i64() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1280,6 +1403,12 @@ define <2 x i64> @foldv2i64u() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1300,6 +1429,11 @@ define <4 x i32> @foldv4i32() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1319,6 +1453,11 @@ define <4 x i32> @foldv4i32u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1338,6 +1477,11 @@ define <8 x i16> @foldv8i16() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; 
; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1357,6 +1501,11 @@ define <8 x i16> @foldv8i16u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1376,6 +1525,11 @@ define <16 x i8> @foldv16i8() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] @@ -1395,6 +1549,11 @@ define <16 x i8> @foldv16i8u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index a0b277ddd732..101ae95550e7 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 @@ -12,11 +13,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -28,6 +26,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -92,6 +92,17 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -117,11 +128,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -133,6 +141,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -182,6 +192,17 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # 
BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -205,28 +226,27 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -234,12 +254,12 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32: @@ -307,6 +327,17 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32: ; 
X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -335,28 +366,27 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -364,12 +394,12 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32u: @@ -414,6 +444,17 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: 
testv8i32u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -442,32 +483,31 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -532,6 +572,25 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, 
%ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -557,32 +616,31 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -647,6 +705,25 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: 
vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -674,27 +751,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -747,6 +823,22 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -771,27 +863,26 @@ define <32 x i8> 
@testv32i8u(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -844,6 +935,22 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 2d1715949a5e..abbe964e983c 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD ; RUN: 
llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: @@ -64,6 +65,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) ret <8 x i64> %out } @@ -105,6 +115,15 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) ret <8 x i64> %out } @@ -186,6 +205,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0) ret <16 x i32> %out } @@ -231,6 +259,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1) ret <16 x i32> %out } @@ -305,6 +342,38 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; 
AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out } @@ -379,6 +448,38 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out } @@ -441,6 +542,32 @@ define <64 x i8> @testv64i8(<64 x i8> 
%in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out } @@ -503,6 +630,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll index fbaf500e8333..b5c7f86567a1 100644 --- 
a/test/CodeGen/X86/wide-integer-cmp.ll +++ b/test/CodeGen/X86/wide-integer-cmp.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-linux-gnu %s -o - | FileCheck %s - define i32 @branch_eq(i64 %a, i64 %b) { ; CHECK-LABEL: branch_eq: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll index 6b2e4de5cdaa..42c4c23c6349 100644 --- a/test/CodeGen/X86/widened-broadcast.ll +++ b/test/CodeGen/X86/widened-broadcast.ll @@ -151,8 +151,7 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl ; ; AVX1-LABEL: load_splat_8i32_8i32_01010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovapd (%rdi), %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -288,8 +287,7 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou ; ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -315,22 +313,10 @@ define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nou ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_16i16_16i16_0123012301230123: +; AVX: # BB#0: # %entry +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq entry: %ld = load <16 x i16>, <16 x i16>* %ptr %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> @@ -513,8 +499,7 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8 ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -587,26 +572,10 @@ define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtabl ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_4f32_8f32_0000: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_4f32_8f32_0000: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_4f32_8f32_0000: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_4f32_8f32_0000: +; AVX: # BB#0: # %entry +; AVX-NEXT: 
vbroadcastss (%rdi), %xmm0
+; AVX-NEXT: retq
entry:
  %ld = load <8 x float>, <8 x float>* %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
@@ -627,22 +596,10 @@ define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind
; SSE42-NEXT: movapd %xmm0, %xmm1
; SSE42-NEXT: retq
;
-; AVX1-LABEL: load_splat_8f32_16f32_89898989:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_8f32_16f32_89898989:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_8f32_16f32_89898989:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovapd (%rdi), %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_8f32_16f32_89898989:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0
+; AVX-NEXT: retq
entry:
  %ld = load <16 x float>, <16 x float>* %ptr
  %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32>
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..f4d0503f4a79 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -57,10 +57,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX1: # BB#0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; AVX1-NEXT: vmovups 96(%rdi), %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -69,10 +67,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX2: # BB#0:
; AVX2-NEXT: vmovupd (%rdi), %ymm0
; AVX2-NEXT: vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT: vmovupd 64(%rdi), %ymm2
-; AVX2-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
index 683d7b05cf8c..9bc654861b69 100644
--- a/test/CodeGen/X86/x87.ll
+++ b/test/CodeGen/X86/x87.ll
@@ -1,13 +1,16 @@
; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
; X87-LABEL: test:
; NOX87-LABEL: test:
+
+; NOX87-NOT: {{ }}f{{.*}}
+
; X87: fild
; NOX87: __floatunsisf
  %tmp = uitofp i32 %i to float
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index aed49f4b67ba..d214c40dd9b9 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -6,7 +6,7 @@
; When using FP, for large or small frames, we may need one scratch register.
; FP + small frame: spill FP+SR = entsp 2
-; CHECKFP-LABEL: f1
+; CHECKFP-LABEL: f1:
; CHECKFP: entsp 2
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -15,7 +15,7 @@
; CHECKFP-NEXT: retsp 2
;
; !FP + small frame: no spills = no stack adjustment needed
-; CHECK-LABEL: f1
+; CHECK-LABEL: f1:
; CHECK: stw lr, sp[0]
; CHECK: ldw lr, sp[0]
; CHECK-NEXT: retsp 0
@@ -27,7 +27,7 @@ entry:
; FP + small frame: spill FP+SR+R0+LR = entsp 3 + extsp 1
-; CHECKFP-LABEL:f3
+; CHECKFP-LABEL: f3:
; CHECKFP: entsp 3
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -43,7 +43,7 @@ entry:
; CHECKFP-NEXT: retsp 3
;
; !FP + small frame: spill R0+LR = entsp 2
-; CHECK-LABEL: f3
+; CHECK-LABEL: f3:
; CHECK: entsp 2
; CHECK-NEXT: stw [[REG:r[4-9]+]], sp[1]
; CHECK-NEXT: mov [[REG]], r0
@@ -60,7 +60,7 @@ entry:
; FP + large frame: spill FP+SR = entsp 2 + 100000
-; CHECKFP-LABEL: f4
+; CHECKFP-LABEL: f4:
; CHECKFP: entsp 65535
; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -81,7 +81,7 @@ entry:
; CHECKFP-NEXT: retsp 34467
;
; !FP + large frame: spill SR+SR = entsp 2 + 100000
-; CHECK-LABEL: f4
+; CHECK-LABEL: f4:
; CHECK: entsp 65535
; CHECK-NEXT: .Lcfi{{[0-9]+}}
; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -107,7 +107,7 @@ entry:
; CHECKFP-NEXT: .LCPI[[CNST1:[0-9_]+]]:
; CHECKFP-NEXT: .long 200001
; CHECKFP-NEXT: .text
-; CHECKFP-LABEL: f6
+; CHECKFP-LABEL: f6:
; CHECKFP: entsp 65535
; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -160,7 +160,7 @@ entry:
; CHECK-NEXT: .LCPI[[CNST1:[0-9_]+]]:
; CHECK-NEXT: .long 200002
; CHECK-NEXT: .text
-; CHECK-LABEL: f6
+; CHECK-LABEL: f6:
; CHECK: entsp 65535
; CHECK-NEXT: .Lcfi{{[0-9]+}}
; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -207,7 +207,7 @@ entry:
}
; FP + large frame: spill FP+SR+LR = entsp 2 + 256 + extsp 1
-; CHECKFP-LABEL:f8
+; CHECKFP-LABEL: f8:
; CHECKFP: entsp 258
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -221,7 +221,7 @@ entry:
; CHECKFP-NEXT: retsp 258
;
; !FP + large frame: spill SR+SR+LR = entsp 3 + 256
-; CHECK-LABEL:f8
+; CHECK-LABEL: f8:
; CHECK: entsp 257
; CHECK-NEXT: ldaw r0, sp[254]
; CHECK-NEXT: bl f5
@@ -235,7 +235,7 @@ entry:
}
; FP + large frame: spill FP+SR+LR = entsp 2 + 32768 + extsp 1
-; CHECKFP-LABEL:f9
+; CHECKFP-LABEL: f9:
; CHECKFP: entsp 32770
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -249,7 +249,7 @@ entry:
; CHECKFP-NEXT: retsp 32770
;
; !FP + large frame: spill SR+SR+LR = entsp 3 + 32768
-; CHECK-LABEL:f9
+; CHECK-LABEL: f9:
; CHECK: entsp 32771
; CHECK-NEXT: ldaw r0, sp[32768]
; CHECK-NEXT: bl f5
diff --git a/test/DebugInfo/Generic/empty.ll b/test/DebugInfo/Generic/empty.ll
index d5f738fa0271..79912841fa6d 100644
--- a/test/DebugInfo/Generic/empty.ll
+++ b/test/DebugInfo/Generic/empty.ll
@@ -13,10 +13,9 @@
; CHECK-NOT: file_names[
; CHECK: .debug_pubnames contents:
-; CHECK-NOT: Offset
+; CHECK-NOT: {{^}}0x
-; CHECK: .debug_pubtypes contents:
-; CHECK-NOT: Offset
+; CHECK: contents:
; Don't emit DW_AT_addr_base when there are no addresses.
; FISSION-NOT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]
@@ -24,8 +23,10 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !6, globals: !2)
!2 = !{}
!3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!5 = !{i32 1, !"Debug Info Version", i32 3}
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/nodebug.ll b/test/DebugInfo/Generic/nodebug.ll
index f85b00bf9f7e..9b0eb9b4dd07 100644
--- a/test/DebugInfo/Generic/nodebug.ll
+++ b/test/DebugInfo/Generic/nodebug.ll
@@ -1,6 +1,6 @@
; REQUIRES: object-emission
-; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s
; Test that a nodebug function (a function not appearing in the debug info IR
; metadata subprogram list) with DebugLocs on its IR doesn't cause crashes/does
@@ -17,9 +17,16 @@
; }
; Check that there's no DW_TAG_subprogram, not even for the 'f2' function.
+; CHECK: .debug_info contents:
; CHECK: DW_TAG_compile_unit
; CHECK-NOT: DW_TAG_subprogram
+; Expect no line table entry since there are no functions and file references in this compile unit
+; CHECK: .debug_line contents:
+; CHECK: Line table prologue:
+; CHECK: total_length: 0x00000019
+; CHECK-NOT: file_names[
+
@i = external global i32
; Function Attrs: uwtable
@@ -35,7 +42,7 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
!llvm.module.flags = !{!8, !9}
!llvm.ident = !{!10}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !12, globals: !2, imports: !2)
!1 = !DIFile(filename: "nodebug.cpp", directory: "/tmp/dbginfo")
!2 = !{}
!4 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
@@ -46,3 +53,5 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
!9 = !{i32 2, !"Debug Info Version", i32 3}
!10 = !{!"clang version 3.5.0 "}
!11 = !DILocation(line: 3, scope: !4)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/skeletoncu.ll b/test/DebugInfo/Generic/skeletoncu.ll
index 6d91afd0fa79..b9761b2ab565 100644
--- a/test/DebugInfo/Generic/skeletoncu.ll
+++ b/test/DebugInfo/Generic/skeletoncu.ll
@@ -7,9 +7,11 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind:
FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, dwoId: 43981) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2, dwoId: 43981) !1 = !DIFile(filename: "", directory: "/") !2 = !{} !3 = !{i32 2, !"Dwarf Version", i32 4} !4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.cpp b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp new file mode 100644 index 000000000000..b07a1537d6bf --- /dev/null +++ b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp @@ -0,0 +1,12 @@ +void f1(); +__attribute__((always_inline)) void f2() { + f1(); +} +void f3() { + f2(); +} + +To produce split-dwarf-dwp.o{,dwp}, run: + + $ clang++ split-dwarf-dwp.cpp -gsplit-dwarf -c -Xclang -fdebug-compilation-dir=Output -fno-split-dwarf-inlining + $ llvm-dwp split-dwarf-dwp.dwo -o split-dwarf-dwp.o.dwp diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o b/test/DebugInfo/Inputs/split-dwarf-dwp.o new file mode 100644 index 0000000000000000000000000000000000000000..614c62040dec63fc13b815d42ca1ea408e9cbcfa GIT binary patch literal 2744 zcmbtV&1(};5TDJqrmbyEYrmv|3yQR2W7<%x7mf8RV9_GIArgv}sa& zZw0R&Ja|;GcoanN>fhi^@TlhsIV2dBK5dB6F3^WHSOb!G8t%rJn-fb$UT z7zLQ`j`VDxW?>XYU^(~v`w!aR{nDsc%Kmh5!}dzZ>#5>K4Q@95deaAE5Rzl>7ZMYR zorU;^g|R_n=m5r?#??i^(6guJ@axuY95%KBlVAts4eLiyi3OW9@*}uPvv7<=CP1N3 zvE5ax<#>&9&9&xI=~UX9^u4CLZe`M$xpSE*YqC;lRnJJ+>=echhw)%MyamQ)(wOLv z_iW=KsUj_+;7At}$!D=k1Jn*t4p$6LAvcWO!~pW(axu~X^@0lZI)woAhCQG*+HLmb zx{W)r7)A!ReHarmy9O#O1nrkVKCZAnh9}f7pW%n zL>oyn!<&_mP&Hwo10%qM#6I|;4txZWWCt3)>5z19?QzpR+RE+O(o73TWLjY|zA1W1 zI?eAtA@N-FAr_8$=-(d-AY8s3J{#fkYwQho2CkBU(k$E!AV@9h)+j*=iHL(%4Hd=)La{(p-8`TnSX z`n6X3e~Orz6CwCS49N<3#8ltMXwucU2t?vJqxv+hf7Twr9MAXi88O{amjt+oc9~S) zk7(4@U((d4ud-U7@~OR3y;q{p6Cvq4L*Fwso@Ppn8fp<)0z4i;fu^3H&ZLW96X*Z> GkNCeP(yvbd literal 0 HcmV?d00001 diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp b/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp new file mode 100644 index 0000000000000000000000000000000000000000..16a0af8c062f3c5c18997352081cefe736489fad GIT binary patch literal 1256 zcmbtTO>fgc5S_J?v?2m2w1-v#$r3_p510@uhZf0Ll|Vv(#0j-rtbEwT!nG~iaoSTM zCw>S&0R9F11y1~rUU*}6)n(HgPkOsEZ{E%-v(Dx7*Dov~NI{TK*O^j@K3Lar(1ros zf^&?x)8?CkM2D>Hug%p4EiWy*BP?N4A4T!5+Y{Z+E{9!Sdf-5OJ6ibMlUisgT zHi#^jvXzhQu>Z9GY}dhA+ZBV){@q@%qgQa}eR{IhR*w58e8+9jA~Yc(uI}I`R7IN9 zkLJrs8DlNn`7)}C*CMv5W&z!M4bcfDfT`sbZjyLqi2j0H3m!{3xV@(_FueF*L z1%>ZN@u;DA)J7g64|fx7gRj64jKKsfK%O5DiigXwniIknYUsS%AbT;UH5@>B3l6}0 zYt?nZok!EfX*gSiNj8tpN#>L3w2Yc|C{n5FGR)(+it5Uc^)$)iT+h=ai~h;;+20iH z>dBPc>f(&||HU~P@K{XzmUH`J6x2AC%+FwITnl))0jBpQJf_CS0$u=q79FOSpUc$P ze<2U?lj<9I_OXiiKtMh7qfVzrx>l-PWpQ7pI+|;;F cBE@69=?&gb(D-j-;)guHBMa)DO_?kG50jaCX#fBK literal 0 HcmV?d00001 diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir new file mode 100644 index 000000000000..1766a8f44616 --- /dev/null +++ b/test/DebugInfo/MIR/X86/empty-inline.mir @@ -0,0 +1,122 @@ +# RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# +# This testcase has an implicit def pseudo-iunstruction with a debug location. 
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_subprogram
+# CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_specification {{.*}} "_ZN1C5m_fn3Ev"
+# CHECK-NOT: DW_TAG
+# Here should not be an inlined subroutine with 0 length.
+# CHECK: NULL
+#
+# CHECK: Address Line Column File ISA Discriminator Flags
+# CHECK-NEXT: ---
+# CHECK-NEXT: 25 0 1 0 0 is_stmt
+# CHECK-NEXT: 29 28 1 0 0 is_stmt prologue_end
+# CHECK-NEXT: 29 28 1 0 0 is_stmt end_sequence
+--- |
+  source_filename = "t.ll"
+  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-apple-macosx"
+
+  %class.E = type { %class.D }
+  %class.D = type { %class.B }
+  %class.B = type { %class.A, %class.A }
+  %class.A = type { i8 }
+  %class.C = type <{ %class.E*, %class.B, [2 x i8] }>
+
+  @a = local_unnamed_addr global %class.E* null, align 4
+
+  define i32 @_ZN1C5m_fn3Ev(%class.C* nocapture) local_unnamed_addr align 2 !dbg !6 {
+    %2 = alloca %class.B, align 1
+    %3 = load %class.E*, %class.E** @a, align 4
+    %4 = icmp eq %class.E* %3, null
+    br i1 %4, label %10, label %5
+
+  ;