From 97a7b8a20a989eb4cf3d9465e1451de6cd05fa41 Mon Sep 17 00:00:00 2001 From: dim <dim@FreeBSD.org> Date: Sat, 13 Feb 2016 14:57:10 +0000 Subject: [PATCH] Vendor import of llvm release_38 branch r260756: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260756 --- cmake/modules/AddLLVM.cmake | 36 +- cmake/modules/LLVM-Config.cmake | 11 +- docs/ReleaseNotes.rst | 146 ++++++- include/llvm/IR/IntrinsicsPowerPC.td | 2 +- include/llvm/IR/Value.h | 4 - lib/Analysis/DemandedBits.cpp | 7 - .../AsmPrinter/AsmPrinterInlineAsm.cpp | 5 + lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 55 ++- lib/IR/Value.cpp | 4 +- lib/Target/AArch64/AArch64.td | 4 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 3 + lib/Target/AArch64/AArch64SchedM1.td | 359 ++++++++++++++++++ lib/Target/AMDGPU/AMDGPU.td | 3 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +- lib/Target/AMDGPU/Processors.td | 12 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 3 + lib/Target/ARM/ARMISelDAGToDAG.cpp | 4 +- lib/Target/PowerPC/PPCFastISel.cpp | 18 +- lib/Target/PowerPC/PPCInstrAltivec.td | 2 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 65 ++-- .../InstCombine/InstCombineCompares.cpp | 2 +- .../InstCombineLoadStoreAlloca.cpp | 3 +- .../InstCombine/InstCombineVectorOps.cpp | 18 +- lib/Transforms/Utils/SimplifyCFG.cpp | 12 + test/Analysis/DemandedBits/basic.ll | 31 -- test/CodeGen/AArch64/fp16-v4-instructions.ll | 274 +++++++++++++ test/CodeGen/AArch64/fp16-v8-instructions.ll | 84 ++++ test/CodeGen/AMDGPU/hsa-note-no-func.ll | 2 + test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 1 + test/CodeGen/AMDGPU/spill-scavenge-offset.ll | 33 ++ test/CodeGen/ARM/shifter_operand.ll | 17 + test/CodeGen/PowerPC/fast-isel-ret.ll | 9 + test/CodeGen/PowerPC/inline-asm-s-modifier.ll | 10 + test/CodeGen/PowerPC/pr26193.ll | 9 + test/CodeGen/PowerPC/pr26356.ll | 136 +++++++ test/CodeGen/PowerPC/pr26381.ll | 8 + test/CodeGen/SystemZ/int-cmp-53.ll | 26 ++ 
.../X86/avx512-gather-scatter-intrin.ll | 63 ++- test/CodeGen/X86/setcc-lowering.ll | 79 +++- test/DebugInfo/X86/PR26148.ll | 102 +++++ test/Transforms/InstCombine/icmp.ll | 12 + .../InstCombine/insert-extract-shuffle.ll | 30 ++ test/Transforms/InstCombine/unpack-fca.ll | 15 + .../AArch64/loop-vectorization-factors.ll | 34 -- .../SimplifyCFG/X86/switch_to_lookup_table.ll | 32 ++ tools/CMakeLists.txt | 2 +- utils/release/test-release.sh | 26 +- utils/unittest/CMakeLists.txt | 7 +- utils/unittest/UnitTestMain/CMakeLists.txt | 4 +- 51 files changed, 1610 insertions(+), 228 deletions(-) create mode 100644 lib/Target/AArch64/AArch64SchedM1.td create mode 100644 test/CodeGen/AMDGPU/spill-scavenge-offset.ll create mode 100644 test/CodeGen/PowerPC/inline-asm-s-modifier.ll create mode 100644 test/CodeGen/PowerPC/pr26193.ll create mode 100644 test/CodeGen/PowerPC/pr26356.ll create mode 100644 test/CodeGen/PowerPC/pr26381.ll create mode 100644 test/CodeGen/SystemZ/int-cmp-53.ll create mode 100644 test/DebugInfo/X86/PR26148.ll diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index b06e434a2487..a829751eca8f 100755 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -468,20 +468,23 @@ function(llvm_add_library name) endif() endif() - # Add the explicit dependency information for this library. - # - # It would be nice to verify that we have the dependencies for this library - # name, but using get_property(... SET) doesn't suffice to determine if a - # property has been set to an empty value. 
- get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) - - if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_STATIC AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) + if (DEFINED LLVM_LINK_COMPONENTS OR DEFINED ARG_LINK_COMPONENTS) + if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) + set(llvm_libs LLVM) + else() + llvm_map_components_to_libnames(llvm_libs + ${ARG_LINK_COMPONENTS} + ${LLVM_LINK_COMPONENTS} + ) + endif() else() - llvm_map_components_to_libnames(llvm_libs - ${ARG_LINK_COMPONENTS} - ${LLVM_LINK_COMPONENTS} - ) + # Components have not been defined explicitly in CMake, so add the + # dependency information for this library as defined by LLVMBuild. + # + # It would be nice to verify that we have the dependencies for this library + # name, but using get_property(... SET) doesn't suffice to determine if a + # property has been set to an empty value. + get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name}) endif() if(CMAKE_VERSION VERSION_LESS 2.8.12) @@ -882,14 +885,11 @@ function(add_unittest test_suite test_name) set(LLVM_REQUIRES_RTTI OFF) + list(APPEND LLVM_LINK_COMPONENTS Support) # gtest needs it for raw_ostream add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN}) set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}) set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir}) - target_link_libraries(${test_name} - gtest - gtest_main - LLVMSupport # gtest needs it for raw_ostream. - ) + target_link_libraries(${test_name} gtest_main gtest) add_dependencies(${test_suite} ${test_name}) get_target_property(test_suite_folder ${test_suite} FOLDER) diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake index aa68b4007602..725178ab57b1 100755 --- a/cmake/modules/LLVM-Config.cmake +++ b/cmake/modules/LLVM-Config.cmake @@ -40,10 +40,19 @@ macro(llvm_config executable) # done in case libLLVM does not contain all of the components # the target requires. 
# - # TODO strip LLVM_DYLIB_COMPONENTS out of link_components. + # Strip LLVM_DYLIB_COMPONENTS out of link_components. # To do this, we need special handling for "all", since that # may imply linking to libraries that are not included in # libLLVM. + + if (DEFINED link_components AND DEFINED LLVM_DYLIB_COMPONENTS) + if("${LLVM_DYLIB_COMPONENTS}" STREQUAL "all") + set(link_components "") + else() + list(REMOVE_ITEM link_components ${LLVM_DYLIB_COMPONENTS}) + endif() + endif() + target_link_libraries(${executable} LLVM) endif() diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index dccb7f4d1cce..7b284d59656b 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -5,11 +5,6 @@ LLVM 3.8 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 3.8 release. You may - prefer the `LLVM 3.7 Release Notes <http://llvm.org/releases/3.7.0/docs - /ReleaseNotes.html>`_. - Introduction ============ @@ -26,11 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send them. -Note that if you are reading this file from a Subversion checkout or the main -LLVM web page, this document applies to the *next* release, not the current -one. To see the release notes for a specific release, please see the `releases -page <http://llvm.org/releases/>`_. - Non-comprehensive list of changes in this release ================================================= * With this release, the minimum Windows version required for running LLVM is @@ -79,6 +69,26 @@ Non-comprehensive list of changes in this release * Support for dematerializing has been dropped. +* RegisterScheduler::setDefault was removed. Targets that used to call into the + command line parser to set the DAGScheduler, and that don't have enough + control with setSchedulingPreference, should look into overriding the + SubTargetHook "getDAGScheduler()". 
+ +* ``ilist_iterator<T>`` no longer has implicit conversions to and from ``T*``, + since ``ilist_iterator<T>`` may be pointing at the sentinel (which is usually + not of type ``T`` at all). To convert from an iterator ``I`` to a pointer, + use ``&*I``; to convert from a pointer ``P`` to an iterator, use + ``P->getIterator()``. Alternatively, explicit conversions via + ``static_cast<T>(U)`` are still available. + +* ``ilist_node<T>::getNextNode()`` and ``ilist_node<T>::getPrevNode()`` now + fail at compile time when the node cannot access its parent list. + Previously, when the sentinel was an ``ilist_half_node<T>``, this API + could return the sentinel instead of ``nullptr``. Frustrated callers should + be updated to use ``iplist<T>::getNextNode(T*)`` instead. Alternatively, if + the node ``N`` is guaranteed not to be the last in the list, it is safe to + call ``&*++N->getIterator()`` directly. + .. NOTE For small 1-3 sentence descriptions, just add an entry at the end of this list. If your description won't fit comfortably in one bullet @@ -98,17 +108,97 @@ Non-comprehensive list of changes in this release Makes programs 10x faster by doing Special New Thing. -Changes to the ARM Backend --------------------------- - During this release ... +Changes to the ARM Backends +--------------------------- + +During this release the AArch64 target has: + +* Added support for more sanitizers (MSAN, TSAN) and made them compatible with + all VMA kernel configurations (currently tested on 39 and 42 bits). +* Gained initial LLD support in the new ELF back-end +* Extended the Load/Store optimiser and cleaned up some of the bad decisions + made earlier. +* Expanded LLDB support, including watchpoints, native building, Renderscript, + LLDB-server, debugging 32-bit applications. +* Added support for the ``Exynos M1`` chip. 
+ +During this release the ARM target has: + +* Gained massive performance improvements on embedded benchmarks due to finally + running the stride vectorizer in full form, incrementing the performance gains + that we already had in the previous releases with limited stride vectorization. +* Expanded LLDB support, including watchpoints, unwind tables +* Extended the Load/Store optimiser and cleaned up some of the bad decisions + made earlier. +* Simplified code generation for global variable addresses in ELF, resulting in + a significant (4% in Chromium) reduction in code size. +* Gained some additional code size improvements, though there's still a long road + ahead, especially for older cores. +* Added some EABI floating point comparison functions to Compiler-RT +* Added support for Windows+GNU triple, +features in -mcpu/-march options. Changes to the MIPS Target -------------------------- - During this release ... +During this release the MIPS target has: +* Significantly extended support for the Integrated Assembler. See below for + more information +* Added support for the ``P5600`` processor. +* Added support for the ``interrupt`` attribute for MIPS32R2 and later. This + attribute will generate a function which can be used as an interrupt handler + on bare metal MIPS targets using the static relocation model. +* Added support for the ``ERETNC`` instruction found in MIPS32R5 and later. +* Added support for OpenCL. See http://portablecl.org/. + + * Address spaces 1 to 255 are now reserved for software use and conversions + between them are no-op casts. + +* Removed the ``mips16`` value for the -mcpu option since it is an :abbr:`ASE + (Application Specific Extension)` and not a processor. If you were using this, + please specify another CPU and use ``-mips16`` to enable MIPS16. +* Removed ``copy_u.w`` from 32-bit MSA and ``copy_u.d`` from 64-bit MSA since + they have been removed from the MSA specification due to forward compatibility + issues. 
For example, 32-bit MSA code containing ``copy_u.w`` would behave + differently on a 64-bit processor supporting MSA. The corresponding intrinsics + are still available and may expand to ``copy_s.[wd]`` where this is + appropriate for forward compatibility purposes. +* Relaxed the ``-mnan`` option to allow ``-mnan=2008`` on MIPS32R2/MIPS64R2 for + compatibility with GCC. +* Made MIPS64R6 the default CPU for 64-bit Android triples. + +The MIPS target has also fixed various bugs including the following notable +fixes: + +* Fixed reversed operands on ``mthi``/``mtlo`` in the DSP :abbr:`ASE + (Application Specific Extension)`. +* The code generator no longer uses ``jal`` for calls to absolute immediate + addresses. +* Disabled fast instruction selection on MIPS32R6 and MIPS64R6 since this is not + yet supported. +* Corrected addend for ``R_MIPS_HI16`` and ``R_MIPS_PCHI16`` in MCJIT +* The code generator no longer crashes when handling subregisters of a 64-bit + FPU register with undefined value. +* The code generator no longer attempts to use ``$zero`` for operands that do + not permit ``$zero``. +* Corrected the opcode used for ``ll``/``sc`` when using MIPS32R6/MIPS64R6 and + the Integrated Assembler. +* Added support for atomic load and atomic store. +* Corrected debug info when dynamically re-aligning the stack. + +Integrated Assembler +^^^^^^^^^^^^^^^^^^^^ +We have made a large number of improvements to the integrated assembler for +MIPS. In this release, the integrated assembler isn't quite production-ready +since there are a few known issues related to bare-metal support, checking +immediates on instructions, and the N32/N64 ABIs. However, the current support +should be sufficient for many users of the O32 ABI, particularly those targeting +MIPS32 on Linux or bare-metal MIPS32. + +If you would like to try the integrated assembler, please use +``-fintegrated-as``. 
Changes to the PowerPC Target ----------------------------- @@ -123,6 +213,20 @@ Changes to the X86 Target * TLS is enabled for Cygwin as emutls. +* Smaller code for materializing 32-bit 1 and -1 constants at ``-Os``. + +* More efficient code for wide integer compares. (E.g. 64-bit compares + on 32-bit targets.) + +* Tail call support for ``thiscall``, ``stdcall``, ``vectorcall``, and + ``fastcall`` functions. + +Changes to the AVR Target +------------------------- + +Slightly less than half of the AVR backend has been merged in at this point. It is still +missing a number of large parts which cause it to be unusable, but is well on the +road to being completely merged and workable. Changes to the OCaml bindings ----------------------------- @@ -140,7 +244,19 @@ An exciting aspect of LLVM is that it is used as an enabling technology for a lot of other language and tools projects. This section lists some of the projects that have already been updated to work with LLVM 3.8. -* A project +LDC - the LLVM-based D compiler +------------------------------- + +`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X and Windows and also PowerPC (32/64 bit) +and ARM. Ports to other architectures like AArch64 and MIPS64 are underway. 
Additional Information diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td index 06dfc329fe32..5512b1063fb0 100644 --- a/include/llvm/IR/IntrinsicsPowerPC.td +++ b/include/llvm/IR/IntrinsicsPowerPC.td @@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.". Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">, - Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty], + Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index bb7ff278fdef..8918dcd38c93 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -280,11 +280,7 @@ public: // when using them since you might not get all uses. // The methods that don't start with materialized_ assert that modules is // fully materialized. -#ifdef NDEBUG - void assertModuleIsMaterialized() const {} -#else void assertModuleIsMaterialized() const; -#endif bool use_empty() const { assertModuleIsMaterialized(); diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 143d0b79f188..6f92ba6289a4 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandBits( if (OperandNo != 0) AB = AOut; break; - case Instruction::ICmp: - // Count the number of leading zeroes in each operand. 
- ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1)); - auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(), - KnownZero2.countLeadingOnes()); - AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes); - break; } } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 4171657b5285..5633aa4a5655 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return true; O << -MO.getImm(); return false; + case 's': // The GCC deprecated s modifier + if (MO.getType() != MachineOperand::MO_Immediate) + return true; + O << ((32 - MO.getImm()) & 31); + return false; } } return true; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ae62b6b19a42..f56c8e492e52 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) { llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!"); } -/// Determine whether two variable pieces overlap. -static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) { - if (!P1->isBitPiece() || !P2->isBitPiece()) - return true; +// Determine the relative position of the pieces described by P1 and P2. +// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap, +// 1 if P1 is entirely after P2. +static int pieceCmp(const DIExpression *P1, const DIExpression *P2) { unsigned l1 = P1->getBitPieceOffset(); unsigned l2 = P2->getBitPieceOffset(); unsigned r1 = l1 + P1->getBitPieceSize(); unsigned r2 = l2 + P2->getBitPieceSize(); - // True where [l1,r1[ and [r1,r2[ overlap. 
- return (l1 < r2) && (l2 < r1); + if (r1 <= l2) + return -1; + else if (r2 <= l1) + return 1; + else + return 0; +} + +/// Determine whether two variable pieces overlap. +static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) { + if (!P1->isBitPiece() || !P2->isBitPiece()) + return true; + return pieceCmp(P1, P2) == 0; } /// \brief If this and Next are describing different pieces of the same @@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) { /// Return true if the merge was successful. bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) { if (Begin == Next.Begin) { - auto *Expr = cast_or_null<DIExpression>(Values[0].Expression); - auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression); - if (Expr->isBitPiece() && NextExpr->isBitPiece() && - !piecesOverlap(Expr, NextExpr)) { - addValues(Next.Values); - End = Next.End; - return true; + auto *FirstExpr = cast<DIExpression>(Values[0].Expression); + auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression); + if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece()) + return false; + + // We can only merge entries if none of the pieces overlap any others. + // In doing so, we can take advantage of the fact that both lists are + // sorted. + for (unsigned i = 0, j = 0; i < Values.size(); ++i) { + for (; j < Next.Values.size(); ++j) { + int res = pieceCmp(cast<DIExpression>(Values[i].Expression), + cast<DIExpression>(Next.Values[j].Expression)); + if (res == 0) // The two expressions overlap, we can't merge. + return false; + // Values[i] is entirely before Next.Values[j], + // so go back to the next entry of Values. + else if (res == -1) + break; + // Next.Values[j] is entirely before Values[i], so go on to the + // next entry of Next.Values. 
+ } } + + addValues(Next.Values); + End = Next.End; + return true; } return false; } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index eb9deb6a07e1..4d224a041349 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -313,8 +313,8 @@ void Value::takeName(Value *V) { ST->reinsertValue(this); } -#ifndef NDEBUG void Value::assertModuleIsMaterialized() const { +#ifndef NDEBUG const GlobalValue *GV = dyn_cast<GlobalValue>(this); if (!GV) return; @@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized() const { if (!M) return; assert(M->isMaterialized()); +#endif } +#ifndef NDEBUG static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr, Constant *C) { if (!Cache.insert(Expr).second) diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 46ef2c111bae..cd3e84d38fe2 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo; include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +include "AArch64SchedM1.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", @@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; -// FIXME: Exynos-M1 is currently modelled without a specific SchedModel. 
-def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; +def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 9b73c5e9d952..92cf1cd71970 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } + if (LHS.getValueType().getVectorElementType() == MVT::f16) + return SDValue(); + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td new file mode 100644 index 000000000000..6525628dbfd6 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -0,0 +1,359 @@ +//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Samsung Exynos-M1 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M1 is a traditional superscalar microprocessor with a +// 4-wide in-order stage for decode and dispatch and a wider issue stage. +// The execution units and loads and stores are out-of-order. + +def ExynosM1Model : SchedMachineModel { + let IssueWidth = 4; // Up to 4 uops per cycle. + let MinLatency = 0; // OoO. 
+ let MicroOpBufferSize = 96; // ROB size. + let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoadLatency = 4; // Optimistic load cases. + let MispredictPenalty = 14; // Minimum branch misprediction penalty. + let CompleteModel = 0; // Use the default model otherwise. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on the Exynos-M1, +// which has 9 pipelines, each with its own queue with out-of-order dispatch. + +def M1UnitA : ProcResource<2>; // Simple integer +def M1UnitC : ProcResource<1>; // Simple and complex integer +def M1UnitB : ProcResource<2>; // Branch +def M1UnitL : ProcResource<1>; // Load +def M1UnitS : ProcResource<1>; // Store +def M1PipeF0 : ProcResource<1>; // FP #0 +def M1PipeF1 : ProcResource<1>; // FP #1 + +let Super = M1PipeF0 in { + def M1UnitFMAC : ProcResource<1>; // FP multiplication + def M1UnitFCVT : ProcResource<1>; // FP conversion + def M1UnitNAL0 : ProcResource<1>; // Simple vector. + def M1UnitNMISC : ProcResource<1>; // Miscellanea + def M1UnitNCRYPT : ProcResource<1>; // Cryptographic +} + +let Super = M1PipeF1 in { + def M1UnitFADD : ProcResource<1>; // Simple FP + let BufferSize = 1 in + def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized) + def M1UnitNAL1 : ProcResource<1>; // Simple vector. + def M1UnitFST : ProcResource<1>; // FP store +} + +let SchedModel = ExynosM1Model in { + def M1UnitALU : ProcResGroup<[M1UnitA, + M1UnitC]>; // All simple integer. + def M1UnitNALU : ProcResGroup<[M1UnitNAL0, + M1UnitNAL1]>; // All simple vector. +} + +let SchedModel = ExynosM1Model in { + +//===----------------------------------------------------------------------===// +// Coarse scheduling model for the Exynos-M1. + +// Branch instructions. +// TODO: Non-conditional direct branches take zero cycles and units. 
+def : WriteRes<WriteBr, [M1UnitB]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; } +// TODO: Branch and link is much different. + +// Arithmetic and logical integer instructions. +def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; } +// TODO: Shift over 3 and some extensions take 2 cycles. +def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; } +def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; } +def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; } + +// Move instructions. +def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; } + +// Divide and multiply instructions. +// TODO: Division blocks the divider inside C. +def : WriteRes<WriteID32, [M1UnitC]> { let Latency = 13; } +def : WriteRes<WriteID64, [M1UnitC]> { let Latency = 21; } +// TODO: Long multiplication take 5 cycles and also the ALU. +// TODO: Multiplication with accumulation can be advanced. +def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; } +// TODO: 64-bit multiplication has a throughput of 1/2. +def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; } + +// Miscellaneous instructions. +def : WriteRes<WriteExtr, [M1UnitALU, + M1UnitALU]> { let Latency = 2; } + +// TODO: The latency for the post or pre register is 1 cycle. +def : WriteRes<WriteAdr, []> { let Latency = 0; } + +// Load instructions. +def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } +// TODO: Extended address requires also the ALU. +def : WriteRes<WriteLDIdx, [M1UnitL]> { let Latency = 5; } +def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; } + +// Store instructions. +def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; } +// TODO: Extended address requires also the ALU. +def : WriteRes<WriteSTIdx, [M1UnitS]> { let Latency = 1; } +def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; } +def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; } + +// FP data instructions. 
+def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } +// TODO: FCCMP is much different. +def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; } +// TODO: DP takes longer. +def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15; } +// TODO: MACC takes longer. +def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; } + +// FP miscellaneous instructions. +// TODO: Conversion between register files is much different. +def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; } +def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; } +// TODO: Copy from FPR to GPR is much different. +def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; } + +// FP load instructions. +// TODO: ASIMD loads are much different. +def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; } + +// FP store instructions. +// TODO: ASIMD stores are much different. +def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; } + +// ASIMD FP instructions. +// TODO: Other operations are much different. +def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; } + +// Other miscellaneous instructions. +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Fast forwarding. + +// TODO: Add FP register forwarding rules. + +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +// Integer multiply-accumulate. +// TODO: The forwarding for WriteIM64 saves actually 3 cycles. +def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +//===----------------------------------------------------------------------===// +// Finer scheduling model for the Exynos-M1. 
+ +def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, + M1UnitNALU, + M1UnitFADD]> { let Latency = 9; } +def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 5; } +def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, + M1UnitFST, + M1UnitL]> { let Latency = 10; } +def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, + M1UnitFST]> { let Latency = 8; } +def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, + M1UnitFST, + M1UnitL]> { let Latency = 13; } +def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, + M1UnitFST]> { let Latency = 6; } +def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, + M1UnitFST]> { let Latency = 3; } +def M1WriteNEONI : SchedWriteRes<[M1UnitFST, + M1UnitL]> { let Latency = 9; } +def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } +def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } +// FIXME: This is the worst case, conditional branch and link. +def M1WriteBL : SchedWriteRes<[M1UnitB, + M1UnitALU]> { let Latency = 1; } +// FIXME: This is the worst case, when using LR. 
+def M1WriteBLR : SchedWriteRes<[M1UnitB, + M1UnitALU, + M1UnitALU]> { let Latency = 2; } +def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } +def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } +def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } +def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } +def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } +def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } +def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } +def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; } +def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; } +def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; } +def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; } +def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } +def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; } +def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; } +def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; } +def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; } +def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } +def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } +def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } +def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } +def M1WriteTB : SchedWriteRes<[M1UnitC, + M1UnitALU]> { let Latency = 2; } + +// Branch instructions +def : InstRW<[M1WriteB ], (instrs Bcc)>; +def : InstRW<[M1WriteBL], (instrs BL)>; +def : InstRW<[M1WriteBLR], (instrs BLR)>; +def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; +def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>; + +// Arithmetic and logical integer instructions. 
+def : InstRW<[M1WriteALU1], (instrs COPY)>; + +// Divide and multiply instructions. + +// Miscellaneous instructions. + +// Load instructions. + +// Store instructions. + +// FP data instructions. +def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; +def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>; +def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>; +def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>; +def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; +def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>; +def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>; +def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; +def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; + +// FP miscellaneous instructions. +def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; +def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; +def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; +def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; +def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; + +// FP load instructions. + +// FP store instructions. + +// ASIMD instructions. 
+def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>; +def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>; +def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>; +def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; +def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; +def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; +def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>; +def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>; + +// ASIMD FP instructions. 
+def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; +def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>; +def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; +def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>; +def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>; +def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; +def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; +def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; +def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; + +// ASIMD miscellaneous instructions. +def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; +def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; +def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; +def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; +def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; +def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>; +def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>; +def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>; +def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>; +def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 2>], + (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 3>], + (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL11], 4>], + (instregex "^TB[LX]v8i8Four")>; +def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 2>], + 
(instregex "^TB[LX]v16i8Two")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 3>], + (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], + (instregex "^TB[LX]v16i8Four")>; +def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; +def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; + +// ASIMD load instructions. + +// ASIMD store instructions. + +// Cryptography instructions. +def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; +def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>; +def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>; + +// CRC instructions. 
+def : InstRW<[M1WriteC2], (instregex "^CRC32")>; + +} // SchedModel = ExynosM1Model diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 79c6604c4cc8..844d89c737bf 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>; def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; +def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< "localmemorysize"#Value, @@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + FeatureGCN3Encoding, FeatureCIInsts]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 4796e9ef3454..49c94f1eceb8 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -53,7 +53,8 @@ public: ISAVersion7_0_0, ISAVersion7_0_1, ISAVersion8_0_0, - ISAVersion8_0_1 + ISAVersion8_0_1, + ISAVersion8_0_3 }; private: diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index a1584a224cbd..4300d972d46b 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -128,21 +128,23 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, //===----------------------------------------------------------------------===// def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, 
FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0] + [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0, + FeatureLDSBankCount32] >; def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] >; def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32] >; def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureISAVersion8_0_1] + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] >; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 609f5e7df549..025ed2b5b76b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, bool IsLoad = TII->get(LoadStoreOp).mayLoad(); bool RanOutOfSGPRs = false; + bool Scavenged = false; unsigned SOffset = ScratchOffset; unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); @@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, if (SOffset == AMDGPU::NoRegister) { RanOutOfSGPRs = true; SOffset = AMDGPU::SGPR0; + } else { + Scavenged = true; } BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) .addReg(ScratchOffset) @@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; + unsigned SOffsetRegState = 0; + if (i + 1 == e && Scavenged) + SOffsetRegState |= RegState::Kill; + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) 
.addReg(SubReg, getDefRegState(IsLoad)) .addReg(ScratchRsrcReg) - .addReg(SOffset) + .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3b4c235c0dc9..1f5deaef9d3b 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; + if (Features.test(FeatureISAVersion8_0_3)) + return {8, 0, 3}; + return {0, 0, 0}; } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index dfbb96959470..6e7edbf9fb15 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, // If Offset is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (Offset.getOpcode() == ISD::MUL) { + if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { @@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, // If OffReg is a multiply-by-constant and it's profitable to extract a shift // and use it in a shifted operand do so. - if (OffReg.getOpcode() == ISD::MUL) { + if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) { unsigned PowerOfTwo = 0; SDValue NewMulConst; if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index b451ebf7f27a..16dcd468c91d 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) { // extension rather than sign extension. 
Make sure we pass the return // value extension property to integer materialization. unsigned SrcReg = - PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass); + int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue(); // If the constant is in range, use a load-immediate. - if (UseSExt && isInt<16>(CI->getSExtValue())) { + // Since LI will sign extend the constant we need to make sure that for + // our zeroext constants that the sign extended constant fits into 16-bits - + // a range of 0..0x7fff. + if (isInt<16>(Imm)) { unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getSExtValue()); - return ImmReg; - } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { - unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; - unsigned ImmReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm(CI->getZExtValue()); + .addImm(Imm); return ImmReg; } // Construct the constant piecewise. 
- int64_t Imm = CI->getZExtValue(); - if (VT == MVT::i64) return PPCMaterialize64BitInt(Imm, RC); else if (VT == MVT::i32) diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index cb0271fe8d0c..53674681b213 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, v16i8, v8i16>; def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, - v16i8, v4i32>; + v8i16, v4i32>; def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, v8i16, v4i32>; def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ee732675fb39..b0a612764636 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask, if (CCMask == SystemZ::CCMASK_CMP_NE) return SystemZ::CCMASK_TM_SOME_1; } - if (EffectivelyUnsigned && CmpVal <= Low) { + if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) { if (CCMask == SystemZ::CCMASK_CMP_LT) return SystemZ::CCMASK_TM_ALL_0; if (CCMask == SystemZ::CCMASK_CMP_GE) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 34f39190ab96..c12a3ed43d29 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SETCCE, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); @@ -14975,8 +14976,11 @@ 
SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { assert(Carry.getOpcode() != ISD::CARRY_FALSE); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); - return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), - DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); + return SetCC; } // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. @@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { + if (isAllOnesConstant(Mask)) + return DAG.getTargetConstant(1, dl, MaskVT); + if (X86::isZeroNode(Mask)) + return DAG.getTargetConstant(0, dl, MaskVT); + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, @@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); @@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else { - MVT BitcastVT = MVT::getVectorVT(MVT::i1, - Mask.getSimpleValueType().getSizeInBits()); - // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements - // are extracted by EXTRACT_SUBVECTOR. 
- MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - } + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); @@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg; - ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); - if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl); //SDVTList VTs = DAG.getVTList(MVT::Other); - SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } @@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, - Scale, Chain); + Scale, Chain, *Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -17521,7 +17503,8 @@ static 
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); - return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, + *Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index c0786afe965e..d9311a343ead 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BO1->getOperand(0)); } - if (CI->isMaxValue(true)) { + if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { ICmpInst::Predicate Pred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 47406b9a1632..dd2889de405e 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { ConstantInt::get(IdxType, i), }; auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); - auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); + auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(), + LoadName); V = IC.Builder->CreateInsertValue(V, L, i); } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 5cde31a9162e..bc4c0ebae790 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -380,6 +380,23 @@ static 
void replaceExtractElements(InsertElementInst *InsElt, ExtendMask.push_back(UndefValue::get(IntType)); Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); + BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) + ? ExtVecOpInst->getParent() + : ExtElt->getParent(); + + // TODO: This restriction matches the basic block check below when creating + // new extractelement instructions. If that limitation is removed, this one + // could also be removed. But for now, we just bail out to ensure that we + // will replace the extractelement instruction that is feeding our + // insertelement instruction. This allows the insertelement to then be + // replaced by a shufflevector. If the insertelement is not replaced, we can + // induce infinite looping because there's an optimization for extractelement + // that will delete our widening shuffle. This would trigger another attempt + // here to create that shuffle, and we spin forever. + if (InsertionBlock != InsElt->getParent()) + return; + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ConstantVector::get(ExtendMask)); @@ -387,7 +404,6 @@ static void replaceExtractElements(InsertElementInst *InsElt, // (as long as it's not a PHI) or at the start of the basic block of the // extract, so any subsequent extracts in the same basic block can use it. // TODO: Insert before the earliest ExtractElementInst that is replaced. 
- auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp); if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst)) WideVec->insertAfter(ExtVecOpInst); else diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 3125a2c359b6..e484b690597e 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -90,6 +90,11 @@ static cl::opt<bool> SpeculateOneExpensiveInst( cl::desc("Allow exactly one expensive instruction to be speculatively " "executed")); +static cl::opt<unsigned> MaxSpeculationDepth( + "max-speculation-depth", cl::Hidden, cl::init(10), + cl::desc("Limit maximum recursion depth when calculating costs of " + "speculatively executed instructions")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); @@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, unsigned &CostRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { + // It is possible to hit a zero-cost cycle (phi/gep instructions for example), + // so limit the recursion depth. + // TODO: While this recursion limit does prevent pathological behavior, it + // would be better to track visited instructions to avoid cycles. 
+ if (Depth == MaxSpeculationDepth) + return false; + Instruction *I = dyn_cast<Instruction>(V); if (!I) { // Non-instructions all dominate instructions, but not all constantexprs diff --git a/test/Analysis/DemandedBits/basic.ll b/test/Analysis/DemandedBits/basic.ll index 9973edf79c17..3fd1b3212883 100644 --- a/test/Analysis/DemandedBits/basic.ll +++ b/test/Analysis/DemandedBits/basic.ll @@ -10,34 +10,3 @@ define i8 @test_mul(i32 %a, i32 %b) { %3 = trunc i32 %2 to i8 ret i8 %3 } - -; CHECK-LABEL: 'test_icmp1' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFFF for %2 = shl i32 %1, 4 -define i1 @test_icmp1(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = shl i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp2' -; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2 -; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0xFF for %2 = ashr i32 %1, 4 -define i1 @test_icmp2(i32 %a, i32 %b) { - %1 = and i32 %a, 255 - %2 = ashr i32 %1, 4 - %3 = icmp eq i32 %1, %2 - ret i1 %3 -} - -; CHECK-LABEL: 'test_icmp3' -; CHECK-DAG: DemandedBits: 0xFFFFFFFF for %1 = and i32 %a, 255 -; CHECK-DAG: DemandedBits: 0x1 for %2 = icmp eq i32 -1, %1 -define i1 @test_icmp3(i32 %a) { - %1 = and i32 %a, 255 - %2 = icmp eq i32 -1, %1 - ret i1 %2 -} diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll index f6e4bdf73459..b892f1902b03 100644 --- a/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -267,4 +267,278 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { ret <4 x i16> %1 } +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_une: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ne +; 
CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +; CHECK-DAG: csel {{.*}}, wzr, ne +define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp une <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ueq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +; CHECK-DAG: csel {{.*}}, vs +define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ueq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ugt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +; CHECK-DAG: csel {{.*}}, wzr, hi +define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ugt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +; CHECK-DAG: csel {{.*}}, wzr, pl +define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ult: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt 
+; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +; CHECK-DAG: csel {{.*}}, wzr, lt +define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ult <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ule: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +; CHECK-DAG: csel {{.*}}, wzr, le +define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ule <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_uno: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +; CHECK-DAG: csel {{.*}}, wzr, vs +define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp uno <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_one: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +; CHECK-DAG: csel {{.*}}, gt +define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp one <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oeq: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt 
+; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +; CHECK-DAG: csel {{.*}}, wzr, eq +define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oeq <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ogt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +; CHECK-DAG: csel {{.*}}, wzr, gt +define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ogt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_oge: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +; CHECK-DAG: csel {{.*}}, wzr, ge +define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp oge <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_olt: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +; CHECK-DAG: csel {{.*}}, wzr, mi +define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp olt <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ole: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; 
CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +; CHECK-DAG: csel {{.*}}, wzr, ls +define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ole <4 x half> %a, %b + ret <4 x i1> %1 +} + +; Function Attrs: nounwind readnone +; CHECK-LABEL: test_fcmp_ord: +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: fcvt +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +; CHECK-DAG: csel {{.*}}, wzr, vc +define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { + %1 = fcmp ord <4 x half> %a, %b + ret <4 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll index 137d1f358a30..2f70f3635d19 100644 --- a/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -421,4 +421,88 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { ret <8 x i16> %1 } +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp une <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped. +define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ueq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ugt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ult <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ule <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp uno <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped. +define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp one <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oeq <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ogt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp oge <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp olt <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. +define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ole <8 x half> %a, %b + ret <8 x i1> %1 +} + +; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped. 
+define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 { + %1 = fcmp ord <8 x half> %a, %b + ret <8 x i1> %1 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll index 0e4662231b4f..f82e98e79545 100644 --- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -1,6 +1,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s ; HSA: .hsa_code_object_version 1,0 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" +; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU" diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll index 3d05da616e4e..fdc324087015 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -1,5 +1,6 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s ;GCN-LABEL: {{^}}main: diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll new file mode 100644 index 000000000000..4a12ed545b81 --- /dev/null +++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; When the 
offset of VGPR spills into scratch space gets too large, an additional SGPR +; is used to calculate the scratch load/store address. Make sure that this +; mechanism works even when many spills happen. + +; Just test that it compiles successfully. +; CHECK-LABEL: test +define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in, + <96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) { +entry: + %tid = call i32 @llvm.SI.tid() nounwind readnone + + %aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + %a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr + +; mark most VGPR registers as used to increase register pressure + call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" () + call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" () + call void asm sideeffect "", "~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" () + call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" () + call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" () + call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" () + call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" () + + %outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid + store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr + + ret void +} + +declare i32 @llvm.SI.tid() nounwind readnone diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll index 5d44eb0f11d1..e5f9b118527a 100644 --- a/test/CodeGen/ARM/shifter_operand.ll +++ b/test/CodeGen/ARM/shifter_operand.ll @@ -239,3 +239,20 @@ define void 
@test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) { store i32 %add, i32* %addr ret void } + +define { i32, i32 } @test_multi_use_add(i32 %base, i32 %offset) { +; CHECK-LABEL: test_multi_use_add: +; CHECK-THUMB: movs [[CONST:r[0-9]+]], #28 +; CHECK-THUMB: movt [[CONST]], #1 + + %prod = mul i32 %offset, 65564 + %sum = add i32 %base, %prod + + %ptr = inttoptr i32 %sum to i32* + %loaded = load i32, i32* %ptr + + %ret.tmp = insertvalue { i32, i32 } undef, i32 %sum, 0 + %ret = insertvalue { i32, i32 } %ret.tmp, i32 %loaded, 1 + + ret { i32, i32 } %ret +} diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll index e05ef7d9ab82..0adb5a935109 100644 --- a/test/CodeGen/PowerPC/fast-isel-ret.ll +++ b/test/CodeGen/PowerPC/fast-isel-ret.ll @@ -186,3 +186,12 @@ entry: ; ELF64: blr ret i32 -1 } + +define zeroext i16 @ret20() nounwind { +entry: +; ELF64-LABEL: ret20 +; ELF64: lis{{.*}}0 +; ELF64: ori{{.*}}32768 +; ELF64: blr + ret i16 32768 +} diff --git a/test/CodeGen/PowerPC/inline-asm-s-modifier.ll b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll new file mode 100644 index 000000000000..c8b00b6deb6e --- /dev/null +++ b/test/CodeGen/PowerPC/inline-asm-s-modifier.ll @@ -0,0 +1,10 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define void @test() { +entry: + call void asm sideeffect "mtfsb1 ${0:s}", "i"(i32 7), !srcloc !1 + ret void +} +; CHECK: #APP +; CHECK-NEXT: mtfsb1 25 + +!1 = !{i32 40} diff --git a/test/CodeGen/PowerPC/pr26193.ll b/test/CodeGen/PowerPC/pr26193.ll new file mode 100644 index 000000000000..acd99bc0331a --- /dev/null +++ b/test/CodeGen/PowerPC/pr26193.ll @@ -0,0 +1,9 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s +define <8 x i16> @test(<4 x i32> %a) { +entry: + %0 = tail call <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32> %a, <4 x i32> %a) + ret <8 x i16> %0 +} +; CHECK: vpkswss 2, + +declare <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32>, <4 x i32>) 
diff --git a/test/CodeGen/PowerPC/pr26356.ll b/test/CodeGen/PowerPC/pr26356.ll new file mode 100644 index 000000000000..0f5d877b5764 --- /dev/null +++ b/test/CodeGen/PowerPC/pr26356.ll @@ -0,0 +1,136 @@ +; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s + +define zeroext i32 @f1() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i32 @f2() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i32 @f3() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3 +; CHECK: li 3, 32767 + +define zeroext i16 @f4() { +entry: + ret i16 65535 +} +; CHECK-LABEL: @f4 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f5() { +entry: + ret i16 32768 +} +; CHECK-LABEL: @f5 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define zeroext i16 @f6() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f6 +; CHECK: li 3, 32767 + +define zeroext i16 @f7() { +entry: + ret i16 -1 +} +; CHECK-LABEL: @f7 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define zeroext i16 @f8() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f8 +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f1s() { +entry: + ret i32 65535 +} +; CHECK-LABEL: @f1s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 65535 + +define signext i32 @f2s() { +entry: + ret i32 32768 +} +; CHECK-LABEL: @f2s +; CHECK: lis 3, 0 +; CHECK: ori 3, 3, 32768 + +define signext i32 @f3s() { +entry: + ret i32 32767 +} +; CHECK-LABEL: @f3s +; CHECK: li 3, 32767 + +define signext i16 @f4s() { +entry: + ret i16 32767 +} +; CHECK-LABEL: @f4s +; CHECK: li 3, 32767 + +define signext i32 @f1sn() { +entry: + ret i32 -65535 +} +; CHECK-LABEL: @f1sn +; CHECK: lis 3, -1 +; CHECK: ori 3, 3, 1 + +define signext i32 @f2sn() { +entry: + ret i32 -32768 +} +; CHECK-LABEL: @f2sn +; CHECK: li 3, -32768 + +define signext i32 @f3sn() { +entry: + ret i32 -32767 +} +; CHECK-LABEL: @f3sn +; CHECK: li 3, -32767 
+ +define signext i32 @f4sn() { +entry: + ret i32 -65536 +} +; CHECK-LABEL: @f4sn +; CHECK: lis 3, -1 + +define signext i16 @f5sn() { +entry: + ret i16 -32767 +} +; CHECK-LABEL: @f5sn +; CHECK: li 3, -32767 + +define signext i16 @f6sn() { +entry: + ret i16 -32768 +} +; CHECK-LABEL: @f6sn +; CHECK: li 3, -32768 diff --git a/test/CodeGen/PowerPC/pr26381.ll b/test/CodeGen/PowerPC/pr26381.ll new file mode 100644 index 000000000000..a45288e545f2 --- /dev/null +++ b/test/CodeGen/PowerPC/pr26381.ll @@ -0,0 +1,8 @@ +; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s + +define internal signext i32 @foo() #0 { + ret i32 -125452974 +} + +; CHECK: lis 3, -1915 +; CHECK: ori 3, 3, 48466 diff --git a/test/CodeGen/SystemZ/int-cmp-53.ll b/test/CodeGen/SystemZ/int-cmp-53.ll new file mode 100644 index 000000000000..b7d985eeefe7 --- /dev/null +++ b/test/CodeGen/SystemZ/int-cmp-53.ll @@ -0,0 +1,26 @@ +; This used to incorrectly use a TMLL for an always-false test at -O0. +; +; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @test(i8 *%input, i32 *%result) { +entry: +; CHECK-NOT: tmll + + %0 = load i8, i8* %input, align 1 + %1 = trunc i8 %0 to i1 + %2 = zext i1 %1 to i32 + %3 = icmp sge i32 %2, 0 + br i1 %3, label %if.then, label %if.else + +if.then: + store i32 1, i32* %result, align 4 + br label %return + +if.else: + store i32 0, i32* %result, align 4 + br label %return + +return: + ret void +} + diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 3bc67cceaab5..9ba18192f5d2 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -259,18 +259,22 @@ define void @prefetch(<8 x i64> %ind, i8* %base) { ; CHECK: ## BB#0: ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: movb 
$1, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: movb $120, %al +; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} ; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) - call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0) - call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1) + call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1) ret void } - declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32) define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) { @@ -790,3 +794,54 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, < ret void } +define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) { +; CHECK-LABEL: scatter_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} +; CHECK-NEXT: movb $96, %al +; CHECK-NEXT: kmovb %eax, %k1 +; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: retq + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) + call void 
@llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2) + call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4) + ret void +} + +define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) { +; CHECK-LABEL: gather_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm2 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: movw $1, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: movw $220, %ax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4) + %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4) + + %res4 = fadd <16 x float> %res, %res1 + %res5 = fadd <16 x float> %res3, %res2 + %res6 = fadd <16 x float> %res5, %res4 + ret <16 x float> %res6 +} diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 77739e72fcc8..91b42bd67767 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -1,26 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < 
%s | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX +; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32 + ; Verify that we don't crash during codegen due to a wrong lowering ; of a setcc node with illegal operand types and return type. define <8 x i16> @pr25080(<8 x i32> %a) { -; CHECK-LABEL: pr25080: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; CHECK-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 -; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; AVX-LABEL: pr25080: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %0 = trunc <8 x i32> %a to <8 x i23> %1 = icmp eq <8 x i23> %0, zeroinitializer @@ -28,3 +30,46 @@ entry: %3 = sext <8 x i1> %2 to <8 x i16> ret <8 x i16> %3 } + +define void @pr26232(i64 %a) { +; KNL-32-LABEL: pr26232: +; KNL-32: # BB#0: # %for_test11.preheader +; KNL-32-NEXT: pushl %esi +; 
KNL-32-NEXT: .Ltmp0: +; KNL-32-NEXT: .cfi_def_cfa_offset 8 +; KNL-32-NEXT: .Ltmp1: +; KNL-32-NEXT: .cfi_offset %esi, -8 +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; KNL-32-NEXT: movw $-1, %dx +; KNL-32-NEXT: .align 16, 0x90 +; KNL-32-NEXT: .LBB1_1: # %for_loop599 +; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1 +; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000 +; KNL-32-NEXT: movl %eax, %esi +; KNL-32-NEXT: sbbl $0, %esi +; KNL-32-NEXT: movl $0, %esi +; KNL-32-NEXT: cmovlw %dx, %si +; KNL-32-NEXT: testw %si, %si +; KNL-32-NEXT: jne .LBB1_1 +; KNL-32-NEXT: # BB#2: # %for_exit600 +; KNL-32-NEXT: popl %esi +; KNL-32-NEXT: retl +allocas: + br label %for_test11.preheader + +for_test11.preheader: ; preds = %for_test11.preheader, %allocas + br i1 undef, label %for_loop599, label %for_test11.preheader + +for_loop599: ; preds = %for_loop599, %for_test11.preheader + %less_i_load605_ = icmp slt i64 %a, 65536 + %less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0 + %less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer + %"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef + %intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16 + %res.i895 = icmp eq i16 %intmask.i894, 0 + br i1 %res.i895, label %for_exit600, label %for_loop599 + +for_exit600: ; preds = %for_loop599 + ret void +} diff --git a/test/DebugInfo/X86/PR26148.ll b/test/DebugInfo/X86/PR26148.ll new file mode 100644 index 000000000000..b552508910c9 --- /dev/null +++ b/test/DebugInfo/X86/PR26148.ll @@ -0,0 +1,102 @@ +; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s +; +; Created using clang -g -O3 from: +; struct S0 { +; short f0; +; int f3; +; } a; +; void fn1(short p1) { +; struct S0 b, c = {3}; +; b.f3 = p1; +; a = b = c; +; } +; +; int main() { return 0; } +; +; This is similar to the bug in 
test/DebugInfo/ARM/PR26163.ll, except that there is an +; extra non-overlapping range first. Thus, we make sure that the backend actually looks +; at all expressions when determining whether to merge ranges, not just the first one. +; As in PR26163, we expect two ranges (as opposed to one), the first one being zero-sized. +; +; +; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004 +; CHECK: Ending address offset: 0x0000000000000004 +; CHECK: Location description: 10 03 55 93 04 +; CHECK: Beginning address offset: 0x0000000000000004 +; CHECK: Ending address offset: 0x0000000000000014 +; CHECK: Location description: 10 03 10 00 + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +%struct.S0 = type { i16, i32 } + +@a = common global %struct.S0 zeroinitializer, align 4 + +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +; The attributes are here to force the zero-sized range not to be at the start of +; the function, which has special interpretation in DWARF. The fact that this happens +; at all is probably an LLVM bug. 
+attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" } +define void @fn1(i16 signext %p1) #0 !dbg !4 { +entry: + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !9, metadata !26), !dbg !27 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !10, metadata !26), !dbg !28 + tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !16, metadata !26), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !16, metadata !30), !dbg !29 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29 + tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !10, metadata !32), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !30), !dbg !28 + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !31), !dbg !28 + store i32 3, i32* bitcast (%struct.S0* @a to i32*), align 4, !dbg !33 + store i32 0, i32* getelementptr inbounds (%struct.S0, %struct.S0* @a, i64 0, i32 1), align 4, !dbg !33 + ret void, !dbg !34 +} + +define i32 @main() !dbg !17 { +entry: + ret i32 0, !dbg !35 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!22, !23, !24} +!llvm.ident = !{!25} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3, globals: !20) +!1 = !DIFile(filename: "small.c", directory: "/Users/kfischer/Projects/clangbug") +!2 = !{} +!3 = !{!4, !17} +!4 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !5, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7} +!7 = !DIBasicType(name: 
"short", size: 16, align: 16, encoding: DW_ATE_signed) +!8 = !{!9, !10, !16} +!9 = !DILocalVariable(name: "p1", arg: 1, scope: !4, file: !1, line: 5, type: !7) +!10 = !DILocalVariable(name: "b", scope: !4, file: !1, line: 6, type: !11) +!11 = !DICompositeType(tag: DW_TAG_structure_type, name: "S0", file: !1, line: 1, size: 64, align: 32, elements: !12) +!12 = !{!13, !14} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "f0", scope: !11, file: !1, line: 2, baseType: !7, size: 16, align: 16) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "f3", scope: !11, file: !1, line: 3, baseType: !15, size: 32, align: 32, offset: 32) +!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!16 = !DILocalVariable(name: "c", scope: !4, file: !1, line: 6, type: !11) +!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 11, type: !18, isLocal: false, isDefinition: true, scopeLine: 11, isOptimized: true, variables: !2) +!18 = !DISubroutineType(types: !19) +!19 = !{!15} +!20 = !{!21} +!21 = !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 4, type: !11, isLocal: false, isDefinition: true, variable: %struct.S0* @a) +!22 = !{i32 2, !"Dwarf Version", i32 2} +!23 = !{i32 2, !"Debug Info Version", i32 3} +!24 = !{i32 1, !"PIC Level", i32 2} +!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)"} +!26 = !DIExpression() +!27 = !DILocation(line: 5, column: 16, scope: !4) +!28 = !DILocation(line: 6, column: 13, scope: !4) +!29 = !DILocation(line: 6, column: 16, scope: !4) +!30 = !DIExpression(DW_OP_bit_piece, 0, 32) +!31 = !DIExpression(DW_OP_bit_piece, 32, 32) +!32 = !DIExpression(DW_OP_bit_piece, 32, 16) +!33 = !DILocation(line: 8, column: 9, scope: !4) +!34 = !DILocation(line: 9, column: 1, scope: !4) +!35 = !DILocation(line: 11, column: 14, scope: !17) diff --git a/test/Transforms/InstCombine/icmp.ll 
b/test/Transforms/InstCombine/icmp.ll index 7d6ec96b5328..1e64cd7f5820 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -1672,3 +1672,15 @@ define i1 @cmp_slt_rhs_inc(float %x, i32 %i) { %cmp = icmp slt i32 %conv, %inc ret i1 %cmp } + +; CHECK-LABEL: @PR26407 +; CHECK-NEXT: %[[addx:.*]] = add i32 %x, 2147483647 +; CHECK-NEXT: %[[addy:.*]] = add i32 %y, 2147483647 +; CHECK-NEXT: %[[cmp:.*]] = icmp uge i32 %[[addx]], %[[addy]] +; CHECK-NEXT: ret i1 %[[cmp]] +define i1 @PR26407(i32 %x, i32 %y) { + %addx = add i32 %x, 2147483647 + %addy = add i32 %y, 2147483647 + %cmp = icmp uge i32 %addx, %addy + ret i1 %cmp +} diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll index 47c2a139a479..8ed4db8bbbc3 100644 --- a/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -175,3 +175,33 @@ bb3: ret <4 x double> %tmp4 } +; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354 +; Don't create a shufflevector if we know that we're not going to replace the insertelement. 
+ +define double @pr26354(<2 x double>* %tmp, i1 %B) { +; CHECK-LABEL: @pr26354( +; CHECK: %ld = load <2 x double>, <2 x double>* %tmp +; CHECK-NEXT: %e1 = extractelement <2 x double> %ld, i32 0 +; CHECK-NEXT: br i1 %B, label %if, label %end +; CHECK: if: +; CHECK-NEXT: %e2 = extractelement <2 x double> %ld, i32 1 +; CHECK-NEXT: %i1 = insertelement <4 x double> +; CHECK-NEXT: br label %end + +entry: + %ld = load <2 x double>, <2 x double>* %tmp + %e1 = extractelement <2 x double> %ld, i32 0 + %e2 = extractelement <2 x double> %ld, i32 1 + br i1 %B, label %if, label %end + +if: + %i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3 + br label %end + +end: + %ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ] + %e3 = extractelement <4 x double> %ph, i32 1 + %mu = fmul double %e1, %e3 + ret double %mu +} + diff --git a/test/Transforms/InstCombine/unpack-fca.ll b/test/Transforms/InstCombine/unpack-fca.ll index 9b8d10457491..435983924b77 100644 --- a/test/Transforms/InstCombine/unpack-fca.ll +++ b/test/Transforms/InstCombine/unpack-fca.ll @@ -136,3 +136,18 @@ define %B @structB(%B* %b.ptr) { %1 = load %B, %B* %b.ptr, align 8 ret %B %1 } + +%struct.S = type <{ i8, %struct.T }> +%struct.T = type { i32, i32 } + +; Make sure that we do not increase alignment of packed struct element +define i32 @packed_alignment(%struct.S* dereferenceable(9) %s) { +; CHECK-LABEL: packed_alignment +; CHECK-NEXT: %tv.elt1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1, i32 1 +; CHECK-NEXT: %tv.unpack2 = load i32, i32* %tv.elt1, align 1 +; CHECK-NEXT: ret i32 %tv.unpack2 + %t = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1 + %tv = load %struct.T, %struct.T* %t, align 1 + %v = extractvalue %struct.T %tv, 1 + ret i32 %v +} diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index eee310491805..51f899c2f645 100644 --- 
a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -205,39 +205,5 @@ for.body: ; preds = %for.body, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: @add_g -; CHECK: load <16 x i8> -; CHECK: xor <16 x i8> -; CHECK: icmp ult <16 x i8> -; CHECK: select <16 x i1> {{.*}}, <16 x i8> -; CHECK: store <16 x i8> -define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 { - %1 = icmp sgt i32 %len, 0 - br i1 %1, label %.lr.ph, label %._crit_edge - -.lr.ph: ; preds = %0 - %2 = sext i8 %arg1 to i64 - br label %3 - -._crit_edge: ; preds = %3, %0 - ret void - -; <label>:3 ; preds = %3, %.lr.ph - %indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ] - %x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv - %x5 = load i8, i8* %x4 - %x7 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv - %x8 = load i8, i8* %x7 - %x9 = zext i8 %x5 to i32 - %x10 = xor i32 %x9, 255 - %x11 = icmp ult i32 %x10, 24 - %x12 = select i1 %x11, i32 %x10, i32 24 - %x13 = trunc i32 %x12 to i8 - store i8 %x13, i8* %x4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %len - br i1 %exitcond, label %._crit_edge, label %3 -} attributes #0 = { nounwind } diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll index cae1a91bd43d..6953cf9c8b33 100644 --- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -1302,3 +1302,35 @@ l6: ; CHECK: entry ; CHECK-NEXT: switch } + +; Speculation depth must be limited to avoid a zero-cost instruction cycle. 
+ +; CHECK-LABEL: @PR26308( +; CHECK: cleanup4: +; CHECK-NEXT: br label %cleanup4 + +define i32 @PR26308(i1 %B, i64 %load) { +entry: + br label %while.body + +while.body: + br label %cleanup + +cleanup: + %cleanup.dest.slot.0 = phi i1 [ false, %while.body ] + br i1 %cleanup.dest.slot.0, label %for.cond, label %cleanup4 + +for.cond: + %e.0 = phi i64* [ undef, %cleanup ], [ %incdec.ptr, %for.cond2 ] + %pi = ptrtoint i64* %e.0 to i64 + %incdec.ptr = getelementptr inbounds i64, i64* %e.0, i64 1 + br label %for.cond2 + +for.cond2: + %storemerge = phi i64 [ %pi, %for.cond ], [ %load, %for.cond2 ] + br i1 %B, label %for.cond2, label %for.cond + +cleanup4: + br label %while.body +} + diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 9b89d87fc571..f50382b08977 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT LLVM_USE_INTEL_JITEVENTS ) set(LLVM_TOOL_LLVM_JITLISTENER_BUILD Off) endif() -if(CYGWIN) +if(CYGWIN OR NOT LLVM_ENABLE_PIC) set(LLVM_TOOL_LTO_BUILD Off) set(LLVM_TOOL_LLVM_LTO_BUILD Off) endif() diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh index c5fe631da998..69de8371d631 100755 --- a/utils/release/test-release.sh +++ b/utils/release/test-release.sh @@ -155,9 +155,12 @@ while [ $# -gt 0 ]; do done if [ "$use_autoconf" = "no" ]; then - # See llvm.org/PR26146. - echo Skipping test-suite when using CMake. - do_test_suite="no" + if [ "$do_test_suite" = "yes" ]; then + # See llvm.org/PR26146. + echo Skipping test-suite build when using CMake. + echo It will still be exported. + do_test_suite="export-only" + fi fi # Check required arguments. 
@@ -202,9 +205,11 @@ if [ $do_libs = "yes" ]; then projects="$projects libunwind" fi fi -if [ $do_test_suite = "yes" ]; then - projects="$projects test-suite" -fi +case $do_test_suite in + yes|export-only) + projects="$projects test-suite" + ;; +esac if [ $do_openmp = "yes" ]; then projects="$projects openmp" fi @@ -277,9 +282,16 @@ function export_sources() { clang-tools-extra) projsrc=llvm.src/tools/clang/tools/extra ;; - compiler-rt|libcxx|libcxxabi|libunwind|openmp|test-suite) + compiler-rt|libcxx|libcxxabi|libunwind|openmp) projsrc=llvm.src/projects/$proj ;; + test-suite) + if [ $do_test_suite = 'yes' ]; then + projsrc=llvm.src/projects/$proj + else + projsrc=$proj.src + fi + ;; *) echo "error: unknown project $proj" exit 1 diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt index b34e22ae0cb4..c9a2cdd45c8e 100644 --- a/utils/unittest/CMakeLists.txt +++ b/utils/unittest/CMakeLists.txt @@ -32,10 +32,6 @@ if (NOT LLVM_ENABLE_THREADS) add_definitions( -DGTEST_HAS_PTHREAD=0 ) endif() -set(LIBS - LLVMSupport # Depends on llvm::raw_ostream -) - find_library(PTHREAD_LIBRARY_PATH pthread) if (PTHREAD_LIBRARY_PATH) list(APPEND LIBS pthread) @@ -46,6 +42,9 @@ add_llvm_library(gtest LINK_LIBS ${LIBS} + + LINK_COMPONENTS + Support # Depends on llvm::raw_ostream ) add_subdirectory(UnitTestMain) diff --git a/utils/unittest/UnitTestMain/CMakeLists.txt b/utils/unittest/UnitTestMain/CMakeLists.txt index 65ef97b02816..520db4e8d2b3 100644 --- a/utils/unittest/UnitTestMain/CMakeLists.txt +++ b/utils/unittest/UnitTestMain/CMakeLists.txt @@ -3,5 +3,7 @@ add_llvm_library(gtest_main LINK_LIBS gtest - LLVMSupport # Depends on llvm::cl + + LINK_COMPONENTS + Support # Depends on llvm::cl )