Vendor import of llvm release_38 branch r260756:
https://llvm.org/svn/llvm-project/llvm/branches/release_38@260756
This commit is contained in:
parent
44c4732640
commit
97a7b8a20a
@ -468,20 +468,23 @@ function(llvm_add_library name)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Add the explicit dependency information for this library.
|
||||
#
|
||||
# It would be nice to verify that we have the dependencies for this library
|
||||
# name, but using get_property(... SET) doesn't suffice to determine if a
|
||||
# property has been set to an empty value.
|
||||
get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name})
|
||||
|
||||
if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_STATIC AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
|
||||
set(llvm_libs LLVM)
|
||||
if (DEFINED LLVM_LINK_COMPONENTS OR DEFINED ARG_LINK_COMPONENTS)
|
||||
if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
|
||||
set(llvm_libs LLVM)
|
||||
else()
|
||||
llvm_map_components_to_libnames(llvm_libs
|
||||
${ARG_LINK_COMPONENTS}
|
||||
${LLVM_LINK_COMPONENTS}
|
||||
)
|
||||
endif()
|
||||
else()
|
||||
llvm_map_components_to_libnames(llvm_libs
|
||||
${ARG_LINK_COMPONENTS}
|
||||
${LLVM_LINK_COMPONENTS}
|
||||
)
|
||||
# Components have not been defined explicitly in CMake, so add the
|
||||
# dependency information for this library as defined by LLVMBuild.
|
||||
#
|
||||
# It would be nice to verify that we have the dependencies for this library
|
||||
# name, but using get_property(... SET) doesn't suffice to determine if a
|
||||
# property has been set to an empty value.
|
||||
get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name})
|
||||
endif()
|
||||
|
||||
if(CMAKE_VERSION VERSION_LESS 2.8.12)
|
||||
@ -882,14 +885,11 @@ function(add_unittest test_suite test_name)
|
||||
|
||||
set(LLVM_REQUIRES_RTTI OFF)
|
||||
|
||||
list(APPEND LLVM_LINK_COMPONENTS Support) # gtest needs it for raw_ostream
|
||||
add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN})
|
||||
set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR})
|
||||
set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir})
|
||||
target_link_libraries(${test_name}
|
||||
gtest
|
||||
gtest_main
|
||||
LLVMSupport # gtest needs it for raw_ostream.
|
||||
)
|
||||
target_link_libraries(${test_name} gtest_main gtest)
|
||||
|
||||
add_dependencies(${test_suite} ${test_name})
|
||||
get_target_property(test_suite_folder ${test_suite} FOLDER)
|
||||
|
@ -40,10 +40,19 @@ macro(llvm_config executable)
|
||||
# done in case libLLVM does not contain all of the components
|
||||
# the target requires.
|
||||
#
|
||||
# TODO strip LLVM_DYLIB_COMPONENTS out of link_components.
|
||||
# Strip LLVM_DYLIB_COMPONENTS out of link_components.
|
||||
# To do this, we need special handling for "all", since that
|
||||
# may imply linking to libraries that are not included in
|
||||
# libLLVM.
|
||||
|
||||
if (DEFINED link_components AND DEFINED LLVM_DYLIB_COMPONENTS)
|
||||
if("${LLVM_DYLIB_COMPONENTS}" STREQUAL "all")
|
||||
set(link_components "")
|
||||
else()
|
||||
list(REMOVE_ITEM link_components ${LLVM_DYLIB_COMPONENTS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
target_link_libraries(${executable} LLVM)
|
||||
endif()
|
||||
|
||||
|
@ -5,11 +5,6 @@ LLVM 3.8 Release Notes
|
||||
.. contents::
|
||||
:local:
|
||||
|
||||
.. warning::
|
||||
These are in-progress notes for the upcoming LLVM 3.8 release. You may
|
||||
prefer the `LLVM 3.7 Release Notes <http://llvm.org/releases/3.7.0/docs
|
||||
/ReleaseNotes.html>`_.
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
@ -26,11 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
|
||||
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
|
||||
them.
|
||||
|
||||
Note that if you are reading this file from a Subversion checkout or the main
|
||||
LLVM web page, this document applies to the *next* release, not the current
|
||||
one. To see the release notes for a specific release, please see the `releases
|
||||
page <http://llvm.org/releases/>`_.
|
||||
|
||||
Non-comprehensive list of changes in this release
|
||||
=================================================
|
||||
* With this release, the minimum Windows version required for running LLVM is
|
||||
@ -79,6 +69,26 @@ Non-comprehensive list of changes in this release
|
||||
|
||||
* Support for dematerializing has been dropped.
|
||||
|
||||
* RegisterScheduler::setDefault was removed. Targets that used to call into the
|
||||
command line parser to set the DAGScheduler, and that don't have enough
|
||||
control with setSchedulingPreference, should look into overriding the
|
||||
SubTargetHook "getDAGScheduler()".
|
||||
|
||||
* ``ilist_iterator<T>`` no longer has implicit conversions to and from ``T*``,
|
||||
since ``ilist_iterator<T>`` may be pointing at the sentinel (which is usually
|
||||
not of type ``T`` at all). To convert from an iterator ``I`` to a pointer,
|
||||
use ``&*I``; to convert from a pointer ``P`` to an iterator, use
|
||||
``P->getIterator()``. Alternatively, explicit conversions via
|
||||
``static_cast<T>(U)`` are still available.
|
||||
|
||||
* ``ilist_node<T>::getNextNode()`` and ``ilist_node<T>::getPrevNode()`` now
|
||||
fail at compile time when the node cannot access its parent list.
|
||||
Previously, when the sentinel was an ``ilist_half_node<T>``, this API
|
||||
could return the sentinel instead of ``nullptr``. Frustrated callers should
|
||||
be updated to use ``iplist<T>::getNextNode(T*)`` instead. Alternatively, if
|
||||
the node ``N`` is guaranteed not to be the last in the list, it is safe to
|
||||
call ``&*++N->getIterator()`` directly.
|
||||
|
||||
.. NOTE
|
||||
For small 1-3 sentence descriptions, just add an entry at the end of
|
||||
this list. If your description won't fit comfortably in one bullet
|
||||
@ -98,17 +108,97 @@ Non-comprehensive list of changes in this release
|
||||
|
||||
Makes programs 10x faster by doing Special New Thing.
|
||||
|
||||
Changes to the ARM Backend
|
||||
--------------------------
|
||||
|
||||
During this release ...
|
||||
Changes to the ARM Backends
|
||||
---------------------------
|
||||
|
||||
During this release the AArch64 target has:
|
||||
|
||||
* Added support for more sanitizers (MSAN, TSAN) and made them compatible with
|
||||
all VMA kernel configurations (currently tested on 39 and 42 bits).
|
||||
* Gained initial LLD support in the new ELF back-end
|
||||
* Extended the Load/Store optimiser and cleaned up some of the bad decisions
|
||||
made earlier.
|
||||
* Expanded LLDB support, including watchpoints, native building, Renderscript,
|
||||
LLDB-server, debugging 32-bit applications.
|
||||
* Added support for the ``Exynos M1`` chip.
|
||||
|
||||
During this release the ARM target has:
|
||||
|
||||
* Gained massive performance improvements on embedded benchmarks due to finally
|
||||
running the stride vectorizer in full form, incrementing the performance gains
|
||||
that we already had in the previous releases with limited stride vectorization.
|
||||
* Expanded LLDB support, including watchpoints, unwind tables
|
||||
* Extended the Load/Store optimiser and cleaned up some of the bad decisions
|
||||
made earlier.
|
||||
* Simplified code generation for global variable addresses in ELF, resulting in
|
||||
a significant (4% in Chromium) reduction in code size.
|
||||
* Gained some additional code size improvements, though there's still a long road
|
||||
ahead, especially for older cores.
|
||||
* Added some EABI floating point comparison functions to Compiler-RT
|
||||
* Added support for Windows+GNU triple, +features in -mcpu/-march options.
|
||||
|
||||
|
||||
Changes to the MIPS Target
|
||||
--------------------------
|
||||
|
||||
During this release ...
|
||||
During this release the MIPS target has:
|
||||
|
||||
* Significantly extended support for the Integrated Assembler. See below for
|
||||
more information
|
||||
* Added support for the ``P5600`` processor.
|
||||
* Added support for the ``interrupt`` attribute for MIPS32R2 and later. This
|
||||
attribute will generate a function which can be used as an interrupt handler
|
||||
on bare metal MIPS targets using the static relocation model.
|
||||
* Added support for the ``ERETNC`` instruction found in MIPS32R5 and later.
|
||||
* Added support for OpenCL. See http://portablecl.org/.
|
||||
|
||||
* Address spaces 1 to 255 are now reserved for software use and conversions
|
||||
between them are no-op casts.
|
||||
|
||||
* Removed the ``mips16`` value for the -mcpu option since it is an :abbr:`ASE
|
||||
(Application Specific Extension)` and not a processor. If you were using this,
|
||||
please specify another CPU and use ``-mips16`` to enable MIPS16.
|
||||
* Removed ``copy_u.w`` from 32-bit MSA and ``copy_u.d`` from 64-bit MSA since
|
||||
they have been removed from the MSA specification due to forward compatibility
|
||||
issues. For example, 32-bit MSA code containing ``copy_u.w`` would behave
|
||||
differently on a 64-bit processor supporting MSA. The corresponding intrinsics
|
||||
are still available and may expand to ``copy_s.[wd]`` where this is
|
||||
appropriate for forward compatibility purposes.
|
||||
* Relaxed the ``-mnan`` option to allow ``-mnan=2008`` on MIPS32R2/MIPS64R2 for
|
||||
compatibility with GCC.
|
||||
* Made MIPS64R6 the default CPU for 64-bit Android triples.
|
||||
|
||||
The MIPS target has also fixed various bugs including the following notable
|
||||
fixes:
|
||||
|
||||
* Fixed reversed operands on ``mthi``/``mtlo`` in the DSP :abbr:`ASE
|
||||
(Application Specific Extension)`.
|
||||
* The code generator no longer uses ``jal`` for calls to absolute immediate
|
||||
addresses.
|
||||
* Disabled fast instruction selection on MIPS32R6 and MIPS64R6 since this is not
|
||||
yet supported.
|
||||
* Corrected addend for ``R_MIPS_HI16`` and ``R_MIPS_PCHI16`` in MCJIT
|
||||
* The code generator no longer crashes when handling subregisters of a 64-bit
|
||||
FPU register with undefined value.
|
||||
* The code generator no longer attempts to use ``$zero`` for operands that do
|
||||
not permit ``$zero``.
|
||||
* Corrected the opcode used for ``ll``/``sc`` when using MIPS32R6/MIPS64R6 and
|
||||
the Integrated Assembler.
|
||||
* Added support for atomic load and atomic store.
|
||||
* Corrected debug info when dynamically re-aligning the stack.
|
||||
|
||||
Integrated Assembler
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
We have made a large number of improvements to the integrated assembler for
|
||||
MIPS. In this release, the integrated assembler isn't quite production-ready
|
||||
since there are a few known issues related to bare-metal support, checking
|
||||
immediates on instructions, and the N32/N64 ABIs. However, the current support
|
||||
should be sufficient for many users of the O32 ABI, particularly those targeting
|
||||
MIPS32 on Linux or bare-metal MIPS32.
|
||||
|
||||
If you would like to try the integrated assembler, please use
|
||||
``-fintegrated-as``.
|
||||
|
||||
Changes to the PowerPC Target
|
||||
-----------------------------
|
||||
@ -123,6 +213,20 @@ Changes to the X86 Target
|
||||
|
||||
* TLS is enabled for Cygwin as emutls.
|
||||
|
||||
* Smaller code for materializing 32-bit 1 and -1 constants at ``-Os``.
|
||||
|
||||
* More efficient code for wide integer compares. (E.g. 64-bit compares
|
||||
on 32-bit targets.)
|
||||
|
||||
* Tail call support for ``thiscall``, ``stdcall``, ``vectorcall``, and
|
||||
``fastcall`` functions.
|
||||
|
||||
Changes to the AVR Target
|
||||
-------------------------
|
||||
|
||||
Slightly less than half of the AVR backend has been merged in at this point. It is still
|
||||
missing a number of large parts which cause it to be unusable, but is well on the
|
||||
road to being completely merged and workable.
|
||||
|
||||
Changes to the OCaml bindings
|
||||
-----------------------------
|
||||
@ -140,7 +244,19 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
|
||||
a lot of other language and tools projects. This section lists some of the
|
||||
projects that have already been updated to work with LLVM 3.8.
|
||||
|
||||
* A project
|
||||
LDC - the LLVM-based D compiler
|
||||
-------------------------------
|
||||
|
||||
`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
|
||||
pragmatically combines efficiency, control, and modeling power, with safety and
|
||||
programmer productivity. D supports powerful concepts like Compile-Time Function
|
||||
Execution (CTFE) and Template Meta-Programming, provides an innovative approach
|
||||
to concurrency and offers many classical paradigms.
|
||||
|
||||
`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
|
||||
combined with LLVM as backend to produce efficient native code. LDC targets
|
||||
x86/x86_64 systems like Linux, OS X and Windows and also PowerPC (32/64 bit)
|
||||
and ARM. Ports to other architectures like AArch64 and MIPS64 are underway.
|
||||
|
||||
|
||||
Additional Information
|
||||
|
@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
|
||||
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">,
|
||||
Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">,
|
||||
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
|
||||
|
@ -280,11 +280,7 @@ public:
|
||||
// when using them since you might not get all uses.
|
||||
// The methods that don't start with materialized_ assert that modules is
|
||||
// fully materialized.
|
||||
#ifdef NDEBUG
|
||||
void assertModuleIsMaterialized() const {}
|
||||
#else
|
||||
void assertModuleIsMaterialized() const;
|
||||
#endif
|
||||
|
||||
bool use_empty() const {
|
||||
assertModuleIsMaterialized();
|
||||
|
@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandBits(
|
||||
if (OperandNo != 0)
|
||||
AB = AOut;
|
||||
break;
|
||||
case Instruction::ICmp:
|
||||
// Count the number of leading zeroes in each operand.
|
||||
ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1));
|
||||
auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(),
|
||||
KnownZero2.countLeadingOnes());
|
||||
AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
|
||||
return true;
|
||||
O << -MO.getImm();
|
||||
return false;
|
||||
case 's': // The GCC deprecated s modifier
|
||||
if (MO.getType() != MachineOperand::MO_Immediate)
|
||||
return true;
|
||||
O << ((32 - MO.getImm()) & 31);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
|
||||
llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
|
||||
}
|
||||
|
||||
/// Determine whether two variable pieces overlap.
|
||||
static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
|
||||
if (!P1->isBitPiece() || !P2->isBitPiece())
|
||||
return true;
|
||||
// Determine the relative position of the pieces described by P1 and P2.
|
||||
// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
|
||||
// 1 if P1 is entirely after P2.
|
||||
static int pieceCmp(const DIExpression *P1, const DIExpression *P2) {
|
||||
unsigned l1 = P1->getBitPieceOffset();
|
||||
unsigned l2 = P2->getBitPieceOffset();
|
||||
unsigned r1 = l1 + P1->getBitPieceSize();
|
||||
unsigned r2 = l2 + P2->getBitPieceSize();
|
||||
// True where [l1,r1[ and [l2,r2[ overlap.
|
||||
return (l1 < r2) && (l2 < r1);
|
||||
if (r1 <= l2)
|
||||
return -1;
|
||||
else if (r2 <= l1)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Determine whether two variable pieces overlap.
|
||||
static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
|
||||
if (!P1->isBitPiece() || !P2->isBitPiece())
|
||||
return true;
|
||||
return pieceCmp(P1, P2) == 0;
|
||||
}
|
||||
|
||||
/// \brief If this and Next are describing different pieces of the same
|
||||
@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
|
||||
/// Return true if the merge was successful.
|
||||
bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
|
||||
if (Begin == Next.Begin) {
|
||||
auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
|
||||
auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
|
||||
if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
|
||||
!piecesOverlap(Expr, NextExpr)) {
|
||||
addValues(Next.Values);
|
||||
End = Next.End;
|
||||
return true;
|
||||
auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
|
||||
auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
|
||||
if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
|
||||
return false;
|
||||
|
||||
// We can only merge entries if none of the pieces overlap any others.
|
||||
// In doing so, we can take advantage of the fact that both lists are
|
||||
// sorted.
|
||||
for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
|
||||
for (; j < Next.Values.size(); ++j) {
|
||||
int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
|
||||
cast<DIExpression>(Next.Values[j].Expression));
|
||||
if (res == 0) // The two expressions overlap, we can't merge.
|
||||
return false;
|
||||
// Values[i] is entirely before Next.Values[j],
|
||||
// so go back to the next entry of Values.
|
||||
else if (res == -1)
|
||||
break;
|
||||
// Next.Values[j] is entirely before Values[i], so go on to the
|
||||
// next entry of Next.Values.
|
||||
}
|
||||
}
|
||||
|
||||
addValues(Next.Values);
|
||||
End = Next.End;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -313,8 +313,8 @@ void Value::takeName(Value *V) {
|
||||
ST->reinsertValue(this);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
void Value::assertModuleIsMaterialized() const {
|
||||
#ifndef NDEBUG
|
||||
const GlobalValue *GV = dyn_cast<GlobalValue>(this);
|
||||
if (!GV)
|
||||
return;
|
||||
@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized() const {
|
||||
if (!M)
|
||||
return;
|
||||
assert(M->isMaterialized());
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
|
||||
Constant *C) {
|
||||
if (!Cache.insert(Expr).second)
|
||||
|
@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo;
|
||||
include "AArch64SchedA53.td"
|
||||
include "AArch64SchedA57.td"
|
||||
include "AArch64SchedCyclone.td"
|
||||
include "AArch64SchedM1.td"
|
||||
|
||||
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
|
||||
"Cortex-A35 ARM processors",
|
||||
@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
|
||||
// FIXME: Cortex-A72 is currently modelled as a Cortex-A57.
|
||||
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
|
||||
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
|
||||
// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
|
||||
def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>;
|
||||
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Assembly parser
|
||||
|
@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
|
||||
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
|
||||
}
|
||||
|
||||
if (LHS.getValueType().getVectorElementType() == MVT::f16)
|
||||
return SDValue();
|
||||
|
||||
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
|
||||
LHS.getValueType().getVectorElementType() == MVT::f64);
|
||||
|
||||
|
359
lib/Target/AArch64/AArch64SchedM1.td
Normal file
359
lib/Target/AArch64/AArch64SchedM1.td
Normal file
@ -0,0 +1,359 @@
|
||||
//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines the machine model for Samsung Exynos-M1 to support
|
||||
// instruction scheduling and other instruction cost heuristics.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// The Exynos-M1 is a traditional superscalar microprocessor with a
|
||||
// 4-wide in-order stage for decode and dispatch and a wider issue stage.
|
||||
// The execution units and loads and stores are out-of-order.
|
||||
|
||||
def ExynosM1Model : SchedMachineModel {
|
||||
let IssueWidth = 4; // Up to 4 uops per cycle.
|
||||
let MinLatency = 0; // OoO.
|
||||
let MicroOpBufferSize = 96; // ROB size.
|
||||
let LoopMicroOpBufferSize = 32; // Instruction queue size.
|
||||
let LoadLatency = 4; // Optimistic load cases.
|
||||
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
|
||||
let CompleteModel = 0; // Use the default model otherwise.
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Define each kind of processor resource and number available on the Exynos-M1,
|
||||
// which has 9 pipelines, each with its own queue with out-of-order dispatch.
|
||||
|
||||
def M1UnitA : ProcResource<2>; // Simple integer
|
||||
def M1UnitC : ProcResource<1>; // Simple and complex integer
|
||||
def M1UnitB : ProcResource<2>; // Branch
|
||||
def M1UnitL : ProcResource<1>; // Load
|
||||
def M1UnitS : ProcResource<1>; // Store
|
||||
def M1PipeF0 : ProcResource<1>; // FP #0
|
||||
def M1PipeF1 : ProcResource<1>; // FP #1
|
||||
|
||||
let Super = M1PipeF0 in {
|
||||
def M1UnitFMAC : ProcResource<1>; // FP multiplication
|
||||
def M1UnitFCVT : ProcResource<1>; // FP conversion
|
||||
def M1UnitNAL0 : ProcResource<1>; // Simple vector.
|
||||
def M1UnitNMISC : ProcResource<1>; // Miscellanea
|
||||
def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
|
||||
}
|
||||
|
||||
let Super = M1PipeF1 in {
|
||||
def M1UnitFADD : ProcResource<1>; // Simple FP
|
||||
let BufferSize = 1 in
|
||||
def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
|
||||
def M1UnitNAL1 : ProcResource<1>; // Simple vector.
|
||||
def M1UnitFST : ProcResource<1>; // FP store
|
||||
}
|
||||
|
||||
let SchedModel = ExynosM1Model in {
|
||||
def M1UnitALU : ProcResGroup<[M1UnitA,
|
||||
M1UnitC]>; // All simple integer.
|
||||
def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
|
||||
M1UnitNAL1]>; // All simple vector.
|
||||
}
|
||||
|
||||
let SchedModel = ExynosM1Model in {
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Coarse scheduling model for the Exynos-M1.
|
||||
|
||||
// Branch instructions.
|
||||
// TODO: Non-conditional direct branches take zero cycles and units.
|
||||
def : WriteRes<WriteBr, [M1UnitB]> { let Latency = 1; }
|
||||
def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
|
||||
// TODO: Branch and link is much different.
|
||||
|
||||
// Arithmetic and logical integer instructions.
|
||||
def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; }
|
||||
// TODO: Shift over 3 and some extensions take 2 cycles.
|
||||
def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
|
||||
def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
|
||||
def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; }
|
||||
|
||||
// Move instructions.
|
||||
def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
|
||||
|
||||
// Divide and multiply instructions.
|
||||
// TODO: Division blocks the divider inside C.
|
||||
def : WriteRes<WriteID32, [M1UnitC]> { let Latency = 13; }
|
||||
def : WriteRes<WriteID64, [M1UnitC]> { let Latency = 21; }
|
||||
// TODO: Long multiplication takes 5 cycles and also the ALU.
|
||||
// TODO: Multiplication with accumulation can be advanced.
|
||||
def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
|
||||
// TODO: 64-bit multiplication has a throughput of 1/2.
|
||||
def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; }
|
||||
|
||||
// Miscellaneous instructions.
|
||||
def : WriteRes<WriteExtr, [M1UnitALU,
|
||||
M1UnitALU]> { let Latency = 2; }
|
||||
|
||||
// TODO: The latency for the post or pre register is 1 cycle.
|
||||
def : WriteRes<WriteAdr, []> { let Latency = 0; }
|
||||
|
||||
// Load instructions.
|
||||
def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
|
||||
// TODO: Extended address requires also the ALU.
|
||||
def : WriteRes<WriteLDIdx, [M1UnitL]> { let Latency = 5; }
|
||||
def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; }
|
||||
|
||||
// Store instructions.
|
||||
def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
|
||||
// TODO: Extended address requires also the ALU.
|
||||
def : WriteRes<WriteSTIdx, [M1UnitS]> { let Latency = 1; }
|
||||
def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
|
||||
def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
|
||||
|
||||
// FP data instructions.
|
||||
def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
|
||||
// TODO: FCCMP is much different.
|
||||
def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
|
||||
// TODO: DP takes longer.
|
||||
def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15; }
|
||||
// TODO: MACC takes longer.
|
||||
def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; }
|
||||
|
||||
// FP miscellaneous instructions.
|
||||
// TODO: Conversion between register files is much different.
|
||||
def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; }
|
||||
def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; }
|
||||
// TODO: Copy from FPR to GPR is much different.
|
||||
def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; }
|
||||
|
||||
// FP load instructions.
|
||||
// TODO: ASIMD loads are much different.
|
||||
def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
|
||||
|
||||
// FP store instructions.
|
||||
// TODO: ASIMD stores are much different.
|
||||
def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
|
||||
|
||||
// ASIMD FP instructions.
|
||||
// TODO: Other operations are much different.
|
||||
def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
|
||||
|
||||
// Other miscellaneous instructions.
|
||||
def : WriteRes<WriteSys, []> { let Latency = 1; }
|
||||
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
|
||||
def : WriteRes<WriteHint, []> { let Latency = 1; }
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Fast forwarding.
|
||||
|
||||
// TODO: Add FP register forwarding rules.
|
||||
|
||||
def : ReadAdvance<ReadI, 0>;
|
||||
def : ReadAdvance<ReadISReg, 0>;
|
||||
def : ReadAdvance<ReadIEReg, 0>;
|
||||
def : ReadAdvance<ReadIM, 0>;
|
||||
// Integer multiply-accumulate.
|
||||
// TODO: The forwarding for WriteIM64 saves actually 3 cycles.
|
||||
def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
|
||||
def : ReadAdvance<ReadID, 0>;
|
||||
def : ReadAdvance<ReadExtrHi, 0>;
|
||||
def : ReadAdvance<ReadAdrBase, 0>;
|
||||
def : ReadAdvance<ReadVLD, 0>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Finer scheduling model for the Exynos-M1.
|
||||
|
||||
def M1WriteNEONA : SchedWriteRes<[M1UnitNALU,
|
||||
M1UnitNALU,
|
||||
M1UnitFADD]> { let Latency = 9; }
|
||||
def M1WriteNEONB : SchedWriteRes<[M1UnitNALU,
|
||||
M1UnitFST]> { let Latency = 5; }
|
||||
def M1WriteNEONC : SchedWriteRes<[M1UnitNALU,
|
||||
M1UnitFST]> { let Latency = 6; }
|
||||
def M1WriteNEOND : SchedWriteRes<[M1UnitNALU,
|
||||
M1UnitFST,
|
||||
M1UnitL]> { let Latency = 10; }
|
||||
def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT,
|
||||
M1UnitFST]> { let Latency = 8; }
|
||||
def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT,
|
||||
M1UnitFST,
|
||||
M1UnitL]> { let Latency = 13; }
|
||||
def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC,
|
||||
M1UnitFST]> { let Latency = 6; }
|
||||
def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
|
||||
M1UnitFST]> { let Latency = 3; }
|
||||
def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
|
||||
M1UnitL]> { let Latency = 9; }
|
||||
def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
|
||||
def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
|
||||
// FIXME: This is the worst case, conditional branch and link.
|
||||
def M1WriteBL : SchedWriteRes<[M1UnitB,
|
||||
M1UnitALU]> { let Latency = 1; }
|
||||
// FIXME: This is the worst case, when using LR.
|
||||
def M1WriteBLR : SchedWriteRes<[M1UnitB,
|
||||
M1UnitALU,
|
||||
M1UnitALU]> { let Latency = 2; }
|
||||
def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; }
|
||||
def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
|
||||
def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; }
|
||||
def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; }
|
||||
def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; }
|
||||
def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; }
|
||||
def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; }
|
||||
def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; }
|
||||
def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; }
|
||||
def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; }
|
||||
def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; }
|
||||
def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; }
|
||||
def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; }
|
||||
def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; }
|
||||
def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
|
||||
def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
|
||||
def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; }
|
||||
def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; }
|
||||
def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
|
||||
def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
|
||||
def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
|
||||
def M1WriteTB : SchedWriteRes<[M1UnitC,
|
||||
M1UnitALU]> { let Latency = 2; }
|
||||
|
||||
// Branch instructions
|
||||
def : InstRW<[M1WriteB ], (instrs Bcc)>;
|
||||
def : InstRW<[M1WriteBL], (instrs BL)>;
|
||||
def : InstRW<[M1WriteBLR], (instrs BLR)>;
|
||||
def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
|
||||
def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>;
|
||||
|
||||
// Arithmetic and logical integer instructions.
|
||||
def : InstRW<[M1WriteALU1], (instrs COPY)>;
|
||||
|
||||
// Divide and multiply instructions.
|
||||
|
||||
// Miscellaneous instructions.
|
||||
|
||||
// Load instructions.
|
||||
|
||||
// Store instructions.
|
||||
|
||||
// FP data instructions.
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>;
|
||||
def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>;
|
||||
def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>;
|
||||
def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
|
||||
def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
|
||||
def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
|
||||
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
|
||||
def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>;
|
||||
def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
|
||||
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>;
|
||||
def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
|
||||
def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
|
||||
def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
|
||||
|
||||
// FP miscellaneous instructions.
|
||||
def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
|
||||
def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
|
||||
def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
|
||||
def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>;
|
||||
def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
|
||||
|
||||
// FP load instructions.
|
||||
|
||||
// FP store instructions.
|
||||
|
||||
// ASIMD instructions.
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
|
||||
def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
|
||||
def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
|
||||
def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
|
||||
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
|
||||
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
|
||||
def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>;
|
||||
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
|
||||
def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>;
|
||||
|
||||
// ASIMD FP instructions.
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;
|
||||
def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
|
||||
def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
|
||||
def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
|
||||
def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
|
||||
def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
|
||||
def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
|
||||
def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
|
||||
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
|
||||
def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>;
|
||||
def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>;
|
||||
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
|
||||
|
||||
// ASIMD miscellaneous instructions.
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>;
|
||||
def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^CPY")>;
|
||||
def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>;
|
||||
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>;
|
||||
def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>;
|
||||
def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>;
|
||||
def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
|
||||
def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>;
|
||||
def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
|
||||
(instregex "^TB[LX]v8i8Two")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
|
||||
(instregex "^TB[LX]v8i8Three")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
|
||||
(instregex "^TB[LX]v8i8Four")>;
|
||||
def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
|
||||
(instregex "^TB[LX]v16i8Two")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
|
||||
(instregex "^TB[LX]v16i8Three")>;
|
||||
def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
|
||||
(instregex "^TB[LX]v16i8Four")>;
|
||||
def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
|
||||
def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
|
||||
def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>;
|
||||
|
||||
// ASIMD load instructions.
|
||||
|
||||
// ASIMD store instructions.
|
||||
|
||||
// Cryptography instructions.
|
||||
def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
|
||||
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
|
||||
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
|
||||
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
|
||||
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
|
||||
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;
|
||||
|
||||
// CRC instructions.
|
||||
def : InstRW<[M1WriteC2], (instregex "^CRC32")>;
|
||||
|
||||
} // SchedModel = ExynosM1Model
|
@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>;
|
||||
def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>;
|
||||
def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>;
|
||||
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
|
||||
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
|
||||
|
||||
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
|
||||
"localmemorysize"#Value,
|
||||
@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
|
||||
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
|
||||
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
|
||||
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
|
||||
FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
|
||||
FeatureGCN3Encoding, FeatureCIInsts]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
@ -53,7 +53,8 @@ public:
|
||||
ISAVersion7_0_0,
|
||||
ISAVersion7_0_1,
|
||||
ISAVersion8_0_0,
|
||||
ISAVersion8_0_1
|
||||
ISAVersion8_0_1,
|
||||
ISAVersion8_0_3
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -128,21 +128,23 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel,
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
|
||||
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
|
||||
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
|
||||
FeatureLDSBankCount32]
|
||||
>;
|
||||
|
||||
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
|
||||
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
|
||||
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
|
||||
FeatureLDSBankCount32]
|
||||
>;
|
||||
|
||||
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
|
||||
>;
|
||||
|
||||
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32]
|
||||
>;
|
||||
|
||||
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
|
||||
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
|
||||
>;
|
||||
|
@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
|
||||
bool IsLoad = TII->get(LoadStoreOp).mayLoad();
|
||||
|
||||
bool RanOutOfSGPRs = false;
|
||||
bool Scavenged = false;
|
||||
unsigned SOffset = ScratchOffset;
|
||||
|
||||
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
|
||||
@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
|
||||
if (SOffset == AMDGPU::NoRegister) {
|
||||
RanOutOfSGPRs = true;
|
||||
SOffset = AMDGPU::SGPR0;
|
||||
} else {
|
||||
Scavenged = true;
|
||||
}
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
|
||||
.addReg(ScratchOffset)
|
||||
@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
|
||||
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
|
||||
Value;
|
||||
|
||||
unsigned SOffsetRegState = 0;
|
||||
if (i + 1 == e && Scavenged)
|
||||
SOffsetRegState |= RegState::Kill;
|
||||
|
||||
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
|
||||
.addReg(SubReg, getDefRegState(IsLoad))
|
||||
.addReg(ScratchRsrcReg)
|
||||
.addReg(SOffset)
|
||||
.addReg(SOffset, SOffsetRegState)
|
||||
.addImm(Offset)
|
||||
.addImm(0) // glc
|
||||
.addImm(0) // slc
|
||||
|
@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
|
||||
if (Features.test(FeatureISAVersion8_0_1))
|
||||
return {8, 0, 1};
|
||||
|
||||
if (Features.test(FeatureISAVersion8_0_3))
|
||||
return {8, 0, 3};
|
||||
|
||||
return {0, 0, 0};
|
||||
}
|
||||
|
||||
|
@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
|
||||
|
||||
// If Offset is a multiply-by-constant and it's profitable to extract a shift
|
||||
// and use it in a shifted operand do so.
|
||||
if (Offset.getOpcode() == ISD::MUL) {
|
||||
if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) {
|
||||
unsigned PowerOfTwo = 0;
|
||||
SDValue NewMulConst;
|
||||
if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
|
||||
@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
|
||||
|
||||
// If OffReg is a multiply-by-constant and it's profitable to extract a shift
|
||||
// and use it in a shifted operand do so.
|
||||
if (OffReg.getOpcode() == ISD::MUL) {
|
||||
if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) {
|
||||
unsigned PowerOfTwo = 0;
|
||||
SDValue NewMulConst;
|
||||
if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {
|
||||
|
@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
|
||||
// extension rather than sign extension. Make sure we pass the return
|
||||
// value extension property to integer materialization.
|
||||
unsigned SrcReg =
|
||||
PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt);
|
||||
PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt);
|
||||
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
|
||||
TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
|
||||
@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
|
||||
|
||||
const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
|
||||
&PPC::GPRCRegClass);
|
||||
int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue();
|
||||
|
||||
// If the constant is in range, use a load-immediate.
|
||||
if (UseSExt && isInt<16>(CI->getSExtValue())) {
|
||||
// Since LI will sign extend the constant we need to make sure that for
|
||||
// our zeroext constants that the sign extended constant fits into 16-bits -
|
||||
// a range of 0..0x7fff.
|
||||
if (isInt<16>(Imm)) {
|
||||
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
|
||||
unsigned ImmReg = createResultReg(RC);
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
|
||||
.addImm(CI->getSExtValue());
|
||||
return ImmReg;
|
||||
} else if (!UseSExt && isUInt<16>(CI->getZExtValue())) {
|
||||
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
|
||||
unsigned ImmReg = createResultReg(RC);
|
||||
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
|
||||
.addImm(CI->getZExtValue());
|
||||
.addImm(Imm);
|
||||
return ImmReg;
|
||||
}
|
||||
|
||||
// Construct the constant piecewise.
|
||||
int64_t Imm = CI->getZExtValue();
|
||||
|
||||
if (VT == MVT::i64)
|
||||
return PPCMaterialize64BitInt(Imm, RC);
|
||||
else if (VT == MVT::i32)
|
||||
|
@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
|
||||
def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
|
||||
v16i8, v8i16>;
|
||||
def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
|
||||
v16i8, v4i32>;
|
||||
v8i16, v4i32>;
|
||||
def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
|
||||
v8i16, v4i32>;
|
||||
def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
|
||||
|
@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
|
||||
if (CCMask == SystemZ::CCMASK_CMP_NE)
|
||||
return SystemZ::CCMASK_TM_SOME_1;
|
||||
}
|
||||
if (EffectivelyUnsigned && CmpVal <= Low) {
|
||||
if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
|
||||
if (CCMask == SystemZ::CCMASK_CMP_LT)
|
||||
return SystemZ::CCMASK_TM_ALL_0;
|
||||
if (CCMask == SystemZ::CCMASK_CMP_GE)
|
||||
|
@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
|
||||
|
||||
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
|
||||
setOperationAction(ISD::SETCC, MVT::i1, Custom);
|
||||
setOperationAction(ISD::SETCCE, MVT::i1, Custom);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
|
||||
setOperationAction(ISD::XOR, MVT::i1, Legal);
|
||||
setOperationAction(ISD::OR, MVT::i1, Legal);
|
||||
@ -14975,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
|
||||
assert(Carry.getOpcode() != ISD::CARRY_FALSE);
|
||||
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
|
||||
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
|
||||
return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
|
||||
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
|
||||
SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
|
||||
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
|
||||
if (Op.getSimpleValueType() == MVT::i1)
|
||||
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
|
||||
return SetCC;
|
||||
}
|
||||
|
||||
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
|
||||
@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG, SDLoc dl) {
|
||||
|
||||
if (isAllOnesConstant(Mask))
|
||||
return DAG.getTargetConstant(1, dl, MaskVT);
|
||||
if (X86::isZeroNode(Mask))
|
||||
return DAG.getTargetConstant(0, dl, MaskVT);
|
||||
|
||||
if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
|
||||
// Mask should be extended
|
||||
Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
|
||||
@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
|
||||
MVT MaskVT = MVT::getVectorVT(MVT::i1,
|
||||
Index.getSimpleValueType().getVectorNumElements());
|
||||
SDValue MaskInReg;
|
||||
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
|
||||
if (MaskC)
|
||||
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
|
||||
else {
|
||||
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
|
||||
Mask.getSimpleValueType().getSizeInBits());
|
||||
|
||||
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
|
||||
// are extracted by EXTRACT_SUBVECTOR.
|
||||
MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
|
||||
DAG.getBitcast(BitcastVT, Mask),
|
||||
DAG.getIntPtrConstant(0, dl));
|
||||
}
|
||||
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
|
||||
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
|
||||
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
|
||||
SDValue Segment = DAG.getRegister(0, MVT::i32);
|
||||
if (Src.getOpcode() == ISD::UNDEF)
|
||||
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
|
||||
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
|
||||
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
|
||||
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
|
||||
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
|
||||
return DAG.getMergeValues(RetOps, dl);
|
||||
@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
|
||||
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
SDValue Src, SDValue Mask, SDValue Base,
|
||||
SDValue Index, SDValue ScaleOp, SDValue Chain) {
|
||||
SDValue Index, SDValue ScaleOp, SDValue Chain,
|
||||
const X86Subtarget &Subtarget) {
|
||||
SDLoc dl(Op);
|
||||
auto *C = cast<ConstantSDNode>(ScaleOp);
|
||||
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
|
||||
@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
SDValue Segment = DAG.getRegister(0, MVT::i32);
|
||||
MVT MaskVT = MVT::getVectorVT(MVT::i1,
|
||||
Index.getSimpleValueType().getVectorNumElements());
|
||||
SDValue MaskInReg;
|
||||
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
|
||||
if (MaskC)
|
||||
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
|
||||
else {
|
||||
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
|
||||
Mask.getSimpleValueType().getSizeInBits());
|
||||
|
||||
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
|
||||
// are extracted by EXTRACT_SUBVECTOR.
|
||||
MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
|
||||
DAG.getBitcast(BitcastVT, Mask),
|
||||
DAG.getIntPtrConstant(0, dl));
|
||||
}
|
||||
SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
|
||||
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
|
||||
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
|
||||
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
|
||||
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
|
||||
return SDValue(Res, 1);
|
||||
}
|
||||
|
||||
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
SDValue Mask, SDValue Base, SDValue Index,
|
||||
SDValue ScaleOp, SDValue Chain) {
|
||||
SDValue ScaleOp, SDValue Chain,
|
||||
const X86Subtarget &Subtarget) {
|
||||
SDLoc dl(Op);
|
||||
auto *C = cast<ConstantSDNode>(ScaleOp);
|
||||
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
|
||||
@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
|
||||
SDValue Segment = DAG.getRegister(0, MVT::i32);
|
||||
MVT MaskVT =
|
||||
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
|
||||
SDValue MaskInReg;
|
||||
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
|
||||
if (MaskC)
|
||||
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
|
||||
else
|
||||
MaskInReg = DAG.getBitcast(MaskVT, Mask);
|
||||
SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
|
||||
//SDVTList VTs = DAG.getVTList(MVT::Other);
|
||||
SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
|
||||
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
|
||||
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
|
||||
return SDValue(Res, 0);
|
||||
}
|
||||
@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SDValue Src = Op.getOperand(5);
|
||||
SDValue Scale = Op.getOperand(6);
|
||||
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
|
||||
Scale, Chain);
|
||||
Scale, Chain, *Subtarget);
|
||||
}
|
||||
case PREFETCH: {
|
||||
SDValue Hint = Op.getOperand(6);
|
||||
@ -17521,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SDValue Index = Op.getOperand(3);
|
||||
SDValue Base = Op.getOperand(4);
|
||||
SDValue Scale = Op.getOperand(5);
|
||||
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
|
||||
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
|
||||
*Subtarget);
|
||||
}
|
||||
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
|
||||
case RDTSC: {
|
||||
|
@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
|
||||
BO1->getOperand(0));
|
||||
}
|
||||
|
||||
if (CI->isMaxValue(true)) {
|
||||
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
|
||||
ICmpInst::Predicate Pred = I.isSigned()
|
||||
? I.getUnsignedPredicate()
|
||||
: I.getSignedPredicate();
|
||||
|
@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
|
||||
ConstantInt::get(IdxType, i),
|
||||
};
|
||||
auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
|
||||
auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName);
|
||||
auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
|
||||
LoadName);
|
||||
V = IC.Builder->CreateInsertValue(V, L, i);
|
||||
}
|
||||
|
||||
|
@ -380,6 +380,23 @@ static void replaceExtractElements(InsertElementInst *InsElt,
|
||||
ExtendMask.push_back(UndefValue::get(IntType));
|
||||
|
||||
Value *ExtVecOp = ExtElt->getVectorOperand();
|
||||
auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
|
||||
BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
|
||||
? ExtVecOpInst->getParent()
|
||||
: ExtElt->getParent();
|
||||
|
||||
// TODO: This restriction matches the basic block check below when creating
|
||||
// new extractelement instructions. If that limitation is removed, this one
|
||||
// could also be removed. But for now, we just bail out to ensure that we
|
||||
// will replace the extractelement instruction that is feeding our
|
||||
// insertelement instruction. This allows the insertelement to then be
|
||||
// replaced by a shufflevector. If the insertelement is not replaced, we can
|
||||
// induce infinite looping because there's an optimization for extractelement
|
||||
// that will delete our widening shuffle. This would trigger another attempt
|
||||
// here to create that shuffle, and we spin forever.
|
||||
if (InsertionBlock != InsElt->getParent())
|
||||
return;
|
||||
|
||||
auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
|
||||
ConstantVector::get(ExtendMask));
|
||||
|
||||
@ -387,7 +404,6 @@ static void replaceExtractElements(InsertElementInst *InsElt,
|
||||
// (as long as it's not a PHI) or at the start of the basic block of the
|
||||
// extract, so any subsequent extracts in the same basic block can use it.
|
||||
// TODO: Insert before the earliest ExtractElementInst that is replaced.
|
||||
auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
|
||||
if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
|
||||
WideVec->insertAfter(ExtVecOpInst);
|
||||
else
|
||||
|
@ -90,6 +90,11 @@ static cl::opt<bool> SpeculateOneExpensiveInst(
|
||||
cl::desc("Allow exactly one expensive instruction to be speculatively "
|
||||
"executed"));
|
||||
|
||||
static cl::opt<unsigned> MaxSpeculationDepth(
|
||||
"max-speculation-depth", cl::Hidden, cl::init(10),
|
||||
cl::desc("Limit maximum recursion depth when calculating costs of "
|
||||
"speculatively executed instructions"));
|
||||
|
||||
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
|
||||
STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
|
||||
STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
|
||||
@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
|
||||
unsigned &CostRemaining,
|
||||
const TargetTransformInfo &TTI,
|
||||
unsigned Depth = 0) {
|
||||
// It is possible to hit a zero-cost cycle (phi/gep instructions for example),
|
||||
// so limit the recursion depth.
|
||||
// TODO: While this recursion limit does prevent pathological behavior, it
|
||||
// would be better to track visited instructions to avoid cycles.
|
||||
if (Depth == MaxSpeculationDepth)
|
||||
return false;
|
||||
|
||||
Instruction *I = dyn_cast<Instruction>(V);
|
||||
if (!I) {
|
||||
// Non-instructions all dominate instructions, but not all constantexprs
|
||||
|
@ -10,34 +10,3 @@ define i8 @test_mul(i32 %a, i32 %b) {
|
||||
%3 = trunc i32 %2 to i8
|
||||
ret i8 %3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'test_icmp1'
|
||||
; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2
|
||||
; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255
|
||||
; CHECK-DAG: DemandedBits: 0xFFF for %2 = shl i32 %1, 4
|
||||
define i1 @test_icmp1(i32 %a, i32 %b) {
|
||||
%1 = and i32 %a, 255
|
||||
%2 = shl i32 %1, 4
|
||||
%3 = icmp eq i32 %1, %2
|
||||
ret i1 %3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'test_icmp2'
|
||||
; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2
|
||||
; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255
|
||||
; CHECK-DAG: DemandedBits: 0xFF for %2 = ashr i32 %1, 4
|
||||
define i1 @test_icmp2(i32 %a, i32 %b) {
|
||||
%1 = and i32 %a, 255
|
||||
%2 = ashr i32 %1, 4
|
||||
%3 = icmp eq i32 %1, %2
|
||||
ret i1 %3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'test_icmp3'
|
||||
; CHECK-DAG: DemandedBits: 0xFFFFFFFF for %1 = and i32 %a, 255
|
||||
; CHECK-DAG: DemandedBits: 0x1 for %2 = icmp eq i32 -1, %1
|
||||
define i1 @test_icmp3(i32 %a) {
|
||||
%1 = and i32 %a, 255
|
||||
%2 = icmp eq i32 -1, %1
|
||||
ret i1 %2
|
||||
}
|
||||
|
@ -267,4 +267,278 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 {
|
||||
ret <4 x i16> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_une:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ne
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ne
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ne
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ne
|
||||
define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp une <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ueq:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, vs
|
||||
; CHECK-DAG: csel {{.*}}, vs
|
||||
; CHECK-DAG: csel {{.*}}, vs
|
||||
; CHECK-DAG: csel {{.*}}, vs
|
||||
define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ueq <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ugt:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, hi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, hi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, hi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, hi
|
||||
define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ugt <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_uge:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, pl
|
||||
; CHECK-DAG: csel {{.*}}, wzr, pl
|
||||
; CHECK-DAG: csel {{.*}}, wzr, pl
|
||||
; CHECK-DAG: csel {{.*}}, wzr, pl
|
||||
define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp uge <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ult:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, lt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, lt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, lt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, lt
|
||||
define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ult <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ule:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, le
|
||||
; CHECK-DAG: csel {{.*}}, wzr, le
|
||||
; CHECK-DAG: csel {{.*}}, wzr, le
|
||||
; CHECK-DAG: csel {{.*}}, wzr, le
|
||||
define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ule <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_uno:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vs
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vs
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vs
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vs
|
||||
define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp uno <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_one:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, gt
|
||||
; CHECK-DAG: csel {{.*}}, gt
|
||||
; CHECK-DAG: csel {{.*}}, gt
|
||||
; CHECK-DAG: csel {{.*}}, gt
|
||||
define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp one <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_oeq:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
; CHECK-DAG: csel {{.*}}, wzr, eq
|
||||
define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp oeq <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ogt:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, gt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, gt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, gt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, gt
|
||||
define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ogt <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_oge:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ge
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ge
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ge
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ge
|
||||
define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp oge <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_olt:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
; CHECK-DAG: csel {{.*}}, wzr, mi
|
||||
define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp olt <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ole:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ls
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ls
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ls
|
||||
; CHECK-DAG: csel {{.*}}, wzr, ls
|
||||
define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ole <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
; CHECK-LABEL: test_fcmp_ord:
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: fcvt
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vc
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vc
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vc
|
||||
; CHECK-DAG: csel {{.*}}, wzr, vc
|
||||
define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 {
|
||||
%1 = fcmp ord <4 x half> %a, %b
|
||||
ret <4 x i1> %1
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -421,4 +421,88 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 {
|
||||
ret <8 x i16> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp une <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ueq <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ugt <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp uge <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ult <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ule <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp uno <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp one <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp oeq <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ogt <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp oge <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp olt <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ole <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
|
||||
define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
|
||||
%1 = fcmp ord <8 x half> %a, %b
|
||||
ret <8 x i1> %1
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,6 +1,8 @@
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s
|
||||
|
||||
; HSA: .hsa_code_object_version 1,0
|
||||
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
|
||||
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
|
||||
; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
|
||||
|
@ -1,5 +1,6 @@
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
|
||||
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
|
||||
|
||||
;GCN-LABEL: {{^}}main:
|
||||
|
33
test/CodeGen/AMDGPU/spill-scavenge-offset.ll
Normal file
33
test/CodeGen/AMDGPU/spill-scavenge-offset.ll
Normal file
@ -0,0 +1,33 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
|
||||
|
||||
; When the offset of VGPR spills into scratch space gets too large, an additional SGPR
|
||||
; is used to calculate the scratch load/store address. Make sure that this
|
||||
; mechanism works even when many spills happen.
|
||||
|
||||
; Just test that it compiles successfully.
|
||||
; CHECK-LABEL: test
|
||||
define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in,
|
||||
<96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) {
|
||||
entry:
|
||||
%tid = call i32 @llvm.SI.tid() nounwind readnone
|
||||
|
||||
%aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
|
||||
%a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr
|
||||
|
||||
; mark most VGPR registers as used to increase register pressure
|
||||
call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" ()
|
||||
call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" ()
|
||||
call void asm sideeffect "", "~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" ()
|
||||
call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" ()
|
||||
call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" ()
|
||||
call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" ()
|
||||
call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" ()
|
||||
|
||||
%outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
|
||||
store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.SI.tid() nounwind readnone
|
@ -239,3 +239,20 @@ define void @test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) {
|
||||
store i32 %add, i32* %addr
|
||||
ret void
|
||||
}
|
||||
|
||||
define { i32, i32 } @test_multi_use_add(i32 %base, i32 %offset) {
|
||||
; CHECK-LABEL: test_multi_use_add:
|
||||
; CHECK-THUMB: movs [[CONST:r[0-9]+]], #28
|
||||
; CHECK-THUMB: movt [[CONST]], #1
|
||||
|
||||
%prod = mul i32 %offset, 65564
|
||||
%sum = add i32 %base, %prod
|
||||
|
||||
%ptr = inttoptr i32 %sum to i32*
|
||||
%loaded = load i32, i32* %ptr
|
||||
|
||||
%ret.tmp = insertvalue { i32, i32 } undef, i32 %sum, 0
|
||||
%ret = insertvalue { i32, i32 } %ret.tmp, i32 %loaded, 1
|
||||
|
||||
ret { i32, i32 } %ret
|
||||
}
|
||||
|
@ -186,3 +186,12 @@ entry:
|
||||
; ELF64: blr
|
||||
ret i32 -1
|
||||
}
|
||||
|
||||
define zeroext i16 @ret20() nounwind {
|
||||
entry:
|
||||
; ELF64-LABEL: ret20
|
||||
; ELF64: lis{{.*}}0
|
||||
; ELF64: ori{{.*}}32768
|
||||
; ELF64: blr
|
||||
ret i16 32768
|
||||
}
|
||||
|
10
test/CodeGen/PowerPC/inline-asm-s-modifier.ll
Normal file
10
test/CodeGen/PowerPC/inline-asm-s-modifier.ll
Normal file
@ -0,0 +1,10 @@
|
||||
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
|
||||
define void @test() {
|
||||
entry:
|
||||
call void asm sideeffect "mtfsb1 ${0:s}", "i"(i32 7), !srcloc !1
|
||||
ret void
|
||||
}
|
||||
; CHECK: #APP
|
||||
; CHECK-NEXT: mtfsb1 25
|
||||
|
||||
!1 = !{i32 40}
|
9
test/CodeGen/PowerPC/pr26193.ll
Normal file
9
test/CodeGen/PowerPC/pr26193.ll
Normal file
@ -0,0 +1,9 @@
|
||||
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
|
||||
define <8 x i16> @test(<4 x i32> %a) {
|
||||
entry:
|
||||
%0 = tail call <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32> %a, <4 x i32> %a)
|
||||
ret <8 x i16> %0
|
||||
}
|
||||
; CHECK: vpkswss 2,
|
||||
|
||||
declare <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32>, <4 x i32>)
|
136
test/CodeGen/PowerPC/pr26356.ll
Normal file
136
test/CodeGen/PowerPC/pr26356.ll
Normal file
@ -0,0 +1,136 @@
|
||||
; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
|
||||
|
||||
define zeroext i32 @f1() {
|
||||
entry:
|
||||
ret i32 65535
|
||||
}
|
||||
; CHECK-LABEL: @f1
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 65535
|
||||
|
||||
define zeroext i32 @f2() {
|
||||
entry:
|
||||
ret i32 32768
|
||||
}
|
||||
; CHECK-LABEL: @f2
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 32768
|
||||
|
||||
define zeroext i32 @f3() {
|
||||
entry:
|
||||
ret i32 32767
|
||||
}
|
||||
; CHECK-LABEL: @f3
|
||||
; CHECK: li 3, 32767
|
||||
|
||||
define zeroext i16 @f4() {
|
||||
entry:
|
||||
ret i16 65535
|
||||
}
|
||||
; CHECK-LABEL: @f4
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 65535
|
||||
|
||||
define zeroext i16 @f5() {
|
||||
entry:
|
||||
ret i16 32768
|
||||
}
|
||||
; CHECK-LABEL: @f5
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 32768
|
||||
|
||||
define zeroext i16 @f6() {
|
||||
entry:
|
||||
ret i16 32767
|
||||
}
|
||||
; CHECK-LABEL: @f6
|
||||
; CHECK: li 3, 32767
|
||||
|
||||
define zeroext i16 @f7() {
|
||||
entry:
|
||||
ret i16 -1
|
||||
}
|
||||
; CHECK-LABEL: @f7
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 65535
|
||||
|
||||
define zeroext i16 @f8() {
|
||||
entry:
|
||||
ret i16 -32768
|
||||
}
|
||||
; CHECK-LABEL: @f8
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 32768
|
||||
|
||||
define signext i32 @f1s() {
|
||||
entry:
|
||||
ret i32 65535
|
||||
}
|
||||
; CHECK-LABEL: @f1s
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 65535
|
||||
|
||||
define signext i32 @f2s() {
|
||||
entry:
|
||||
ret i32 32768
|
||||
}
|
||||
; CHECK-LABEL: @f2s
|
||||
; CHECK: lis 3, 0
|
||||
; CHECK: ori 3, 3, 32768
|
||||
|
||||
define signext i32 @f3s() {
|
||||
entry:
|
||||
ret i32 32767
|
||||
}
|
||||
; CHECK-LABEL: @f3s
|
||||
; CHECK: li 3, 32767
|
||||
|
||||
define signext i16 @f4s() {
|
||||
entry:
|
||||
ret i16 32767
|
||||
}
|
||||
; CHECK-LABEL: @f4s
|
||||
; CHECK: li 3, 32767
|
||||
|
||||
define signext i32 @f1sn() {
|
||||
entry:
|
||||
ret i32 -65535
|
||||
}
|
||||
; CHECK-LABEL: @f1sn
|
||||
; CHECK: lis 3, -1
|
||||
; CHECK: ori 3, 3, 1
|
||||
|
||||
define signext i32 @f2sn() {
|
||||
entry:
|
||||
ret i32 -32768
|
||||
}
|
||||
; CHECK-LABEL: @f2sn
|
||||
; CHECK: li 3, -32768
|
||||
|
||||
define signext i32 @f3sn() {
|
||||
entry:
|
||||
ret i32 -32767
|
||||
}
|
||||
; CHECK-LABEL: @f3sn
|
||||
; CHECK: li 3, -32767
|
||||
|
||||
define signext i32 @f4sn() {
|
||||
entry:
|
||||
ret i32 -65536
|
||||
}
|
||||
; CHECK-LABEL: @f4sn
|
||||
; CHECK: lis 3, -1
|
||||
|
||||
define signext i16 @f5sn() {
|
||||
entry:
|
||||
ret i16 -32767
|
||||
}
|
||||
; CHECK-LABEL: @f5sn
|
||||
; CHECK: li 3, -32767
|
||||
|
||||
define signext i16 @f6sn() {
|
||||
entry:
|
||||
ret i16 -32768
|
||||
}
|
||||
; CHECK-LABEL: @f6sn
|
||||
; CHECK: li 3, -32768
|
8
test/CodeGen/PowerPC/pr26381.ll
Normal file
8
test/CodeGen/PowerPC/pr26381.ll
Normal file
@ -0,0 +1,8 @@
|
||||
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s
|
||||
|
||||
define internal signext i32 @foo() #0 {
|
||||
ret i32 -125452974
|
||||
}
|
||||
|
||||
; CHECK: lis 3, -1915
|
||||
; CHECK: ori 3, 3, 48466
|
26
test/CodeGen/SystemZ/int-cmp-53.ll
Normal file
26
test/CodeGen/SystemZ/int-cmp-53.ll
Normal file
@ -0,0 +1,26 @@
|
||||
; This used to incorrectly use a TMLL for an always-false test at -O0.
|
||||
;
|
||||
; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu | FileCheck %s
|
||||
|
||||
define void @test(i8 *%input, i32 *%result) {
|
||||
entry:
|
||||
; CHECK-NOT: tmll
|
||||
|
||||
%0 = load i8, i8* %input, align 1
|
||||
%1 = trunc i8 %0 to i1
|
||||
%2 = zext i1 %1 to i32
|
||||
%3 = icmp sge i32 %2, 0
|
||||
br i1 %3, label %if.then, label %if.else
|
||||
|
||||
if.then:
|
||||
store i32 1, i32* %result, align 4
|
||||
br label %return
|
||||
|
||||
if.else:
|
||||
store i32 0, i32* %result, align 4
|
||||
br label %return
|
||||
|
||||
return:
|
||||
ret void
|
||||
}
|
||||
|
@ -259,18 +259,22 @@ define void @prefetch(<8 x i64> %ind, i8* %base) {
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
|
||||
; CHECK-NEXT: kxorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
|
||||
; CHECK-NEXT: movb $1, %al
|
||||
; CHECK-NEXT: kmovb %eax, %k1
|
||||
; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
|
||||
; CHECK-NEXT: movb $120, %al
|
||||
; CHECK-NEXT: kmovb %eax, %k1
|
||||
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
|
||||
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
|
||||
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
|
||||
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
|
||||
call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
|
||||
call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
|
||||
call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
|
||||
|
||||
define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
|
||||
@ -790,3 +794,54 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
|
||||
; CHECK-LABEL: scatter_mask_test:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: movb $1, %al
|
||||
; CHECK-NEXT: kmovb %eax, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: movb $96, %al
|
||||
; CHECK-NEXT: kmovb %eax, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: retq
|
||||
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
|
||||
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
|
||||
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
|
||||
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
|
||||
ret void
|
||||
}
|
||||
|
||||
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
|
||||
; CHECK-LABEL: gather_mask_test:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vmovaps %zmm1, %zmm2
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
|
||||
; CHECK-NEXT: kxorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: vmovaps %zmm1, %zmm3
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
|
||||
; CHECK-NEXT: movw $1, %ax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovaps %zmm1, %zmm4
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
|
||||
; CHECK-NEXT: movw $220, %ax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
|
||||
; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
|
||||
%res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
|
||||
|
||||
%res4 = fadd <16 x float> %res, %res1
|
||||
%res5 = fadd <16 x float> %res3, %res2
|
||||
%res6 = fadd <16 x float> %res5, %res4
|
||||
ret <16 x float> %res6
|
||||
}
|
||||
|
@ -1,26 +1,28 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
|
||||
; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32
|
||||
|
||||
|
||||
; Verify that we don't crash during codegen due to a wrong lowering
|
||||
; of a setcc node with illegal operand types and return type.
|
||||
|
||||
define <8 x i16> @pr25080(<8 x i32> %a) {
|
||||
; CHECK-LABEL: pr25080:
|
||||
; CHECK: # BB#0: # %entry
|
||||
; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
|
||||
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
||||
; CHECK-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; CHECK-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
; AVX-LABEL: pr25080:
|
||||
; AVX: # BB#0: # %entry
|
||||
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
|
||||
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
|
||||
; AVX-NEXT: vzeroupper
|
||||
; AVX-NEXT: retq
|
||||
entry:
|
||||
%0 = trunc <8 x i32> %a to <8 x i23>
|
||||
%1 = icmp eq <8 x i23> %0, zeroinitializer
|
||||
@ -28,3 +30,46 @@ entry:
|
||||
%3 = sext <8 x i1> %2 to <8 x i16>
|
||||
ret <8 x i16> %3
|
||||
}
|
||||
|
||||
define void @pr26232(i64 %a) {
|
||||
; KNL-32-LABEL: pr26232:
|
||||
; KNL-32: # BB#0: # %for_test11.preheader
|
||||
; KNL-32-NEXT: pushl %esi
|
||||
; KNL-32-NEXT: .Ltmp0:
|
||||
; KNL-32-NEXT: .cfi_def_cfa_offset 8
|
||||
; KNL-32-NEXT: .Ltmp1:
|
||||
; KNL-32-NEXT: .cfi_offset %esi, -8
|
||||
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; KNL-32-NEXT: movw $-1, %dx
|
||||
; KNL-32-NEXT: .align 16, 0x90
|
||||
; KNL-32-NEXT: .LBB1_1: # %for_loop599
|
||||
; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
|
||||
; KNL-32-NEXT: movl %eax, %esi
|
||||
; KNL-32-NEXT: sbbl $0, %esi
|
||||
; KNL-32-NEXT: movl $0, %esi
|
||||
; KNL-32-NEXT: cmovlw %dx, %si
|
||||
; KNL-32-NEXT: testw %si, %si
|
||||
; KNL-32-NEXT: jne .LBB1_1
|
||||
; KNL-32-NEXT: # BB#2: # %for_exit600
|
||||
; KNL-32-NEXT: popl %esi
|
||||
; KNL-32-NEXT: retl
|
||||
allocas:
|
||||
br label %for_test11.preheader
|
||||
|
||||
for_test11.preheader: ; preds = %for_test11.preheader, %allocas
|
||||
br i1 undef, label %for_loop599, label %for_test11.preheader
|
||||
|
||||
for_loop599: ; preds = %for_loop599, %for_test11.preheader
|
||||
%less_i_load605_ = icmp slt i64 %a, 65536
|
||||
%less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
|
||||
%less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
|
||||
%"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef
|
||||
%intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
|
||||
%res.i895 = icmp eq i16 %intmask.i894, 0
|
||||
br i1 %res.i895, label %for_exit600, label %for_loop599
|
||||
|
||||
for_exit600: ; preds = %for_loop599
|
||||
ret void
|
||||
}
|
||||
|
102
test/DebugInfo/X86/PR26148.ll
Normal file
102
test/DebugInfo/X86/PR26148.ll
Normal file
@ -0,0 +1,102 @@
|
||||
; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
|
||||
;
|
||||
; Created using clang -g -O3 from:
|
||||
; struct S0 {
|
||||
; short f0;
|
||||
; int f3;
|
||||
; } a;
|
||||
; void fn1(short p1) {
|
||||
; struct S0 b, c = {3};
|
||||
; b.f3 = p1;
|
||||
; a = b = c;
|
||||
; }
|
||||
;
|
||||
; int main() { return 0; }
|
||||
;
|
||||
; This is similar to the bug in test/DebugInfo/ARM/PR26163.ll, except that there is an
|
||||
; extra non-overlapping range first. Thus, we make sure that the backend actually looks
|
||||
; at all expressions when determining whether to merge ranges, not just the first one.
|
||||
; AS in 26163, we expect two ranges (as opposed to one), the first one being zero sized
|
||||
;
|
||||
;
|
||||
; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
|
||||
; CHECK: Ending address offset: 0x0000000000000004
|
||||
; CHECK: Location description: 10 03 55 93 04
|
||||
; CHECK: Beginning address offset: 0x0000000000000004
|
||||
; CHECK: Ending address offset: 0x0000000000000014
|
||||
; CHECK: Location description: 10 03 10 00
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.11.0"
|
||||
|
||||
%struct.S0 = type { i16, i32 }
|
||||
|
||||
@a = common global %struct.S0 zeroinitializer, align 4
|
||||
|
||||
declare void @llvm.dbg.declare(metadata, metadata, metadata)
|
||||
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
|
||||
|
||||
; The attributes are here to force the zero-sized range not to be at the start of
|
||||
; the function, which has special interpretation in DWARF. The fact that this happens
|
||||
; at all is probably an LLVM bug.
|
||||
attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
|
||||
define void @fn1(i16 signext %p1) #0 !dbg !4 {
|
||||
entry:
|
||||
tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !9, metadata !26), !dbg !27
|
||||
tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !10, metadata !26), !dbg !28
|
||||
tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !16, metadata !26), !dbg !29
|
||||
tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !16, metadata !30), !dbg !29
|
||||
tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29
|
||||
tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !10, metadata !32), !dbg !28
|
||||
tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !30), !dbg !28
|
||||
tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !31), !dbg !28
|
||||
store i32 3, i32* bitcast (%struct.S0* @a to i32*), align 4, !dbg !33
|
||||
store i32 0, i32* getelementptr inbounds (%struct.S0, %struct.S0* @a, i64 0, i32 1), align 4, !dbg !33
|
||||
ret void, !dbg !34
|
||||
}
|
||||
|
||||
define i32 @main() !dbg !17 {
|
||||
entry:
|
||||
ret i32 0, !dbg !35
|
||||
}
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!22, !23, !24}
|
||||
!llvm.ident = !{!25}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3, globals: !20)
|
||||
!1 = !DIFile(filename: "small.c", directory: "/Users/kfischer/Projects/clangbug")
|
||||
!2 = !{}
|
||||
!3 = !{!4, !17}
|
||||
!4 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !5, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, variables: !8)
|
||||
!5 = !DISubroutineType(types: !6)
|
||||
!6 = !{null, !7}
|
||||
!7 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed)
|
||||
!8 = !{!9, !10, !16}
|
||||
!9 = !DILocalVariable(name: "p1", arg: 1, scope: !4, file: !1, line: 5, type: !7)
|
||||
!10 = !DILocalVariable(name: "b", scope: !4, file: !1, line: 6, type: !11)
|
||||
!11 = !DICompositeType(tag: DW_TAG_structure_type, name: "S0", file: !1, line: 1, size: 64, align: 32, elements: !12)
|
||||
!12 = !{!13, !14}
|
||||
!13 = !DIDerivedType(tag: DW_TAG_member, name: "f0", scope: !11, file: !1, line: 2, baseType: !7, size: 16, align: 16)
|
||||
!14 = !DIDerivedType(tag: DW_TAG_member, name: "f3", scope: !11, file: !1, line: 3, baseType: !15, size: 32, align: 32, offset: 32)
|
||||
!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
|
||||
!16 = !DILocalVariable(name: "c", scope: !4, file: !1, line: 6, type: !11)
|
||||
!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 11, type: !18, isLocal: false, isDefinition: true, scopeLine: 11, isOptimized: true, variables: !2)
|
||||
!18 = !DISubroutineType(types: !19)
|
||||
!19 = !{!15}
|
||||
!20 = !{!21}
|
||||
!21 = !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 4, type: !11, isLocal: false, isDefinition: true, variable: %struct.S0* @a)
|
||||
!22 = !{i32 2, !"Dwarf Version", i32 2}
|
||||
!23 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!24 = !{i32 1, !"PIC Level", i32 2}
|
||||
!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)"}
|
||||
!26 = !DIExpression()
|
||||
!27 = !DILocation(line: 5, column: 16, scope: !4)
|
||||
!28 = !DILocation(line: 6, column: 13, scope: !4)
|
||||
!29 = !DILocation(line: 6, column: 16, scope: !4)
|
||||
!30 = !DIExpression(DW_OP_bit_piece, 0, 32)
|
||||
!31 = !DIExpression(DW_OP_bit_piece, 32, 32)
|
||||
!32 = !DIExpression(DW_OP_bit_piece, 32, 16)
|
||||
!33 = !DILocation(line: 8, column: 9, scope: !4)
|
||||
!34 = !DILocation(line: 9, column: 1, scope: !4)
|
||||
!35 = !DILocation(line: 11, column: 14, scope: !17)
|
@ -1672,3 +1672,15 @@ define i1 @cmp_slt_rhs_inc(float %x, i32 %i) {
|
||||
%cmp = icmp slt i32 %conv, %inc
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @PR26407
|
||||
; CHECK-NEXT: %[[addx:.*]] = add i32 %x, 2147483647
|
||||
; CHECK-NEXT: %[[addy:.*]] = add i32 %y, 2147483647
|
||||
; CHECK-NEXT: %[[cmp:.*]] = icmp uge i32 %[[addx]], %[[addy]]
|
||||
; CHECK-NEXT: ret i1 %[[cmp]]
|
||||
define i1 @PR26407(i32 %x, i32 %y) {
|
||||
%addx = add i32 %x, 2147483647
|
||||
%addy = add i32 %y, 2147483647
|
||||
%cmp = icmp uge i32 %addx, %addy
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
@ -175,3 +175,33 @@ bb3:
|
||||
ret <4 x double> %tmp4
|
||||
}
|
||||
|
||||
; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354
|
||||
; Don't create a shufflevector if we know that we're not going to replace the insertelement.
|
||||
|
||||
define double @pr26354(<2 x double>* %tmp, i1 %B) {
|
||||
; CHECK-LABEL: @pr26354(
|
||||
; CHECK: %ld = load <2 x double>, <2 x double>* %tmp
|
||||
; CHECK-NEXT: %e1 = extractelement <2 x double> %ld, i32 0
|
||||
; CHECK-NEXT: br i1 %B, label %if, label %end
|
||||
; CHECK: if:
|
||||
; CHECK-NEXT: %e2 = extractelement <2 x double> %ld, i32 1
|
||||
; CHECK-NEXT: %i1 = insertelement <4 x double>
|
||||
; CHECK-NEXT: br label %end
|
||||
|
||||
entry:
|
||||
%ld = load <2 x double>, <2 x double>* %tmp
|
||||
%e1 = extractelement <2 x double> %ld, i32 0
|
||||
%e2 = extractelement <2 x double> %ld, i32 1
|
||||
br i1 %B, label %if, label %end
|
||||
|
||||
if:
|
||||
%i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3
|
||||
br label %end
|
||||
|
||||
end:
|
||||
%ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ]
|
||||
%e3 = extractelement <4 x double> %ph, i32 1
|
||||
%mu = fmul double %e1, %e3
|
||||
ret double %mu
|
||||
}
|
||||
|
||||
|
@ -136,3 +136,18 @@ define %B @structB(%B* %b.ptr) {
|
||||
%1 = load %B, %B* %b.ptr, align 8
|
||||
ret %B %1
|
||||
}
|
||||
|
||||
%struct.S = type <{ i8, %struct.T }>
|
||||
%struct.T = type { i32, i32 }
|
||||
|
||||
; Make sure that we do not increase alignment of packed struct element
|
||||
define i32 @packed_alignment(%struct.S* dereferenceable(9) %s) {
|
||||
; CHECK-LABEL: packed_alignment
|
||||
; CHECK-NEXT: %tv.elt1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1, i32 1
|
||||
; CHECK-NEXT: %tv.unpack2 = load i32, i32* %tv.elt1, align 1
|
||||
; CHECK-NEXT: ret i32 %tv.unpack2
|
||||
%t = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1
|
||||
%tv = load %struct.T, %struct.T* %t, align 1
|
||||
%v = extractvalue %struct.T %tv, 1
|
||||
ret i32 %v
|
||||
}
|
||||
|
@ -205,39 +205,5 @@ for.body: ; preds = %for.body, %for.body
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @add_g
|
||||
; CHECK: load <16 x i8>
|
||||
; CHECK: xor <16 x i8>
|
||||
; CHECK: icmp ult <16 x i8>
|
||||
; CHECK: select <16 x i1> {{.*}}, <16 x i8>
|
||||
; CHECK: store <16 x i8>
|
||||
define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 {
|
||||
%1 = icmp sgt i32 %len, 0
|
||||
br i1 %1, label %.lr.ph, label %._crit_edge
|
||||
|
||||
.lr.ph: ; preds = %0
|
||||
%2 = sext i8 %arg1 to i64
|
||||
br label %3
|
||||
|
||||
._crit_edge: ; preds = %3, %0
|
||||
ret void
|
||||
|
||||
; <label>:3 ; preds = %3, %.lr.ph
|
||||
%indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ]
|
||||
%x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
|
||||
%x5 = load i8, i8* %x4
|
||||
%x7 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
|
||||
%x8 = load i8, i8* %x7
|
||||
%x9 = zext i8 %x5 to i32
|
||||
%x10 = xor i32 %x9, 255
|
||||
%x11 = icmp ult i32 %x10, 24
|
||||
%x12 = select i1 %x11, i32 %x10, i32 24
|
||||
%x13 = trunc i32 %x12 to i8
|
||||
store i8 %x13, i8* %x4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||
%exitcond = icmp eq i32 %lftr.wideiv, %len
|
||||
br i1 %exitcond, label %._crit_edge, label %3
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1302,3 +1302,35 @@ l6:
|
||||
; CHECK: entry
|
||||
; CHECK-NEXT: switch
|
||||
}
|
||||
|
||||
; Speculation depth must be limited to avoid a zero-cost instruction cycle.
|
||||
|
||||
; CHECK-LABEL: @PR26308(
|
||||
; CHECK: cleanup4:
|
||||
; CHECK-NEXT: br label %cleanup4
|
||||
|
||||
define i32 @PR26308(i1 %B, i64 %load) {
|
||||
entry:
|
||||
br label %while.body
|
||||
|
||||
while.body:
|
||||
br label %cleanup
|
||||
|
||||
cleanup:
|
||||
%cleanup.dest.slot.0 = phi i1 [ false, %while.body ]
|
||||
br i1 %cleanup.dest.slot.0, label %for.cond, label %cleanup4
|
||||
|
||||
for.cond:
|
||||
%e.0 = phi i64* [ undef, %cleanup ], [ %incdec.ptr, %for.cond2 ]
|
||||
%pi = ptrtoint i64* %e.0 to i64
|
||||
%incdec.ptr = getelementptr inbounds i64, i64* %e.0, i64 1
|
||||
br label %for.cond2
|
||||
|
||||
for.cond2:
|
||||
%storemerge = phi i64 [ %pi, %for.cond ], [ %load, %for.cond2 ]
|
||||
br i1 %B, label %for.cond2, label %for.cond
|
||||
|
||||
cleanup4:
|
||||
br label %while.body
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@ if(NOT LLVM_USE_INTEL_JITEVENTS )
|
||||
set(LLVM_TOOL_LLVM_JITLISTENER_BUILD Off)
|
||||
endif()
|
||||
|
||||
if(CYGWIN)
|
||||
if(CYGWIN OR NOT LLVM_ENABLE_PIC)
|
||||
set(LLVM_TOOL_LTO_BUILD Off)
|
||||
set(LLVM_TOOL_LLVM_LTO_BUILD Off)
|
||||
endif()
|
||||
|
@ -155,9 +155,12 @@ while [ $# -gt 0 ]; do
|
||||
done
|
||||
|
||||
if [ "$use_autoconf" = "no" ]; then
|
||||
# See llvm.org/PR26146.
|
||||
echo Skipping test-suite when using CMake.
|
||||
do_test_suite="no"
|
||||
if [ "$do_test_suite" = "yes" ]; then
|
||||
# See llvm.org/PR26146.
|
||||
echo Skipping test-suite build when using CMake.
|
||||
echo It will still be exported.
|
||||
do_test_suite="export-only"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check required arguments.
|
||||
@ -202,9 +205,11 @@ if [ $do_libs = "yes" ]; then
|
||||
projects="$projects libunwind"
|
||||
fi
|
||||
fi
|
||||
if [ $do_test_suite = "yes" ]; then
|
||||
projects="$projects test-suite"
|
||||
fi
|
||||
case $do_test_suite in
|
||||
yes|export-only)
|
||||
projects="$projects test-suite"
|
||||
;;
|
||||
esac
|
||||
if [ $do_openmp = "yes" ]; then
|
||||
projects="$projects openmp"
|
||||
fi
|
||||
@ -277,9 +282,16 @@ function export_sources() {
|
||||
clang-tools-extra)
|
||||
projsrc=llvm.src/tools/clang/tools/extra
|
||||
;;
|
||||
compiler-rt|libcxx|libcxxabi|libunwind|openmp|test-suite)
|
||||
compiler-rt|libcxx|libcxxabi|libunwind|openmp)
|
||||
projsrc=llvm.src/projects/$proj
|
||||
;;
|
||||
test-suite)
|
||||
if [ $do_test_suite = 'yes' ]; then
|
||||
projsrc=llvm.src/projects/$proj
|
||||
else
|
||||
projsrc=$proj.src
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
echo "error: unknown project $proj"
|
||||
exit 1
|
||||
|
@ -32,10 +32,6 @@ if (NOT LLVM_ENABLE_THREADS)
|
||||
add_definitions( -DGTEST_HAS_PTHREAD=0 )
|
||||
endif()
|
||||
|
||||
set(LIBS
|
||||
LLVMSupport # Depends on llvm::raw_ostream
|
||||
)
|
||||
|
||||
find_library(PTHREAD_LIBRARY_PATH pthread)
|
||||
if (PTHREAD_LIBRARY_PATH)
|
||||
list(APPEND LIBS pthread)
|
||||
@ -46,6 +42,9 @@ add_llvm_library(gtest
|
||||
|
||||
LINK_LIBS
|
||||
${LIBS}
|
||||
|
||||
LINK_COMPONENTS
|
||||
Support # Depends on llvm::raw_ostream
|
||||
)
|
||||
|
||||
add_subdirectory(UnitTestMain)
|
||||
|
@ -3,5 +3,7 @@ add_llvm_library(gtest_main
|
||||
|
||||
LINK_LIBS
|
||||
gtest
|
||||
LLVMSupport # Depends on llvm::cl
|
||||
|
||||
LINK_COMPONENTS
|
||||
Support # Depends on llvm::cl
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user