Vendor import of llvm release_38 branch r260756:

https://llvm.org/svn/llvm-project/llvm/branches/release_38@260756
This commit is contained in:
dim 2016-02-13 14:57:10 +00:00
parent 44c4732640
commit 97a7b8a20a
51 changed files with 1610 additions and 228 deletions

View File

@ -468,20 +468,23 @@ function(llvm_add_library name)
endif()
endif()
# Add the explicit dependency information for this library.
#
# It would be nice to verify that we have the dependencies for this library
# name, but using get_property(... SET) doesn't suffice to determine if a
# property has been set to an empty value.
get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name})
if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_STATIC AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
set(llvm_libs LLVM)
if (DEFINED LLVM_LINK_COMPONENTS OR DEFINED ARG_LINK_COMPONENTS)
if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB)
set(llvm_libs LLVM)
else()
llvm_map_components_to_libnames(llvm_libs
${ARG_LINK_COMPONENTS}
${LLVM_LINK_COMPONENTS}
)
endif()
else()
llvm_map_components_to_libnames(llvm_libs
${ARG_LINK_COMPONENTS}
${LLVM_LINK_COMPONENTS}
)
# Components have not been defined explicitly in CMake, so add the
# dependency information for this library as defined by LLVMBuild.
#
# It would be nice to verify that we have the dependencies for this library
# name, but using get_property(... SET) doesn't suffice to determine if a
# property has been set to an empty value.
get_property(lib_deps GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_${name})
endif()
if(CMAKE_VERSION VERSION_LESS 2.8.12)
@ -882,14 +885,11 @@ function(add_unittest test_suite test_name)
set(LLVM_REQUIRES_RTTI OFF)
list(APPEND LLVM_LINK_COMPONENTS Support) # gtest needs it for raw_ostream
add_llvm_executable(${test_name} IGNORE_EXTERNALIZE_DEBUGINFO ${ARGN})
set(outdir ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR})
set_output_directory(${test_name} BINARY_DIR ${outdir} LIBRARY_DIR ${outdir})
target_link_libraries(${test_name}
gtest
gtest_main
LLVMSupport # gtest needs it for raw_ostream.
)
target_link_libraries(${test_name} gtest_main gtest)
add_dependencies(${test_suite} ${test_name})
get_target_property(test_suite_folder ${test_suite} FOLDER)

View File

@ -40,10 +40,19 @@ macro(llvm_config executable)
# done in case libLLVM does not contain all of the components
# the target requires.
#
# TODO strip LLVM_DYLIB_COMPONENTS out of link_components.
# Strip LLVM_DYLIB_COMPONENTS out of link_components.
# To do this, we need special handling for "all", since that
# may imply linking to libraries that are not included in
# libLLVM.
if (DEFINED link_components AND DEFINED LLVM_DYLIB_COMPONENTS)
if("${LLVM_DYLIB_COMPONENTS}" STREQUAL "all")
set(link_components "")
else()
list(REMOVE_ITEM link_components ${LLVM_DYLIB_COMPONENTS})
endif()
endif()
target_link_libraries(${executable} LLVM)
endif()

View File

@ -5,11 +5,6 @@ LLVM 3.8 Release Notes
.. contents::
:local:
.. warning::
These are in-progress notes for the upcoming LLVM 3.8 release. You may
prefer the `LLVM 3.7 Release Notes <http://llvm.org/releases/3.7.0/docs
/ReleaseNotes.html>`_.
Introduction
============
@ -26,11 +21,6 @@ have questions or comments, the `LLVM Developer's Mailing List
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
them.
Note that if you are reading this file from a Subversion checkout or the main
LLVM web page, this document applies to the *next* release, not the current
one. To see the release notes for a specific release, please see the `releases
page <http://llvm.org/releases/>`_.
Non-comprehensive list of changes in this release
=================================================
* With this release, the minimum Windows version required for running LLVM is
@ -79,6 +69,26 @@ Non-comprehensive list of changes in this release
* Support for dematerializing has been dropped.
* RegisterScheduler::setDefault was removed. Targets that used to call into the
command line parser to set the DAGScheduler, and that don't have enough
control with setSchedulingPreference, should look into overriding the
SubTargetHook "getDAGScheduler()".
* ``ilist_iterator<T>`` no longer has implicit conversions to and from ``T*``,
since ``ilist_iterator<T>`` may be pointing at the sentinel (which is usually
not of type ``T`` at all). To convert from an iterator ``I`` to a pointer,
use ``&*I``; to convert from a pointer ``P`` to an iterator, use
``P->getIterator()``. Alternatively, explicit conversions via
``static_cast<T>(U)`` are still available.
* ``ilist_node<T>::getNextNode()`` and ``ilist_node<T>::getPrevNode()`` now
fail at compile time when the node cannot access its parent list.
Previously, when the sentinel was an ``ilist_half_node<T>``, this API
could return the sentinel instead of ``nullptr``. Frustrated callers should
be updated to use ``iplist<T>::getNextNode(T*)`` instead. Alternatively, if
the node ``N`` is guaranteed not to be the last in the list, it is safe to
call ``&*++N->getIterator()`` directly.
.. NOTE
For small 1-3 sentence descriptions, just add an entry at the end of
this list. If your description won't fit comfortably in one bullet
@ -98,17 +108,97 @@ Non-comprehensive list of changes in this release
Makes programs 10x faster by doing Special New Thing.
Changes to the ARM Backend
--------------------------
During this release ...
Changes to the ARM Backends
---------------------------
During this release the AArch64 target has:
* Added support for more sanitizers (MSAN, TSAN) and made them compatible with
all VMA kernel configurations (currently tested on 39 and 42 bits).
* Gained initial LLD support in the new ELF back-end
* Extended the Load/Store optimiser and cleaned up some of the bad decisions
made earlier.
* Expanded LLDB support, including watchpoints, native building, Renderscript,
LLDB-server, debugging 32-bit applications.
* Added support for the ``Exynos M1`` chip.
During this release the ARM target has:
* Gained massive performance improvements on embedded benchmarks due to finally
running the stride vectorizer in full form, incrementing the performance gains
that we already had in the previous releases with limited stride vectorization.
* Expanded LLDB support, including watchpoints, unwind tables
* Extended the Load/Store optimiser and cleaned up some of the bad decisions
made earlier.
* Simplified code generation for global variable addresses in ELF, resulting in
a significant (4% in Chromium) reduction in code size.
* Gained some additional code size improvements, though there's still a long road
ahead, especially for older cores.
* Added some EABI floating point comparison functions to Compiler-RT
* Added support for Windows+GNU triple, +features in -mcpu/-march options.
Changes to the MIPS Target
--------------------------
During this release ...
During this release the MIPS target has:
* Significantly extended support for the Integrated Assembler. See below for
more information
* Added support for the ``P5600`` processor.
* Added support for the ``interrupt`` attribute for MIPS32R2 and later. This
attribute will generate a function which can be used as an interrupt handler
on bare metal MIPS targets using the static relocation model.
* Added support for the ``ERETNC`` instruction found in MIPS32R5 and later.
* Added support for OpenCL. See http://portablecl.org/.
* Address spaces 1 to 255 are now reserved for software use and conversions
between them are no-op casts.
* Removed the ``mips16`` value for the -mcpu option since it is an :abbr:`ASE
(Application Specific Extension)` and not a processor. If you were using this,
please specify another CPU and use ``-mips16`` to enable MIPS16.
* Removed ``copy_u.w`` from 32-bit MSA and ``copy_u.d`` from 64-bit MSA since
they have been removed from the MSA specification due to forward compatibility
issues. For example, 32-bit MSA code containing ``copy_u.w`` would behave
differently on a 64-bit processor supporting MSA. The corresponding intrinsics
are still available and may expand to ``copy_s.[wd]`` where this is
appropriate for forward compatibility purposes.
* Relaxed the ``-mnan`` option to allow ``-mnan=2008`` on MIPS32R2/MIPS64R2 for
compatibility with GCC.
* Made MIPS64R6 the default CPU for 64-bit Android triples.
The MIPS target has also fixed various bugs including the following notable
fixes:
* Fixed reversed operands on ``mthi``/``mtlo`` in the DSP :abbr:`ASE
(Application Specific Extension)`.
* The code generator no longer uses ``jal`` for calls to absolute immediate
addresses.
* Disabled fast instruction selection on MIPS32R6 and MIPS64R6 since this is not
yet supported.
* Corrected addend for ``R_MIPS_HI16`` and ``R_MIPS_PCHI16`` in MCJIT
* The code generator no longer crashes when handling subregisters of a 64-bit
FPU register with undefined value.
* The code generator no longer attempts to use ``$zero`` for operands that do
not permit ``$zero``.
* Corrected the opcode used for ``ll``/``sc`` when using MIPS32R6/MIPS64R6 and
the Integrated Assembler.
* Added support for atomic load and atomic store.
* Corrected debug info when dynamically re-aligning the stack.
Integrated Assembler
^^^^^^^^^^^^^^^^^^^^
We have made a large number of improvements to the integrated assembler for
MIPS. In this release, the integrated assembler isn't quite production-ready
since there are a few known issues related to bare-metal support, checking
immediates on instructions, and the N32/N64 ABIs. However, the current support
should be sufficient for many users of the O32 ABI, particularly those targeting
MIPS32 on Linux or bare-metal MIPS32.
If you would like to try the integrated assembler, please use
``-fintegrated-as``.
Changes to the PowerPC Target
-----------------------------
@ -123,6 +213,20 @@ Changes to the X86 Target
* TLS is enabled for Cygwin as emutls.
* Smaller code for materializing 32-bit 1 and -1 constants at ``-Os``.
* More efficient code for wide integer compares. (E.g. 64-bit compares
on 32-bit targets.)
* Tail call support for ``thiscall``, ``stdcall``, ``vectorcall``, and
``fastcall`` functions.
Changes to the AVR Target
-------------------------
Slightly less than half of the AVR backend has been merged in at this point. It is still
missing a number of large parts which cause it to be unusable, but is well on the
road to being completely merged and workable.
Changes to the OCaml bindings
-----------------------------
@ -140,7 +244,19 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
a lot of other language and tools projects. This section lists some of the
projects that have already been updated to work with LLVM 3.8.
* A project
LDC - the LLVM-based D compiler
-------------------------------
`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
pragmatically combines efficiency, control, and modeling power, with safety and
programmer productivity. D supports powerful concepts like Compile-Time Function
Execution (CTFE) and Template Meta-Programming, provides an innovative approach
to concurrency and offers many classical paradigms.
`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
combined with LLVM as backend to produce efficient native code. LDC targets
x86/x86_64 systems like Linux, OS X and Windows and also PowerPC (32/64 bit)
and ARM. Ports to other architectures like AArch64 and MIPS64 are underway.
Additional Information

View File

@ -484,7 +484,7 @@ let TargetPrefix = "ppc" in { // All PPC intrinsics start with "llvm.ppc.".
Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkswss : GCCBuiltin<"__builtin_altivec_vpkswss">,
Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem]>;
def int_ppc_altivec_vpkswus : GCCBuiltin<"__builtin_altivec_vpkswus">,
Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],

View File

@ -280,11 +280,7 @@ public:
// when using them since you might not get all uses.
// The methods that don't start with materialized_ assert that modules is
// fully materialized.
#ifdef NDEBUG
void assertModuleIsMaterialized() const {}
#else
void assertModuleIsMaterialized() const;
#endif
bool use_empty() const {
assertModuleIsMaterialized();

View File

@ -242,13 +242,6 @@ void DemandedBits::determineLiveOperandBits(
if (OperandNo != 0)
AB = AOut;
break;
case Instruction::ICmp:
// Count the number of leading zeroes in each operand.
ComputeKnownBits(BitWidth, UserI->getOperand(0), UserI->getOperand(1));
auto NumLeadingZeroes = std::min(KnownZero.countLeadingOnes(),
KnownZero2.countLeadingOnes());
AB = ~APInt::getHighBitsSet(BitWidth, NumLeadingZeroes);
break;
}
}

View File

@ -555,6 +555,11 @@ bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return true;
O << -MO.getImm();
return false;
case 's': // The GCC deprecated s modifier
if (MO.getType() != MachineOperand::MO_Immediate)
return true;
O << ((32 - MO.getImm()) & 31);
return false;
}
}
return true;

View File

@ -793,16 +793,27 @@ static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
}
/// Determine whether two variable pieces overlap.
static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
if (!P1->isBitPiece() || !P2->isBitPiece())
return true;
// Determine the relative position of the pieces described by P1 and P2.
// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
// 1 if P1 is entirely after P2.
static int pieceCmp(const DIExpression *P1, const DIExpression *P2) {
unsigned l1 = P1->getBitPieceOffset();
unsigned l2 = P2->getBitPieceOffset();
unsigned r1 = l1 + P1->getBitPieceSize();
unsigned r2 = l2 + P2->getBitPieceSize();
// True where [l1,r1[ and [r1,r2[ overlap.
return (l1 < r2) && (l2 < r1);
if (r1 <= l2)
return -1;
else if (r2 <= l1)
return 1;
else
return 0;
}
/// Determine whether two variable pieces overlap.
static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
if (!P1->isBitPiece() || !P2->isBitPiece())
return true;
return pieceCmp(P1, P2) == 0;
}
/// \brief If this and Next are describing different pieces of the same
@ -811,14 +822,32 @@ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
/// Return true if the merge was successful.
bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
if (Begin == Next.Begin) {
auto *Expr = cast_or_null<DIExpression>(Values[0].Expression);
auto *NextExpr = cast_or_null<DIExpression>(Next.Values[0].Expression);
if (Expr->isBitPiece() && NextExpr->isBitPiece() &&
!piecesOverlap(Expr, NextExpr)) {
addValues(Next.Values);
End = Next.End;
return true;
auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
return false;
// We can only merge entries if none of the pieces overlap any others.
// In doing so, we can take advantage of the fact that both lists are
// sorted.
for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
for (; j < Next.Values.size(); ++j) {
int res = pieceCmp(cast<DIExpression>(Values[i].Expression),
cast<DIExpression>(Next.Values[j].Expression));
if (res == 0) // The two expressions overlap, we can't merge.
return false;
// Values[i] is entirely before Next.Values[j],
// so go back to the next entry of Values.
else if (res == -1)
break;
// Next.Values[j] is entirely before Values[i], so go on to the
// next entry of Next.Values.
}
}
addValues(Next.Values);
End = Next.End;
return true;
}
return false;
}

View File

@ -313,8 +313,8 @@ void Value::takeName(Value *V) {
ST->reinsertValue(this);
}
#ifndef NDEBUG
void Value::assertModuleIsMaterialized() const {
#ifndef NDEBUG
const GlobalValue *GV = dyn_cast<GlobalValue>(this);
if (!GV)
return;
@ -322,8 +322,10 @@ void Value::assertModuleIsMaterialized() const {
if (!M)
return;
assert(M->isMaterialized());
#endif
}
#ifndef NDEBUG
static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
Constant *C) {
if (!Cache.insert(Expr).second)

View File

@ -90,6 +90,7 @@ def AArch64InstrInfo : InstrInfo;
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedM1.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors",
@ -144,8 +145,7 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>;
def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
//===----------------------------------------------------------------------===//
// Assembly parser

View File

@ -6689,6 +6689,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
if (LHS.getValueType().getVectorElementType() == MVT::f16)
return SDValue();
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
LHS.getValueType().getVectorElementType() == MVT::f64);

View File

@ -0,0 +1,359 @@
//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Samsung Exynos-M1 to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// The Exynos-M1 is a traditional superscalar microprocessor with a
// 4-wide in-order stage for decode and dispatch and a wider issue stage.
// The execution units and loads and stores are out-of-order.
def ExynosM1Model : SchedMachineModel {
let IssueWidth = 4; // Up to 4 uops per cycle.
let MinLatency = 0; // OoO.
let MicroOpBufferSize = 96; // ROB size.
let LoopMicroOpBufferSize = 32; // Instruction queue size.
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
let CompleteModel = 0; // Use the default model otherwise.
}
//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available on the Exynos-M1,
// which has 9 pipelines, each with its own queue with out-of-order dispatch.
def M1UnitA : ProcResource<2>; // Simple integer
def M1UnitC : ProcResource<1>; // Simple and complex integer
def M1UnitB : ProcResource<2>; // Branch
def M1UnitL : ProcResource<1>; // Load
def M1UnitS : ProcResource<1>; // Store
def M1PipeF0 : ProcResource<1>; // FP #0
def M1PipeF1 : ProcResource<1>; // FP #1
let Super = M1PipeF0 in {
def M1UnitFMAC : ProcResource<1>; // FP multiplication
def M1UnitFCVT : ProcResource<1>; // FP conversion
def M1UnitNAL0 : ProcResource<1>; // Simple vector.
def M1UnitNMISC : ProcResource<1>; // Miscellanea
def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
}
let Super = M1PipeF1 in {
def M1UnitFADD : ProcResource<1>; // Simple FP
let BufferSize = 1 in
def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
def M1UnitNAL1 : ProcResource<1>; // Simple vector.
def M1UnitFST : ProcResource<1>; // FP store
}
let SchedModel = ExynosM1Model in {
def M1UnitALU : ProcResGroup<[M1UnitA,
M1UnitC]>; // All simple integer.
def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
M1UnitNAL1]>; // All simple vector.
}
let SchedModel = ExynosM1Model in {
//===----------------------------------------------------------------------===//
// Coarse scheduling model for the Exynos-M1.
// Branch instructions.
// TODO: Non-conditional direct branches take zero cycles and units.
def : WriteRes<WriteBr, [M1UnitB]> { let Latency = 1; }
def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
// TODO: Branch and link is much different.
// Arithmetic and logical integer instructions.
def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; }
// TODO: Shift over 3 and some extensions take 2 cycles.
def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; }
// Move instructions.
def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
// Divide and multiply instructions.
// TODO: Division blocks the divider inside C.
def : WriteRes<WriteID32, [M1UnitC]> { let Latency = 13; }
def : WriteRes<WriteID64, [M1UnitC]> { let Latency = 21; }
// TODO: Long multiplication take 5 cycles and also the ALU.
// TODO: Multiplication with accumulation can be advanced.
def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
// TODO: 64-bit multiplication has a throughput of 1/2.
def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; }
// Miscellaneous instructions.
def : WriteRes<WriteExtr, [M1UnitALU,
M1UnitALU]> { let Latency = 2; }
// TODO: The latency for the post or pre register is 1 cycle.
def : WriteRes<WriteAdr, []> { let Latency = 0; }
// Load instructions.
def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
// TODO: Extended address requires also the ALU.
def : WriteRes<WriteLDIdx, [M1UnitL]> { let Latency = 5; }
def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; }
// Store instructions.
def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
// TODO: Extended address requires also the ALU.
def : WriteRes<WriteSTIdx, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
// FP data instructions.
def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
// TODO: FCCMP is much different.
def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
// TODO: DP takes longer.
def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15; }
// TODO: MACC takes longer.
def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; }
// FP miscellaneous instructions.
// TODO: Conversion between register files is much different.
def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; }
def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; }
// TODO: Copy from FPR to GPR is much different.
def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; }
// FP load instructions.
// TODO: ASIMD loads are much different.
def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
// FP store instructions.
// TODO: ASIMD stores are much different.
def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; }
// ASIMD FP instructions.
// TODO: Other operations are much different.
def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
// Other miscellaneous instructions.
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
//===----------------------------------------------------------------------===//
// Fast forwarding.
// TODO: Add FP register forwarding rules.
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;
// Integer multiply-accumulate.
// TODO: The forwarding for WriteIM64 saves actually 3 cycles.
def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
//===----------------------------------------------------------------------===//
// Finer scheduling model for the Exynos-M1.
def M1WriteNEONA : SchedWriteRes<[M1UnitNALU,
M1UnitNALU,
M1UnitFADD]> { let Latency = 9; }
def M1WriteNEONB : SchedWriteRes<[M1UnitNALU,
M1UnitFST]> { let Latency = 5; }
def M1WriteNEONC : SchedWriteRes<[M1UnitNALU,
M1UnitFST]> { let Latency = 6; }
def M1WriteNEOND : SchedWriteRes<[M1UnitNALU,
M1UnitFST,
M1UnitL]> { let Latency = 10; }
def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT,
M1UnitFST]> { let Latency = 8; }
def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT,
M1UnitFST,
M1UnitL]> { let Latency = 13; }
def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC,
M1UnitFST]> { let Latency = 6; }
def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
M1UnitFST]> { let Latency = 3; }
def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
M1UnitL]> { let Latency = 9; }
def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
// FIXME: This is the worst case, conditional branch and link.
def M1WriteBL : SchedWriteRes<[M1UnitB,
M1UnitALU]> { let Latency = 1; }
// FIXME: This is the worst case, when using LR.
def M1WriteBLR : SchedWriteRes<[M1UnitB,
M1UnitALU,
M1UnitALU]> { let Latency = 2; }
def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; }
def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; }
def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; }
def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; }
def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; }
def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; }
def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; }
def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; }
def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; }
def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; }
def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; }
def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; }
def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; }
def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; }
def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; }
def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
def M1WriteTB : SchedWriteRes<[M1UnitC,
M1UnitALU]> { let Latency = 2; }
// Branch instructions
def : InstRW<[M1WriteB ], (instrs Bcc)>;
def : InstRW<[M1WriteBL], (instrs BL)>;
def : InstRW<[M1WriteBLR], (instrs BLR)>;
def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>;
// Arithmetic and logical integer instructions.
def : InstRW<[M1WriteALU1], (instrs COPY)>;
// Divide and multiply instructions.
// Miscellaneous instructions.
// Load instructions.
// Store instructions.
// FP data instructions.
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>;
def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>;
def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>;
def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>;
def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>;
def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
// FP miscellaneous instructions.
def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>;
def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
// FP load instructions.
// FP store instructions.
// ASIMD instructions.
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>;
def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>;
def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>;
def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>;
// ASIMD FP instructions.
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>;
def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>;
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
// ASIMD miscellaneous instructions.
def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>;
def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>;
def : InstRW<[M1WriteNALU1], (instregex "^CPY")>;
def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>;
def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>;
def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>;
def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>;
def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
(instregex "^TB[LX]v8i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
(instregex "^TB[LX]v8i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
(instregex "^TB[LX]v8i8Four")>;
def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
(instregex "^TB[LX]v16i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
(instregex "^TB[LX]v16i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
(instregex "^TB[LX]v16i8Four")>;
def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>;
// ASIMD load instructions.
// ASIMD store instructions.
// Cryptography instructions.
def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;
// CRC instructions.
def : InstRW<[M1WriteC2], (instregex "^CRC32")>;
} // SchedModel = ExynosM1Model

View File

@ -183,6 +183,7 @@ def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>;
def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>;
def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>;
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
@ -252,7 +253,7 @@ def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
[Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
FeatureGCN3Encoding, FeatureCIInsts]>;
//===----------------------------------------------------------------------===//

View File

@ -53,7 +53,8 @@ public:
ISAVersion7_0_0,
ISAVersion7_0_1,
ISAVersion8_0_0,
ISAVersion8_0_1
ISAVersion8_0_1,
ISAVersion8_0_3
};
private:

View File

@ -128,21 +128,23 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
FeatureLDSBankCount32]
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
[FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
FeatureLDSBankCount32]
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
>;
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
[FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32]
>;
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
[FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
>;

View File

@ -234,6 +234,7 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
bool IsLoad = TII->get(LoadStoreOp).mayLoad();
bool RanOutOfSGPRs = false;
bool Scavenged = false;
unsigned SOffset = ScratchOffset;
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
@ -244,6 +245,8 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
if (SOffset == AMDGPU::NoRegister) {
RanOutOfSGPRs = true;
SOffset = AMDGPU::SGPR0;
} else {
Scavenged = true;
}
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
.addReg(ScratchOffset)
@ -259,10 +262,14 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
unsigned SOffsetRegState = 0;
if (i + 1 == e && Scavenged)
SOffsetRegState |= RegState::Kill;
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.addReg(SubReg, getDefRegState(IsLoad))
.addReg(ScratchRsrcReg)
.addReg(SOffset)
.addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc

View File

@ -41,6 +41,9 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) {
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
if (Features.test(FeatureISAVersion8_0_3))
return {8, 0, 3};
return {0, 0, 0};
}

View File

@ -747,7 +747,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// If Offset is a multiply-by-constant and it's profitable to extract a shift
// and use it in a shifted operand, do so.
if (Offset.getOpcode() == ISD::MUL) {
if (Offset.getOpcode() == ISD::MUL && N.hasOneUse()) {
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) {
@ -1422,7 +1422,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
// If OffReg is a multiply-by-constant and it's profitable to extract a shift
// and use it in a shifted operand, do so.
if (OffReg.getOpcode() == ISD::MUL) {
if (OffReg.getOpcode() == ISD::MUL && N.hasOneUse()) {
unsigned PowerOfTwo = 0;
SDValue NewMulConst;
if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) {

View File

@ -1615,7 +1615,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
// extension rather than sign extension. Make sure we pass the return
// value extension property to integer materialization.
unsigned SrcReg =
PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt);
PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() != CCValAssign::ZExt);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
@ -2091,25 +2091,21 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
&PPC::GPRCRegClass);
int64_t Imm = UseSExt ? CI->getSExtValue() : CI->getZExtValue();
// If the constant is in range, use a load-immediate.
if (UseSExt && isInt<16>(CI->getSExtValue())) {
// Since LI will sign extend the constant, we need to make sure that, for our
// zeroext constants, the sign-extended constant still fits into 16 bits -
// i.e. the value lies in the range 0..0x7fff.
if (isInt<16>(Imm)) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
unsigned ImmReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
.addImm(CI->getSExtValue());
return ImmReg;
} else if (!UseSExt && isUInt<16>(CI->getZExtValue())) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
unsigned ImmReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
.addImm(CI->getZExtValue());
.addImm(Imm);
return ImmReg;
}
// Construct the constant piecewise.
int64_t Imm = CI->getZExtValue();
if (VT == MVT::i64)
return PPCMaterialize64BitInt(Imm, RC);
else if (VT == MVT::i32)

View File

@ -736,7 +736,7 @@ def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
v16i8, v8i16>;
def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
v16i8, v4i32>;
v8i16, v4i32>;
def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
v8i16, v4i32>;
def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),

View File

@ -1849,7 +1849,7 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
if (CCMask == SystemZ::CCMASK_CMP_NE)
return SystemZ::CCMASK_TM_SOME_1;
}
if (EffectivelyUnsigned && CmpVal <= Low) {
if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
if (CCMask == SystemZ::CCMASK_CMP_LT)
return SystemZ::CCMASK_TM_ALL_0;
if (CCMask == SystemZ::CCMASK_CMP_GE)

View File

@ -1335,6 +1335,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::SETCCE, MVT::i1, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
@ -14975,8 +14976,11 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(),
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
if (Op.getSimpleValueType() == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return SetCC;
}
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
@ -16315,6 +16319,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget *Subtarget,
SelectionDAG &DAG, SDLoc dl) {
if (isAllOnesConstant(Mask))
return DAG.getTargetConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getTargetConstant(0, dl, MaskVT);
if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
// Mask should be extended
Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
@ -17203,26 +17212,14 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
if (Src.getOpcode() == ISD::UNDEF)
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
@ -17230,7 +17227,8 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@ -17238,29 +17236,18 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain) {
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = cast<ConstantSDNode>(ScaleOp);
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
@ -17268,14 +17255,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else
MaskInReg = DAG.getBitcast(MaskVT, Mask);
SDValue VMask = getMaskNode(Mask, MaskVT, &Subtarget, DAG, dl);
//SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
@ -17509,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain);
Scale, Chain, *Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
@ -17521,7 +17503,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
*Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {

View File

@ -3560,7 +3560,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
BO1->getOperand(0));
}
if (CI->isMaxValue(true)) {
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
ICmpInst::Predicate Pred = I.isSigned()
? I.getUnsignedPredicate()
: I.getSignedPredicate();

View File

@ -557,7 +557,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
ConstantInt::get(IdxType, i),
};
auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName);
auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName);
auto *L = IC.Builder->CreateAlignedLoad(Ptr, LI.getAlignment(),
LoadName);
V = IC.Builder->CreateInsertValue(V, L, i);
}

View File

@ -380,6 +380,23 @@ static void replaceExtractElements(InsertElementInst *InsElt,
ExtendMask.push_back(UndefValue::get(IntType));
Value *ExtVecOp = ExtElt->getVectorOperand();
auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
? ExtVecOpInst->getParent()
: ExtElt->getParent();
// TODO: This restriction matches the basic block check below when creating
// new extractelement instructions. If that limitation is removed, this one
// could also be removed. But for now, we just bail out to ensure that we
// will replace the extractelement instruction that is feeding our
// insertelement instruction. This allows the insertelement to then be
// replaced by a shufflevector. If the insertelement is not replaced, we can
// induce infinite looping because there's an optimization for extractelement
// that will delete our widening shuffle. This would trigger another attempt
// here to create that shuffle, and we spin forever.
if (InsertionBlock != InsElt->getParent())
return;
auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
ConstantVector::get(ExtendMask));
@ -387,7 +404,6 @@ static void replaceExtractElements(InsertElementInst *InsElt,
// (as long as it's not a PHI) or at the start of the basic block of the
// extract, so any subsequent extracts in the same basic block can use it.
// TODO: Insert before the earliest ExtractElementInst that is replaced.
auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
WideVec->insertAfter(ExtVecOpInst);
else

View File

@ -90,6 +90,11 @@ static cl::opt<bool> SpeculateOneExpensiveInst(
cl::desc("Allow exactly one expensive instruction to be speculatively "
"executed"));
static cl::opt<unsigned> MaxSpeculationDepth(
"max-speculation-depth", cl::Hidden, cl::init(10),
cl::desc("Limit maximum recursion depth when calculating costs of "
"speculatively executed instructions"));
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
@ -269,6 +274,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
unsigned &CostRemaining,
const TargetTransformInfo &TTI,
unsigned Depth = 0) {
// It is possible to hit a zero-cost cycle (phi/gep instructions for example),
// so limit the recursion depth.
// TODO: While this recursion limit does prevent pathological behavior, it
// would be better to track visited instructions to avoid cycles.
if (Depth == MaxSpeculationDepth)
return false;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
// Non-instructions all dominate instructions, but not all constantexprs

View File

@ -10,34 +10,3 @@ define i8 @test_mul(i32 %a, i32 %b) {
%3 = trunc i32 %2 to i8
ret i8 %3
}
; CHECK-LABEL: 'test_icmp1'
; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2
; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255
; CHECK-DAG: DemandedBits: 0xFFF for %2 = shl i32 %1, 4
define i1 @test_icmp1(i32 %a, i32 %b) {
%1 = and i32 %a, 255
%2 = shl i32 %1, 4
%3 = icmp eq i32 %1, %2
ret i1 %3
}
; CHECK-LABEL: 'test_icmp2'
; CHECK-DAG: DemandedBits: 0x1 for %3 = icmp eq i32 %1, %2
; CHECK-DAG: DemandedBits: 0xFFF for %1 = and i32 %a, 255
; CHECK-DAG: DemandedBits: 0xFF for %2 = ashr i32 %1, 4
define i1 @test_icmp2(i32 %a, i32 %b) {
%1 = and i32 %a, 255
%2 = ashr i32 %1, 4
%3 = icmp eq i32 %1, %2
ret i1 %3
}
; CHECK-LABEL: 'test_icmp3'
; CHECK-DAG: DemandedBits: 0xFFFFFFFF for %1 = and i32 %a, 255
; CHECK-DAG: DemandedBits: 0x1 for %2 = icmp eq i32 -1, %1
define i1 @test_icmp3(i32 %a) {
%1 = and i32 %a, 255
%2 = icmp eq i32 -1, %1
ret i1 %2
}

View File

@ -267,4 +267,278 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 {
ret <4 x i16> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_une:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, ne
; CHECK-DAG: csel {{.*}}, wzr, ne
; CHECK-DAG: csel {{.*}}, wzr, ne
; CHECK-DAG: csel {{.*}}, wzr, ne
define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp une <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ueq:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, vs
; CHECK-DAG: csel {{.*}}, vs
; CHECK-DAG: csel {{.*}}, vs
; CHECK-DAG: csel {{.*}}, vs
define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ueq <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ugt:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, hi
; CHECK-DAG: csel {{.*}}, wzr, hi
; CHECK-DAG: csel {{.*}}, wzr, hi
; CHECK-DAG: csel {{.*}}, wzr, hi
define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ugt <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_uge:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, pl
; CHECK-DAG: csel {{.*}}, wzr, pl
; CHECK-DAG: csel {{.*}}, wzr, pl
; CHECK-DAG: csel {{.*}}, wzr, pl
define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp uge <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ult:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, lt
; CHECK-DAG: csel {{.*}}, wzr, lt
; CHECK-DAG: csel {{.*}}, wzr, lt
; CHECK-DAG: csel {{.*}}, wzr, lt
define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ult <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ule:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, le
; CHECK-DAG: csel {{.*}}, wzr, le
; CHECK-DAG: csel {{.*}}, wzr, le
; CHECK-DAG: csel {{.*}}, wzr, le
define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ule <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_uno:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, vs
; CHECK-DAG: csel {{.*}}, wzr, vs
; CHECK-DAG: csel {{.*}}, wzr, vs
; CHECK-DAG: csel {{.*}}, wzr, vs
define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp uno <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_one:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, gt
; CHECK-DAG: csel {{.*}}, gt
; CHECK-DAG: csel {{.*}}, gt
; CHECK-DAG: csel {{.*}}, gt
define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp one <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_oeq:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
; CHECK-DAG: csel {{.*}}, wzr, eq
define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp oeq <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ogt:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, gt
; CHECK-DAG: csel {{.*}}, wzr, gt
; CHECK-DAG: csel {{.*}}, wzr, gt
; CHECK-DAG: csel {{.*}}, wzr, gt
define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ogt <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_oge:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, ge
; CHECK-DAG: csel {{.*}}, wzr, ge
; CHECK-DAG: csel {{.*}}, wzr, ge
; CHECK-DAG: csel {{.*}}, wzr, ge
define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp oge <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_olt:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
; CHECK-DAG: csel {{.*}}, wzr, mi
define <4 x i1> @test_fcmp_olt(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp olt <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ole:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, ls
; CHECK-DAG: csel {{.*}}, wzr, ls
; CHECK-DAG: csel {{.*}}, wzr, ls
; CHECK-DAG: csel {{.*}}, wzr, ls
define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ole <4 x half> %a, %b
ret <4 x i1> %1
}
; Function Attrs: nounwind readnone
; CHECK-LABEL: test_fcmp_ord:
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: fcvt
; CHECK-DAG: csel {{.*}}, wzr, vc
; CHECK-DAG: csel {{.*}}, wzr, vc
; CHECK-DAG: csel {{.*}}, wzr, vc
; CHECK-DAG: csel {{.*}}, wzr, vc
define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 {
%1 = fcmp ord <4 x half> %a, %b
ret <4 x i1> %1
}
attributes #0 = { nounwind }

View File

@ -421,4 +421,88 @@ define <8 x i16> @fptoui_i16(<8 x half> %a) #0 {
ret <8 x i16> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp une <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 16 csel tests. Skipped.
define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ueq <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ugt <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp uge <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ult <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ule <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp uno <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp one <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp oeq <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ogt <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp oge <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp olt <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ole <8 x half> %a, %b
ret <8 x i1> %1
}
; FileCheck checks are unwieldy with 16 fcvt and 8 csel tests. Skipped.
define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
%1 = fcmp ord <8 x half> %a, %b
ret <8 x i1> %1
}
attributes #0 = { nounwind }

View File

@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-FIJI %s
; HSA: .hsa_code_object_version 1,0
; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
; HSA-FIJI: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"

View File

@ -1,5 +1,6 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
;GCN-LABEL: {{^}}main:

View File

@ -0,0 +1,33 @@
; RUN: llc -march=amdgcn -mcpu=verde < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
; When the offset of VGPR spills into scratch space gets too large, an additional SGPR
; is used to calculate the scratch load/store address. Make sure that this
; mechanism works even when many spills happen.
; Just test that it compiles successfully.
; CHECK-LABEL: test
define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in,
<96 x i32> addrspace(1)* %sdata_out, <96 x i32> %sdata_in) {
entry:
%tid = call i32 @llvm.SI.tid() nounwind readnone
%aptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
%a = load <1280 x i32>, <1280 x i32> addrspace(1)* %aptr
; mark most VGPR registers as used to increase register pressure
call void asm sideeffect "", "~{VGPR4},~{VGPR8},~{VGPR12},~{VGPR16},~{VGPR20},~{VGPR24},~{VGPR28},~{VGPR32}" ()
call void asm sideeffect "", "~{VGPR36},~{VGPR40},~{VGPR44},~{VGPR48},~{VGPR52},~{VGPR56},~{VGPR60},~{VGPR64}" ()
call void asm sideeffect "", "~{VGPR68},~{VGPR72},~{VGPR76},~{VGPR80},~{VGPR84},~{VGPR88},~{VGPR92},~{VGPR96}" ()
call void asm sideeffect "", "~{VGPR100},~{VGPR104},~{VGPR108},~{VGPR112},~{VGPR116},~{VGPR120},~{VGPR124},~{VGPR128}" ()
call void asm sideeffect "", "~{VGPR132},~{VGPR136},~{VGPR140},~{VGPR144},~{VGPR148},~{VGPR152},~{VGPR156},~{VGPR160}" ()
call void asm sideeffect "", "~{VGPR164},~{VGPR168},~{VGPR172},~{VGPR176},~{VGPR180},~{VGPR184},~{VGPR188},~{VGPR192}" ()
call void asm sideeffect "", "~{VGPR196},~{VGPR200},~{VGPR204},~{VGPR208},~{VGPR212},~{VGPR216},~{VGPR220},~{VGPR224}" ()
%outptr = getelementptr <1280 x i32>, <1280 x i32> addrspace(1)* %in, i32 %tid
store <1280 x i32> %a, <1280 x i32> addrspace(1)* %outptr
ret void
}
declare i32 @llvm.SI.tid() nounwind readnone

View File

@ -239,3 +239,20 @@ define void @test_well_formed_dag(i32 %in1, i32 %in2, i32* %addr) {
store i32 %add, i32* %addr
ret void
}
define { i32, i32 } @test_multi_use_add(i32 %base, i32 %offset) {
; CHECK-LABEL: test_multi_use_add:
; CHECK-THUMB: movs [[CONST:r[0-9]+]], #28
; CHECK-THUMB: movt [[CONST]], #1
%prod = mul i32 %offset, 65564
%sum = add i32 %base, %prod
%ptr = inttoptr i32 %sum to i32*
%loaded = load i32, i32* %ptr
%ret.tmp = insertvalue { i32, i32 } undef, i32 %sum, 0
%ret = insertvalue { i32, i32 } %ret.tmp, i32 %loaded, 1
ret { i32, i32 } %ret
}

View File

@ -186,3 +186,12 @@ entry:
; ELF64: blr
ret i32 -1
}
define zeroext i16 @ret20() nounwind {
entry:
; ELF64-LABEL: ret20
; ELF64: lis{{.*}}0
; ELF64: ori{{.*}}32768
; ELF64: blr
ret i16 32768
}

View File

@ -0,0 +1,10 @@
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
; Emits an inline-asm mtfsb1 whose immediate operand (7) is printed through the
; `s` operand modifier. The check lines after this function expect the printed
; operand to be 25 (i.e. 32 - 7; presumably what the modifier computes -- confirm
; against the PowerPC asm printer).
define void @test() {
entry:
call void asm sideeffect "mtfsb1 ${0:s}", "i"(i32 7), !srcloc !1
ret void
}
; CHECK: #APP
; CHECK-NEXT: mtfsb1 25
!1 = !{i32 40}

View File

@ -0,0 +1,9 @@
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
; Calls the AltiVec vpkswss intrinsic with %a as both source operands and returns
; the <8 x i16> result. The check line after this function expects the emitted
; vpkswss to have register 2 as its first operand.
define <8 x i16> @test(<4 x i32> %a) {
entry:
%0 = tail call <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32> %a, <4 x i32> %a)
ret <8 x i16> %0
}
; CHECK: vpkswss 2,
declare <8 x i16> @llvm.ppc.altivec.vpkswss(<4 x i32>, <4 x i32>)

View File

@ -0,0 +1,136 @@
; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
; zeroext i32 return of 65535 (0xFFFF). This is above 32767, the largest value
; for which this file expects a single li, so lis 0 + ori 65535 is expected.
define zeroext i32 @f1() {
entry:
ret i32 65535
}
; CHECK-LABEL: @f1
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 65535
; zeroext i32 return of 32768 (0x8000), one past the largest single-li value in
; this file; the expected sequence is lis 0 + ori 32768.
define zeroext i32 @f2() {
entry:
ret i32 32768
}
; CHECK-LABEL: @f2
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 32768
; zeroext i32 return of 32767 (0x7FFF); expected to be a single li.
define zeroext i32 @f3() {
entry:
ret i32 32767
}
; CHECK-LABEL: @f3
; CHECK: li 3, 32767
; zeroext i16 return of 65535 (all 16 bits set); expected lis 0 + ori 65535.
define zeroext i16 @f4() {
entry:
ret i16 65535
}
; CHECK-LABEL: @f4
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 65535
; zeroext i16 return of 32768 (only the top bit of the i16 set); expected
; lis 0 + ori 32768.
define zeroext i16 @f5() {
entry:
ret i16 32768
}
; CHECK-LABEL: @f5
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 32768
; zeroext i16 return of 32767; expected to be a single li.
define zeroext i16 @f6() {
entry:
ret i16 32767
}
; CHECK-LABEL: @f6
; CHECK: li 3, 32767
; zeroext i16 return of -1. Zero-extended, the 16-bit pattern is 65535, so the
; expected sequence matches @f4: lis 0 + ori 65535.
define zeroext i16 @f7() {
entry:
ret i16 -1
}
; CHECK-LABEL: @f7
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 65535
; zeroext i16 return of -32768. Zero-extended, the 16-bit pattern is 32768, so
; the expected sequence matches @f5: lis 0 + ori 32768.
define zeroext i16 @f8() {
entry:
ret i16 -32768
}
; CHECK-LABEL: @f8
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 32768
; signext i32 return of 65535; as an i32 the value is positive, and the expected
; sequence is lis 0 + ori 65535.
define signext i32 @f1s() {
entry:
ret i32 65535
}
; CHECK-LABEL: @f1s
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 65535
; signext i32 return of 32768; expected lis 0 + ori 32768.
define signext i32 @f2s() {
entry:
ret i32 32768
}
; CHECK-LABEL: @f2s
; CHECK: lis 3, 0
; CHECK: ori 3, 3, 32768
; signext i32 return of 32767; expected to be a single li.
define signext i32 @f3s() {
entry:
ret i32 32767
}
; CHECK-LABEL: @f3s
; CHECK: li 3, 32767
; signext i16 return of 32767 (largest positive i16); expected to be a single li.
define signext i16 @f4s() {
entry:
ret i16 32767
}
; CHECK-LABEL: @f4s
; CHECK: li 3, 32767
; signext i32 return of -65535 = -65536 + 1; expected lis -1 followed by ori 1.
define signext i32 @f1sn() {
entry:
ret i32 -65535
}
; CHECK-LABEL: @f1sn
; CHECK: lis 3, -1
; CHECK: ori 3, 3, 1
; signext i32 return of -32768; expected to be a single li of -32768.
define signext i32 @f2sn() {
entry:
ret i32 -32768
}
; CHECK-LABEL: @f2sn
; CHECK: li 3, -32768
; signext i32 return of -32767; expected to be a single li of -32767.
define signext i32 @f3sn() {
entry:
ret i32 -32767
}
; CHECK-LABEL: @f3sn
; CHECK: li 3, -32767
; signext i32 return of -65536 (low 16 bits all zero); the check that follows
; only looks for lis -1.
define signext i32 @f4sn() {
entry:
ret i32 -65536
}
; CHECK-LABEL: @f4sn
; CHECK: lis 3, -1
; signext i16 return of -32767; expected to be a single li of -32767.
define signext i16 @f5sn() {
entry:
ret i16 -32767
}
; CHECK-LABEL: @f5sn
; CHECK: li 3, -32767
; signext i16 return of -32768 (smallest i16); expected to be a single li of
; -32768.
define signext i16 @f6sn() {
entry:
ret i16 -32768
}
; CHECK-LABEL: @f6sn
; CHECK: li 3, -32768

View File

@ -0,0 +1,8 @@
; RUN: llc -mcpu=pwr7 -mtriple=powerpc64le-unknown-unknown -O0 < %s | FileCheck %s
; Returns the sign-extended i32 constant -125452974, which decomposes as
; -1915 * 65536 + 48466 -- the lis/ori immediates the check lines below expect.
define internal signext i32 @foo() #0 {
ret i32 -125452974
}
; CHECK: lis 3, -1915
; CHECK: ori 3, 3, 48466

View File

@ -0,0 +1,26 @@
; This used to incorrectly use a TMLL for an always-false test at -O0.
;
; RUN: llc -O0 < %s -mtriple=s390x-linux-gnu | FileCheck %s
; Loads a byte, keeps its low bit, widens it back to i32 and branches on
; "value >= 0", storing 1 or 0 to %result depending on the branch taken.
define void @test(i8 *%input, i32 *%result) {
entry:
; CHECK-NOT: tmll
%0 = load i8, i8* %input, align 1
%1 = trunc i8 %0 to i1
; %2 is a zero-extended i1, so it can only be 0 or 1 ...
%2 = zext i1 %1 to i32
; ... which means this signed >= 0 comparison has a fixed outcome.
%3 = icmp sge i32 %2, 0
br i1 %3, label %if.then, label %if.else
if.then:
store i32 1, i32* %result, align 4
br label %return
if.else:
store i32 0, i32* %result, align 4
br label %return
return:
ret void
}

View File

@ -259,18 +259,22 @@ define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK: ## BB#0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT: movb $120, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT: retq
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)
define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
@ -790,3 +794,54 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <
ret void
}
; Issues four scattersiv8.si calls that differ in their constant i8 mask
; (-1, 0, 1, 96) and scale (2 or 4). The expected output materializes the
; all-ones mask with kxnorw, the zero mask with kxorw, and the other two masks
; with a movb of the immediate followed by kmovb.
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK: ## BB#0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: movb $96, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
ret void
}
; Issues four gather.dps.512 calls that differ only in their constant i16 mask
; (-1, 0, 1, 220) and returns the sum of the four results. The expected output
; materializes the all-ones mask with kxnorw, the zero mask with kxorw, and the
; other two masks with a movw of the immediate followed by kmovw.
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK: ## BB#0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm2
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
; CHECK-NEXT: movw $220, %ax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
%res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
%res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)
; Sum all four gather results so that none of the calls is dead.
%res4 = fadd <16 x float> %res, %res1
%res5 = fadd <16 x float> %res3, %res2
%res6 = fadd <16 x float> %res5, %res4
ret <16 x float> %res6
}

View File

@ -1,26 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32
; Verify that we don't crash during codegen due to a wrong lowering
; of a setcc node with illegal operand types and return type.
define <8 x i16> @pr25080(<8 x i32> %a) {
; CHECK-LABEL: pr25080:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpsraw $15, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
; AVX-LABEL: pr25080:
; AVX: # BB#0: # %entry
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i23>
%1 = icmp eq <8 x i23> %0, zeroinitializer
@ -28,3 +30,46 @@ entry:
%3 = sext <8 x i1> %2 to <8 x i16>
ret <8 x i16> %3
}
; Loop whose exit test is built from an i64 compare (%a < 65536) that is
; broadcast to <16 x i1>, and-ed with undef, bitcast to i16 and compared with 0.
; Only the KNL-32 check prefix covers this function. The file's header note says
; the assertions were autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
define void @pr26232(i64 %a) {
; KNL-32-LABEL: pr26232:
; KNL-32: # BB#0: # %for_test11.preheader
; KNL-32-NEXT: pushl %esi
; KNL-32-NEXT: .Ltmp0:
; KNL-32-NEXT: .cfi_def_cfa_offset 8
; KNL-32-NEXT: .Ltmp1:
; KNL-32-NEXT: .cfi_offset %esi, -8
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL-32-NEXT: movw $-1, %dx
; KNL-32-NEXT: .align 16, 0x90
; KNL-32-NEXT: .LBB1_1: # %for_loop599
; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
; KNL-32-NEXT: movl %eax, %esi
; KNL-32-NEXT: sbbl $0, %esi
; KNL-32-NEXT: movl $0, %esi
; KNL-32-NEXT: cmovlw %dx, %si
; KNL-32-NEXT: testw %si, %si
; KNL-32-NEXT: jne .LBB1_1
; KNL-32-NEXT: # BB#2: # %for_exit600
; KNL-32-NEXT: popl %esi
; KNL-32-NEXT: retl
allocas:
br label %for_test11.preheader
for_test11.preheader: ; preds = %for_test11.preheader, %allocas
; Branch on undef: either loop back here or enter the tested loop.
br i1 undef, label %for_loop599, label %for_test11.preheader
for_loop599: ; preds = %for_loop599, %for_test11.preheader
%less_i_load605_ = icmp slt i64 %a, 65536
; Splat the scalar i1 across all 16 lanes (insert into lane 0, then shuffle
; with an all-zero index mask).
%less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
%less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
%"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef
; Reinterpret the 16 mask bits as one i16 and leave the loop when it is zero.
%intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
%res.i895 = icmp eq i16 %intmask.i894, 0
br i1 %res.i895, label %for_exit600, label %for_loop599
for_exit600: ; preds = %for_loop599
ret void
}

View File

@ -0,0 +1,102 @@
; RUN: llc -filetype=obj -o - < %s | llvm-dwarfdump - | FileCheck %s
;
; Created using clang -g -O3 from:
; struct S0 {
; short f0;
; int f3;
; } a;
; void fn1(short p1) {
; struct S0 b, c = {3};
; b.f3 = p1;
; a = b = c;
; }
;
; int main() { return 0; }
;
; This is similar to the bug in test/DebugInfo/ARM/PR26163.ll, except that there is an
; extra non-overlapping range first. Thus, we make sure that the backend actually looks
; at all expressions when determining whether to merge ranges, not just the first one.
; As in PR26163, we expect two ranges (as opposed to one), the first one being zero-sized.
;
;
; CHECK: 0x00000000: Beginning address offset: 0x0000000000000004
; CHECK: Ending address offset: 0x0000000000000004
; CHECK: Location description: 10 03 55 93 04
; CHECK: Beginning address offset: 0x0000000000000004
; CHECK: Ending address offset: 0x0000000000000014
; CHECK: Location description: 10 03 10 00
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
%struct.S0 = type { i16, i32 }
@a = common global %struct.S0 zeroinitializer, align 4
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
; The attributes are here to force the zero-sized range not to be at the start of
; the function, which has special interpretation in DWARF. The fact that this happens
; at all is probably an LLVM bug.
attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
; IR for fn1 from the C source quoted in the file header. The only real work is
; the two stores into @a; everything else is debug-intrinsic calls describing
; the parameter p1 (!9) and the locals b (!10) and c (!16).
define void @fn1(i16 signext %p1) #0 !dbg !4 {
entry:
tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !9, metadata !26), !dbg !27
tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !10, metadata !26), !dbg !28
tail call void @llvm.dbg.declare(metadata %struct.S0* undef, metadata !16, metadata !26), !dbg !29
; c (!16) is described piecewise: bits [0,32) hold 3 (!30) and bits [32,64) hold 0 (!31).
tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !16, metadata !30), !dbg !29
tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !31), !dbg !29
; b (!10) first gets a 16-bit piece at bit offset 32 holding %p1 (!32), and is
; then described with the same two 32-bit pieces as c.
tail call void @llvm.dbg.value(metadata i16 %p1, i64 0, metadata !10, metadata !32), !dbg !28
tail call void @llvm.dbg.value(metadata i32 3, i64 0, metadata !10, metadata !30), !dbg !28
tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !10, metadata !31), !dbg !28
; Write 3 to the first 32 bits of @a and 0 to its second field.
store i32 3, i32* bitcast (%struct.S0* @a to i32*), align 4, !dbg !33
store i32 0, i32* getelementptr inbounds (%struct.S0, %struct.S0* @a, i64 0, i32 1), align 4, !dbg !33
ret void, !dbg !34
}
; Trivial main from the quoted C source: returns 0 and carries only a debug
; location.
define i32 @main() !dbg !17 {
entry:
ret i32 0, !dbg !35
}
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23, !24}
!llvm.ident = !{!25}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3, globals: !20)
!1 = !DIFile(filename: "small.c", directory: "/Users/kfischer/Projects/clangbug")
!2 = !{}
!3 = !{!4, !17}
!4 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !5, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, variables: !8)
!5 = !DISubroutineType(types: !6)
!6 = !{null, !7}
!7 = !DIBasicType(name: "short", size: 16, align: 16, encoding: DW_ATE_signed)
!8 = !{!9, !10, !16}
!9 = !DILocalVariable(name: "p1", arg: 1, scope: !4, file: !1, line: 5, type: !7)
!10 = !DILocalVariable(name: "b", scope: !4, file: !1, line: 6, type: !11)
!11 = !DICompositeType(tag: DW_TAG_structure_type, name: "S0", file: !1, line: 1, size: 64, align: 32, elements: !12)
!12 = !{!13, !14}
!13 = !DIDerivedType(tag: DW_TAG_member, name: "f0", scope: !11, file: !1, line: 2, baseType: !7, size: 16, align: 16)
!14 = !DIDerivedType(tag: DW_TAG_member, name: "f3", scope: !11, file: !1, line: 3, baseType: !15, size: 32, align: 32, offset: 32)
!15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!16 = !DILocalVariable(name: "c", scope: !4, file: !1, line: 6, type: !11)
!17 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 11, type: !18, isLocal: false, isDefinition: true, scopeLine: 11, isOptimized: true, variables: !2)
!18 = !DISubroutineType(types: !19)
!19 = !{!15}
!20 = !{!21}
!21 = !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 4, type: !11, isLocal: false, isDefinition: true, variable: %struct.S0* @a)
!22 = !{i32 2, !"Dwarf Version", i32 2}
!23 = !{i32 2, !"Debug Info Version", i32 3}
!24 = !{i32 1, !"PIC Level", i32 2}
!25 = !{!"clang version 3.9.0 (https://github.com/llvm-mirror/clang 8f258397c5afd7a708bd95770c718e81d08fb11a) (https://github.com/llvm-mirror/llvm 18481855bdfa1b4a424f81be8525db002671348d)"}
!26 = !DIExpression()
!27 = !DILocation(line: 5, column: 16, scope: !4)
!28 = !DILocation(line: 6, column: 13, scope: !4)
!29 = !DILocation(line: 6, column: 16, scope: !4)
!30 = !DIExpression(DW_OP_bit_piece, 0, 32)
!31 = !DIExpression(DW_OP_bit_piece, 32, 32)
!32 = !DIExpression(DW_OP_bit_piece, 32, 16)
!33 = !DILocation(line: 8, column: 9, scope: !4)
!34 = !DILocation(line: 9, column: 1, scope: !4)
!35 = !DILocation(line: 11, column: 14, scope: !17)

View File

@ -1672,3 +1672,15 @@ define i1 @cmp_slt_rhs_inc(float %x, i32 %i) {
%cmp = icmp slt i32 %conv, %inc
ret i1 %cmp
}
; Unsigned >= comparison of two values that each had 2147483647 added to them.
; The expected output below keeps both adds and the compare exactly as written
; in the input, i.e. the compare must not be rewritten in terms of %x and %y.
; CHECK-LABEL: @PR26407
; CHECK-NEXT: %[[addx:.*]] = add i32 %x, 2147483647
; CHECK-NEXT: %[[addy:.*]] = add i32 %y, 2147483647
; CHECK-NEXT: %[[cmp:.*]] = icmp uge i32 %[[addx]], %[[addy]]
; CHECK-NEXT: ret i1 %[[cmp]]
define i1 @PR26407(i32 %x, i32 %y) {
%addx = add i32 %x, 2147483647
%addy = add i32 %y, 2147483647
%cmp = icmp uge i32 %addx, %addy
ret i1 %cmp
}

View File

@ -175,3 +175,33 @@ bb3:
ret <4 x double> %tmp4
}
; PR26354: https://llvm.org/bugs/show_bug.cgi?id=26354
; Don't create a shufflevector if we know that we're not going to replace the insertelement.
; Extracts both lanes of a loaded <2 x double>; lane 1 is inserted into a
; <4 x double> only on the %if path. The expected output keeps the
; insertelement in %if, with the lane-1 extract placed directly before it.
define double @pr26354(<2 x double>* %tmp, i1 %B) {
; CHECK-LABEL: @pr26354(
; CHECK: %ld = load <2 x double>, <2 x double>* %tmp
; CHECK-NEXT: %e1 = extractelement <2 x double> %ld, i32 0
; CHECK-NEXT: br i1 %B, label %if, label %end
; CHECK: if:
; CHECK-NEXT: %e2 = extractelement <2 x double> %ld, i32 1
; CHECK-NEXT: %i1 = insertelement <4 x double>
; CHECK-NEXT: br label %end
entry:
%ld = load <2 x double>, <2 x double>* %tmp
%e1 = extractelement <2 x double> %ld, i32 0
%e2 = extractelement <2 x double> %ld, i32 1
br i1 %B, label %if, label %end
if:
; %e2 goes into lane 3, but only lane 1 of the merged vector is read in %end.
%i1 = insertelement <4 x double> zeroinitializer, double %e2, i32 3
br label %end
end:
%ph = phi <4 x double> [ undef, %entry ], [ %i1, %if ]
%e3 = extractelement <4 x double> %ph, i32 1
%mu = fmul double %e1, %e3
ret double %mu
}

View File

@ -136,3 +136,18 @@ define %B @structB(%B* %b.ptr) {
%1 = load %B, %B* %b.ptr, align 8
ret %B %1
}
%struct.S = type <{ i8, %struct.T }>
%struct.T = type { i32, i32 }
; Make sure that we do not increase alignment of packed struct element
; Loads the %struct.T member of a packed %struct.S (<{ i8, %struct.T }>, so T
; directly follows a single i8) with align 1 and returns T's second i32. The
; expected output is a direct i32 load of that field that still uses align 1.
define i32 @packed_alignment(%struct.S* dereferenceable(9) %s) {
; CHECK-LABEL: packed_alignment
; CHECK-NEXT: %tv.elt1 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1, i32 1
; CHECK-NEXT: %tv.unpack2 = load i32, i32* %tv.elt1, align 1
; CHECK-NEXT: ret i32 %tv.unpack2
%t = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1
; Aggregate load of the whole %struct.T; only field 1 is used afterwards.
%tv = load %struct.T, %struct.T* %t, align 1
%v = extractvalue %struct.T %tv, 1
ret i32 %v
}

View File

@ -205,39 +205,5 @@ for.body: ; preds = %for.body, %for.body
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @add_g
; CHECK: load <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: icmp ult <16 x i8>
; CHECK: select <16 x i1> {{.*}}, <16 x i8>
; CHECK: store <16 x i8>
; Byte loop over %p for %len iterations: each element is zero-extended to i32,
; xor-ed with 255, clamped to at most 24 with an unsigned compare + select,
; truncated back to i8 and stored to the same slot of %p. The value loaded from
; %q (%x8), the sign-extended %arg1 (%2) and the %r pointer are not used by the
; computation.
define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 {
; Skip the loop entirely when %len <= 0.
%1 = icmp sgt i32 %len, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0
%2 = sext i8 %arg1 to i64
br label %3
._crit_edge: ; preds = %3, %0
ret void
; <label>:3 ; preds = %3, %.lr.ph
%indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ]
%x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%x5 = load i8, i8* %x4
%x7 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
%x8 = load i8, i8* %x7
%x9 = zext i8 %x5 to i32
%x10 = xor i32 %x9, 255
; Unsigned min(%x10, 24).
%x11 = icmp ult i32 %x10, 24
%x12 = select i1 %x11, i32 %x10, i32 24
%x13 = trunc i32 %x12 to i8
store i8 %x13, i8* %x4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; Exit once the truncated induction variable equals %len.
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %._crit_edge, label %3
}
attributes #0 = { nounwind }

View File

@ -1302,3 +1302,35 @@ l6:
; CHECK: entry
; CHECK-NEXT: switch
}
; Speculation depth must be limited to avoid a zero-cost instruction cycle.
; CHECK-LABEL: @PR26308(
; CHECK: cleanup4:
; CHECK-NEXT: br label %cleanup4
; Control-flow skeleton with no return: %while.body, %cleanup and %cleanup4
; form an outer cycle, and %for.cond/%for.cond2 form a second cycle that is only
; entered from %cleanup. The expected output reduces %cleanup4 to a block that
; branches to itself.
define i32 @PR26308(i1 %B, i64 %load) {
entry:
br label %while.body
while.body:
br label %cleanup
cleanup:
; Single-entry phi of the constant false, so the branch below always takes the
; %cleanup4 edge.
%cleanup.dest.slot.0 = phi i1 [ false, %while.body ]
br i1 %cleanup.dest.slot.0, label %for.cond, label %cleanup4
for.cond:
%e.0 = phi i64* [ undef, %cleanup ], [ %incdec.ptr, %for.cond2 ]
%pi = ptrtoint i64* %e.0 to i64
%incdec.ptr = getelementptr inbounds i64, i64* %e.0, i64 1
br label %for.cond2
for.cond2:
; %storemerge has no users; it only ties %pi and %load into the inner cycle.
%storemerge = phi i64 [ %pi, %for.cond ], [ %load, %for.cond2 ]
br i1 %B, label %for.cond2, label %for.cond
cleanup4:
br label %while.body
}

View File

@ -25,7 +25,7 @@ if(NOT LLVM_USE_INTEL_JITEVENTS )
set(LLVM_TOOL_LLVM_JITLISTENER_BUILD Off)
endif()
if(CYGWIN)
if(CYGWIN OR NOT LLVM_ENABLE_PIC)
set(LLVM_TOOL_LTO_BUILD Off)
set(LLVM_TOOL_LLVM_LTO_BUILD Off)
endif()

View File

@ -155,9 +155,12 @@ while [ $# -gt 0 ]; do
done
if [ "$use_autoconf" = "no" ]; then
# See llvm.org/PR26146.
echo Skipping test-suite when using CMake.
do_test_suite="no"
if [ "$do_test_suite" = "yes" ]; then
# See llvm.org/PR26146.
echo Skipping test-suite build when using CMake.
echo It will still be exported.
do_test_suite="export-only"
fi
fi
# Check required arguments.
@ -202,9 +205,11 @@ if [ $do_libs = "yes" ]; then
projects="$projects libunwind"
fi
fi
if [ $do_test_suite = "yes" ]; then
projects="$projects test-suite"
fi
case $do_test_suite in
yes|export-only)
projects="$projects test-suite"
;;
esac
if [ $do_openmp = "yes" ]; then
projects="$projects openmp"
fi
@ -277,9 +282,16 @@ function export_sources() {
clang-tools-extra)
projsrc=llvm.src/tools/clang/tools/extra
;;
compiler-rt|libcxx|libcxxabi|libunwind|openmp|test-suite)
compiler-rt|libcxx|libcxxabi|libunwind|openmp)
projsrc=llvm.src/projects/$proj
;;
test-suite)
if [ $do_test_suite = 'yes' ]; then
projsrc=llvm.src/projects/$proj
else
projsrc=$proj.src
fi
;;
*)
echo "error: unknown project $proj"
exit 1

View File

@ -32,10 +32,6 @@ if (NOT LLVM_ENABLE_THREADS)
add_definitions( -DGTEST_HAS_PTHREAD=0 )
endif()
set(LIBS
LLVMSupport # Depends on llvm::raw_ostream
)
find_library(PTHREAD_LIBRARY_PATH pthread)
if (PTHREAD_LIBRARY_PATH)
list(APPEND LIBS pthread)
@ -46,6 +42,9 @@ add_llvm_library(gtest
LINK_LIBS
${LIBS}
LINK_COMPONENTS
Support # Depends on llvm::raw_ostream
)
add_subdirectory(UnitTestMain)

View File

@ -3,5 +3,7 @@ add_llvm_library(gtest_main
LINK_LIBS
gtest
LLVMSupport # Depends on llvm::cl
LINK_COMPONENTS
Support # Depends on llvm::cl
)