Vendor import of llvm release_40 branch r292732:

https://llvm.org/svn/llvm-project/llvm/branches/release_40@292732
This commit is contained in:
Dimitry Andric 2017-01-22 16:52:30 +00:00
parent 581a6d8501
commit 7c71d32ab5
19 changed files with 462 additions and 175 deletions

View File

@ -462,11 +462,9 @@ function(llvm_add_library name)
if(UNIX AND NOT APPLE AND NOT ARG_SONAME)
set_target_properties(${name}
PROPERTIES
# Concatenate the version numbers since ldconfig expects exactly
# one component indicating the ABI version, while LLVM uses
# major+minor for that.
SOVERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR}
VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX})
# Since 4.0.0, the ABI version is indicated by the major version
SOVERSION ${LLVM_VERSION_MAJOR}
VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX})
endif()
endif()

View File

@ -67,13 +67,46 @@ Non-comprehensive list of changes in this release
Makes programs 10x faster by doing Special New Thing.
Improvements to ThinLTO (-flto=thin)
------------------------------------
* Integration with profile data (PGO). When available, profile data
enables more accurate function importing decisions, as well as
cross-module indirect call promotion.
* Significant build-time and binary-size improvements when compiling with
debug info (-g).
Changes to the LLVM IR
----------------------
Changes to the ARM Backend
Changes to the ARM Targets
--------------------------
During this release ...
**During this release the AArch64 target has:**
* Gained support for ILP32 relocations.
* Gained support for XRay.
* Made even more progress on GlobalISel. There is still some work left before
it is production-ready though.
* Refined the support for Qualcomm's Falkor and Samsung's Exynos CPUs.
* Learned a few new tricks for lowering multiplications by constants, folding
spilled/refilled copies etc.
**During this release the ARM target has:**
* Gained support for ROPI (read-only position independence) and RWPI
(read-write position independence), which can be used to remove the need for
a dynamic linker.
* Gained support for execute-only code, which is placed in pages without read
permissions.
* Gained a machine scheduler for Cortex-R52.
* Gained support for XRay.
* Gained Thumb1 implementations for several compiler-rt builtins, as well as
some support for building the builtins for HF targets.
* Started using the generic bitreverse intrinsic instead of rbit.
* Gained very basic support for GlobalISel.
A lot of work has also been done in LLD for ARM, which now supports more
relocations and TLS.
Changes to the MIPS Target

View File

@ -1,11 +1,6 @@
Overview
========
.. warning::
If you are using a released version of LLVM, see `the download page
<http://llvm.org/releases/>`_ to find your documentation.
The LLVM compiler infrastructure supports a wide range of projects, from
industrial strength compilers to specialized JIT applications to small
research projects.

View File

@ -68,7 +68,10 @@ class AssumptionCache {
AffectedValuesMap AffectedValues;
/// Get the vector of assumptions which affect a value from the cache.
SmallVector<WeakVH, 1> &getAffectedValues(Value *V);
SmallVector<WeakVH, 1> &getOrInsertAffectedValues(Value *V);
/// Copy affected values in the cache for OV to be affected values for NV.
void copyAffectedValuesInCache(Value *OV, Value *NV);
/// \brief Flag tracking whether we have scanned the function yet.
///

View File

@ -24,7 +24,7 @@
using namespace llvm;
using namespace llvm::PatternMatch;
SmallVector<WeakVH, 1> &AssumptionCache::getAffectedValues(Value *V) {
SmallVector<WeakVH, 1> &AssumptionCache::getOrInsertAffectedValues(Value *V) {
// Try using find_as first to avoid creating extra value handles just for the
// purpose of doing the lookup.
auto AVI = AffectedValues.find_as(V);
@ -98,7 +98,7 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) {
}
for (auto &AV : Affected) {
auto &AVV = getAffectedValues(AV);
auto &AVV = getOrInsertAffectedValues(AV);
if (std::find(AVV.begin(), AVV.end(), CI) == AVV.end())
AVV.push_back(CI);
}
@ -111,20 +111,27 @@ void AssumptionCache::AffectedValueCallbackVH::deleted() {
// 'this' now dangles!
}
/// Merge the assumptions cached for \p OV into the cache entry for \p NV,
/// skipping any assumption that \p NV's entry already lists.
void AssumptionCache::copyAffectedValuesInCache(Value *OV, Value *NV) {
// Fetch NV's entry first, so that the iterator for OV below is obtained only
// after this call has finished.
// NOTE(review): presumably this call can insert a new entry and thereby grow
// the map, which is why it must precede the lookup of OV -- confirm.
auto &NAVV = getOrInsertAffectedValues(NV);
auto AVI = AffectedValues.find(OV);
// Nothing is cached for OV, so there is nothing to copy.
if (AVI == AffectedValues.end())
return;
// Append each of OV's cached assumptions that NV's list does not yet hold.
for (auto &A : AVI->second)
if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end())
NAVV.push_back(A);
}
void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) {
if (!isa<Instruction>(NV) && !isa<Argument>(NV))
return;
// Any assumptions that affected this value now affect the new value.
auto &NAVV = AC->getAffectedValues(NV);
auto AVI = AC->AffectedValues.find(getValPtr());
if (AVI == AC->AffectedValues.end())
return;
for (auto &A : AVI->second)
if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end())
NAVV.push_back(A);
AC->copyAffectedValuesInCache(getValPtr(), NV);
// 'this' now might dangle! If the AffectedValues map was resized to add an
// entry for NV then this object might have been destroyed in favor of some
// copy in the grown map.
}
void AssumptionCache::scanFunction() {

View File

@ -405,6 +405,7 @@ char ModuleSummaryIndexWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)

View File

@ -768,13 +768,12 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
unsigned ID, PlaceholderQueue &Placeholders) {
assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size());
assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString");
#ifndef NDEBUG
// Lookup first if the metadata hasn't already been loaded.
if (auto *MD = MetadataList.lookup(ID)) {
auto *N = dyn_cast_or_null<MDNode>(MD);
assert(N && N->isTemporary() && "Lazy loading an already loaded metadata");
if (!N->isTemporary())
return;
}
#endif
SmallVector<uint64_t, 64> Record;
StringRef Blob;
IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]);
@ -827,8 +826,22 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
auto getMD = [&](unsigned ID) -> Metadata * {
if (ID < MDStringRef.size())
return lazyLoadOneMDString(ID);
if (!IsDistinct)
if (!IsDistinct) {
if (auto *MD = MetadataList.lookup(ID))
return MD;
// If lazy-loading is enabled, we try recursively to load the operand
// instead of creating a temporary.
if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
// Create a temporary for the node that is referencing the operand we
// will lazy-load. It is needed before recursing in case there are
// uniquing cycles.
MetadataList.getMetadataFwdRef(NextMetadataNo);
lazyLoadOneMetadata(ID, Placeholders);
return MetadataList.lookup(ID);
}
// Return a temporary.
return MetadataList.getMetadataFwdRef(ID);
}
if (auto *MD = MetadataList.getMetadataIfResolved(ID))
return MD;
return &Placeholders.getPlaceholderOp(ID);

View File

@ -829,11 +829,22 @@ static std::string writeGeneratedObject(int count, StringRef CacheEntryPath,
// Main entry point for the ThinLTO processing
void ThinLTOCodeGenerator::run() {
// Prepare the resulting object vector
assert(ProducedBinaries.empty() && "The generator should not be reused");
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries.resize(Modules.size());
else {
sys::fs::create_directories(SavedObjectsDirectoryPath);
bool IsDir;
sys::fs::is_directory(SavedObjectsDirectoryPath, IsDir);
if (!IsDir)
report_fatal_error("Unexistent dir: '" + SavedObjectsDirectoryPath + "'");
ProducedBinaryFiles.resize(Modules.size());
}
if (CodeGenOnly) {
// Perform only parallel codegen and return.
ThreadPool Pool;
assert(ProducedBinaries.empty() && "The generator should not be reused");
ProducedBinaries.resize(Modules.size());
int count = 0;
for (auto &ModuleBuffer : Modules) {
Pool.async([&](int count) {
@ -845,7 +856,12 @@ void ThinLTOCodeGenerator::run() {
/*IsImporting*/ false);
// CodeGen
ProducedBinaries[count] = codegen(*TheModule);
auto OutputBuffer = codegen(*TheModule);
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries[count] = std::move(OutputBuffer);
else
ProducedBinaryFiles[count] = writeGeneratedObject(
count, "", SavedObjectsDirectoryPath, *OutputBuffer);
}, count++);
}
@ -866,18 +882,6 @@ void ThinLTOCodeGenerator::run() {
WriteIndexToFile(*Index, OS);
}
// Prepare the resulting object vector
assert(ProducedBinaries.empty() && "The generator should not be reused");
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries.resize(Modules.size());
else {
sys::fs::create_directories(SavedObjectsDirectoryPath);
bool IsDir;
sys::fs::is_directory(SavedObjectsDirectoryPath, IsDir);
if (!IsDir)
report_fatal_error("Unexistent dir: '" + SavedObjectsDirectoryPath + "'");
ProducedBinaryFiles.resize(Modules.size());
}
// Prepare the module map.
auto ModuleMap = generateModuleMap(Modules);

View File

@ -29455,19 +29455,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// Combine brcond/cmov/setcc/.. based on comparing the result of
/// atomic_load_add to use EFLAGS produced by the addition
/// directly if possible. For example:
///
/// (setcc (cmp (atomic_load_add x, -C) C), COND_E)
/// becomes:
/// (setcc (LADD x, -C), COND_E)
///
/// and
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// becomes:
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
///
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG) {
@ -29482,7 +29474,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (!Cmp.hasOneUse())
return SDValue();
// This applies to variations of the common case:
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
@ -29501,9 +29493,8 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
return SDValue();
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
return SDValue();
APInt Comparand = CmpRHSC->getAPIntValue();
const unsigned Opc = CmpLHS.getOpcode();
@ -29519,19 +29510,16 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
if (Comparand == -Addend) {
// No change to CC.
} else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
} else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
} else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
} else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
} else {
else
return SDValue();
}
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),

View File

@ -232,9 +232,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
assert((!isPMULLDSlow() || hasSSE41()) &&
"Feature Slow PMULLD can only be set on a subtarget with SSE4.1");
}
void X86Subtarget::initializeEnvironment() {

View File

@ -3163,6 +3163,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Don't bother if the instruction is in a BB which ends in an EHPad.
if (UseBB->getTerminator()->isEHPad())
continue;
// Don't bother rewriting PHIs in catchswitch blocks.
if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
continue;
// Ignore uses which are part of other SCEV expressions, to avoid
// analyzing them multiple times.
if (SE.isSCEVable(UserInst->getType())) {
@ -4672,7 +4675,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
// is the canonical backedge for this loop, which complicates post-inc
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
!isa<IndirectBrInst>(BB->getTerminator())) {
!isa<IndirectBrInst>(BB->getTerminator()) &&
!isa<CatchSwitchInst>(BB->getTerminator())) {
BasicBlock *Parent = PN->getParent();
Loop *PNLoop = LI.getLoopFor(Parent);
if (!PNLoop || Parent != PNLoop->getHeader()) {

View File

@ -81,6 +81,10 @@ STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same");
STATISTIC(NumGVNMaxIterations,
"Maximum Number of iterations it took to converge GVN");
STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
//===----------------------------------------------------------------------===//
// GVN Pass
@ -139,6 +143,10 @@ struct CongruenceClass {
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
// The most dominating leader after our current leader, because the member set
// is not sorted and is expensive to keep sorted all the time.
std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
explicit CongruenceClass(unsigned ID) : ID(ID) {}
CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
: ID(ID), RepLeader(Leader), DefiningExpr(E) {}
@ -320,8 +328,8 @@ private:
// Templated to allow them to work both on BB's and BB-edges.
template <class T>
Value *lookupOperandLeader(Value *, const User *, const T &) const;
void performCongruenceFinding(Value *, const Expression *);
void moveValueToNewCongruenceClass(Value *, CongruenceClass *,
void performCongruenceFinding(Instruction *, const Expression *);
void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
CongruenceClass *);
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
@ -1056,20 +1064,43 @@ void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
// Move a value, currently in OldClass, to be part of NewClass
// Update OldClass for the move (including changing leaders, etc)
void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass,
void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
DEBUG(dbgs() << "New congruence class for " << V << " is " << NewClass->ID
DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
<< "\n");
OldClass->Members.erase(V);
NewClass->Members.insert(V);
if (isa<StoreInst>(V)) {
if (I == OldClass->NextLeader.first)
OldClass->NextLeader = {nullptr, ~0U};
// The new instruction and new class leader may either be siblings in the
// dominator tree, or the new class leader should dominate the new member
// instruction. We simply check that the member instruction does not properly
// dominate the new class leader.
assert(
!isa<Instruction>(NewClass->RepLeader) || !NewClass->RepLeader ||
I == NewClass->RepLeader ||
!DT->properlyDominates(
I->getParent(),
cast<Instruction>(NewClass->RepLeader)->getParent()) &&
"New class for instruction should not be dominated by instruction");
if (NewClass->RepLeader != I) {
auto DFSNum = InstrDFS.lookup(I);
if (DFSNum < NewClass->NextLeader.second)
NewClass->NextLeader = {I, DFSNum};
}
OldClass->Members.erase(I);
NewClass->Members.insert(I);
if (isa<StoreInst>(I)) {
--OldClass->StoreCount;
assert(OldClass->StoreCount >= 0);
++NewClass->StoreCount;
assert(NewClass->StoreCount > 0);
}
ValueToClass[V] = NewClass;
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
if (OldClass->Members.empty() && OldClass != InitialClass) {
if (OldClass->DefiningExpr) {
@ -1078,25 +1109,48 @@ void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass,
<< " from table\n");
ExpressionToClass.erase(OldClass->DefiningExpr);
}
} else if (OldClass->RepLeader == V) {
} else if (OldClass->RepLeader == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
OldClass->RepLeader = *(OldClass->Members.begin());
DEBUG(dbgs() << "Leader change!\n");
++NumGVNLeaderChanges;
// We don't need to sort members if there is only 1, and we don't care about
// sorting the initial class because everything either gets out of it or is
// unreachable.
if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
OldClass->RepLeader = *(OldClass->Members.begin());
} else if (OldClass->NextLeader.first) {
++NumGVNAvoidedSortedLeaderChanges;
OldClass->RepLeader = OldClass->NextLeader.first;
OldClass->NextLeader = {nullptr, ~0U};
} else {
++NumGVNSortedLeaderChanges;
// TODO: If this ends up too slow, we can maintain a dual structure for
// member testing/insertion, or keep things mostly sorted, and sort only
// here, or ....
std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
for (const auto X : OldClass->Members) {
auto DFSNum = InstrDFS.lookup(X);
if (DFSNum < MinDFS.second)
MinDFS = {X, DFSNum};
}
OldClass->RepLeader = MinDFS.first;
}
markLeaderChangeTouched(OldClass);
}
}
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
ValueToExpression[V] = E;
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
// INITIAL.
CongruenceClass *VClass = ValueToClass[V];
assert(VClass && "Should have found a vclass");
CongruenceClass *IClass = ValueToClass[I];
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
assert(!VClass->Dead && "Found a dead class");
assert(!IClass->Dead && "Found a dead class");
CongruenceClass *EClass;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
@ -1118,13 +1172,13 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
NewClass->RepLeader =
lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
} else {
NewClass->RepLeader = V;
NewClass->RepLeader = I;
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *V
DEBUG(dbgs() << "Created new congruence class for " << *I
<< " using expression " << *E << " at " << NewClass->ID
<< " and leader " << *(NewClass->RepLeader) << "\n");
DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
@ -1140,36 +1194,31 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
assert(!EClass->Dead && "We accidentally looked up a dead class");
}
}
bool ClassChanged = VClass != EClass;
bool LeaderChanged = LeaderChanges.erase(V);
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
<< "\n");
if (ClassChanged)
moveValueToNewCongruenceClass(V, VClass, EClass);
markUsersTouched(V);
if (auto *I = dyn_cast<Instruction>(V)) {
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
// If this is a MemoryDef, we need to update the equivalence table. If
// we determined the expression is congruent to a different memory
// state, use that different memory state. If we determined it didn't,
// we update that as well. Right now, we only support store
// expressions.
if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
EClass->Members.size() != 1) {
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
} else {
setMemoryAccessEquivTo(MA, nullptr);
}
markMemoryUsersTouched(MA);
moveValueToNewCongruenceClass(I, IClass, EClass);
markUsersTouched(I);
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
// If this is a MemoryDef, we need to update the equivalence table. If
// we determined the expression is congruent to a different memory
// state, use that different memory state. If we determined it didn't,
// we update that as well. Right now, we only support store
// expressions.
if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
EClass->Members.size() != 1) {
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
} else {
setMemoryAccessEquivTo(MA, nullptr);
}
markMemoryUsersTouched(MA);
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(V)) {
} else if (auto *SI = dyn_cast<StoreInst>(I)) {
// There is, sadly, one complicating thing for stores. Stores do not
// produce values, only consume them. However, in order to make loads and
// stores value number the same, we ignore the value operand of the store.

View File

@ -5602,6 +5602,13 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// is consecutive-like, the pointer operand should remain uniform.
else if (hasConsecutiveLikePtrOperand(&I))
ConsecutiveLikePtrs.insert(Ptr);
// Otherwise, if the memory instruction will be vectorized and its
// pointer operand is non-consecutive-like, the memory instruction should
// be a gather or scatter operation. Its pointer operand will be
// non-uniform.
else
PossibleNonUniformPtrs.insert(Ptr);
}
// Add to the Worklist all consecutive and consecutive-like pointers that

View File

@ -192,68 +192,4 @@ entry:
ret i8 %s2
}
define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
; CHECK-LABEL: test_sub_1_setcc_eq:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: lock decq (%rdi)
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
%tmp1 = icmp eq i64 %tmp0, 1
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
define i8 @test_add_5_setcc_ne(i64* %p) #0 {
; CHECK-LABEL: test_add_5_setcc_ne:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: lock addq $5, (%rdi)
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
%tmp1 = icmp ne i64 %tmp0, -5
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
define i8 @test_add_5_setcc_ne_comparand_mismatch(i64* %p) #0 {
; CHECK-LABEL: test_add_5_setcc_ne_comparand_mismatch:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movl $5, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
%tmp1 = icmp ne i64 %tmp0, 0
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
declare void @g()
define zeroext i1 @test_sub_1_setcc_jcc(i64* %p) local_unnamed_addr #0 {
; TODO: It's possible to use "lock dec" here, but both uses of the cmp need to
; be updated.
; CHECK-LABEL: test_sub_1_setcc_jcc:
; CHECK: # BB#0: # %entry
; CHECK: movq $-1, %rax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: cmpq $1, %rax
; CHECK-NEXT: sete %bl
; CHECK-NEXT: jne
entry:
%add = atomicrmw volatile add i64* %p, i64 -1 seq_cst
%cmp = icmp ne i64 %add, 1
%not = xor i1 %cmp, true
br i1 %cmp, label %else, label %then
then:
tail call void @g()
br label %else
else:
ret i1 %not
}
attributes #0 = { nounwind }

View File

@ -4,6 +4,9 @@
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-64
; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-LABEL: foo:
; CHECK32: # BB#0:

View File

@ -17,7 +17,7 @@
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
; NOTLAZY: 58 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 8 bitcode-reader - Number of MDStrings loaded
; NOTLAZY: 6 bitcode-reader - Number of MDStrings loaded
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@ -48,7 +48,7 @@ define void @globalfunc3(i32 %arg) {
!3 = !{!"3"}
!4 = !{!"4"}
!5 = !{!"5"}
!6 = !{!"6"}
!6 = !{!9}
!7 = !{!"7"}
!8 = !{!"8"}
!9 = !{!"9"}
!9 = !{!6}

View File

@ -0,0 +1,58 @@
; RUN: opt -S -loop-reduce < %s | FileCheck %s
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc19.0.24215"
; @fn3 loops in %for.cond, invoking @fn1 with a counter that starts from the
; result of @fn2 and is incremented on every iteration. The loop's unwind edge
; goes to %catch.dispatch; both %catch.dispatch and %catch.dispatch2 end in a
; catchswitch, and %catch.dispatch2 additionally holds a phi that uses %call,
; which is defined outside the loop.
; The FileCheck expectations require the loop to be rewritten to use an
; induction variable seeded with %call + 1 that is passed directly to @fn1.
define void @fn3() personality i32 (...)* @__CxxFrameHandler3 {
entry:
%call = invoke i32 @fn2()
to label %for.cond.preheader unwind label %catch.dispatch2
for.cond.preheader: ; preds = %entry
br label %for.cond
; Loop body: the only exit from the loop is the unwind edge of the invoke.
for.cond: ; preds = %for.cond.preheader, %for.cond
%b.0 = phi i32 [ %inc, %for.cond ], [ %call, %for.cond.preheader ]
%inc = add nsw i32 %b.0, 1
invoke void @fn1(i32 %inc)
to label %for.cond unwind label %catch.dispatch
; CHECK: %[[add:.*]] = add i32 %call, 1
; CHECK: br label %for.cond
; CHECK: for.cond: ; preds = %for.cond, %for.cond.preheader
; CHECK: %[[lsr_iv:.*]] = phi i32 [ %lsr.iv.next, %for.cond ], [ %[[add]], %for.cond.preheader ]
; CHECK: %[[lsr_iv_next:.*]] = add i32 %lsr.iv, 1
; CHECK: invoke void @fn1(i32 %[[lsr_iv]])
; First catchswitch block; it unwinds onward to %catch.dispatch2.
catch.dispatch: ; preds = %for.cond
%0 = catchswitch within none [label %catch] unwind label %catch.dispatch2
catch: ; preds = %catch.dispatch
%1 = catchpad within %0 [i8* null, i32 64, i8* null]
invoke void @_CxxThrowException(i8* null, i8* null) #2 [ "funclet"(token %1) ]
to label %unreachable unwind label %catch.dispatch2
; Second catchswitch block; its phi receives %call from %catch and
; %catch.dispatch, and undef from %entry.
catch.dispatch2: ; preds = %catch.dispatch, %catch, %entry
%a.0 = phi i32 [ undef, %entry ], [ %call, %catch ], [ %call, %catch.dispatch ]
%2 = catchswitch within none [label %catch3] unwind to caller
catch3: ; preds = %catch.dispatch2
%3 = catchpad within %2 [i8* null, i32 64, i8* null]
call void @fn1(i32 %a.0) [ "funclet"(token %3) ]
catchret from %3 to label %try.cont4
try.cont4: ; preds = %catch3
ret void
unreachable: ; preds = %catch
unreachable
}
declare i32 @fn2()
declare i32 @__CxxFrameHandler3(...)
declare void @fn1(i32)
declare void @_CxxThrowException(i8*, i8*)

View File

@ -0,0 +1,56 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: PR31671
;
; Check a pointer in which one of its uses is consecutive-like and another of
; its uses is non-consecutive-like. In the test case below, %tmp3 is the
; pointer operand of an interleaved load, making it consecutive-like. However,
; it is also the pointer operand of a non-interleaved store that will become a
; scatter operation. %tmp3 (and the induction variable) should not be marked
; uniform-after-vectorization.
;
; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
; CHECK: vector.body:
; CHECK: %vec.ind = phi <16 x i64>
; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4
; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]]
; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>*
; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4
; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind
; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}})
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
; Each iteration reads element %i of field 3 of %d, scales it by %x, adds it to
; element %i of field 0, and stores the sum back to field 0. The index advances
; by 5 per iteration until it reaches 32000.
define void @PR31671(float %x, %data* %d) #0 {
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; Address and load of element %i in field 3 of %data.
%tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
%tmp1 = load float, float* %tmp0, align 4
%tmp2 = fmul float %x, %tmp1
; %tmp3 addresses element %i in field 0 and is used by both the load and the
; store below.
%tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
%tmp4 = load float, float* %tmp3, align 4
%tmp5 = fadd float %tmp4, %tmp2
store float %tmp5, float* %tmp3, align 4
; Stride of 5 elements per iteration.
%i.next = add nuw nsw i64 %i, 5
%cond = icmp slt i64 %i.next, 32000
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
attributes #0 = { "target-cpu"="knl" }

View File

@ -0,0 +1,135 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; Both of these tests are tests of phi nodes that end up all equivalent to each other
;; Without proper leader ordering, we will end up cycling the leader between all of them and never converge.
; Apart from %tmp, every phi in this function only forwards %tmp or another
; phi of the same cycle (%tmp3 -> %tmp5 -> %tmp8 -> %tmp15/%tmp17 -> ...).
; The expected output below keeps %tmp as the only remaining phi.
define void @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ 1, [[BB18:%.*]] ]
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: br i1 undef, label [[BB18]], label [[BB7:%.*]]
; CHECK: bb7:
; CHECK-NEXT: br label [[BB9:%.*]]
; CHECK: bb9:
; CHECK-NEXT: br i1 undef, label [[BB2]], label [[BB11:%.*]]
; CHECK: bb11:
; CHECK-NEXT: br i1 undef, label [[BB16:%.*]], label [[BB14:%.*]]
; CHECK: bb14:
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb16:
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb18:
; CHECK-NEXT: br label [[BB1]]
;
bb:
br label %bb1
; The only phi that merges two distinct constants.
bb1: ; preds = %bb18, %bb
%tmp = phi i32 [ 0, %bb ], [ 1, %bb18 ]
br label %bb2
bb2: ; preds = %bb9, %bb1
%tmp3 = phi i32 [ %tmp, %bb1 ], [ %tmp8, %bb9 ]
br label %bb4
bb4: ; preds = %bb14, %bb2
%tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp15, %bb14 ]
br i1 undef, label %bb18, label %bb7
bb7: ; preds = %bb16, %bb4
%tmp8 = phi i32 [ %tmp17, %bb16 ], [ %tmp5, %bb4 ]
br label %bb9
bb9: ; preds = %bb7
br i1 undef, label %bb2, label %bb11
bb11: ; preds = %bb9
br i1 undef, label %bb16, label %bb14
; %tmp15 and %tmp17 are single-entry phis that just forward %tmp8.
bb14: ; preds = %bb11
%tmp15 = phi i32 [ %tmp8, %bb11 ]
br label %bb4
bb16: ; preds = %bb11
%tmp17 = phi i32 [ %tmp8, %bb11 ]
br label %bb7
bb18: ; preds = %bb4
br label %bb1
}
%struct.a = type {}
%struct.b = type {}
declare void @c.d.p(i64, i8*)
; A loop through %h, %r and %s that stores to and reloads from the alloca %f.
; The expected output below keeps every load, store and call, and omits only
; the two unused bitcasts (%1 and %2).
; NOTE(review): no explicit phi appears in this function; presumably the phis
; this test is concerned with are memory phis at the merge blocks -- confirm.
define void @e() {
; CHECK-LABEL: @e(
; CHECK-NEXT: [[F:%.*]] = alloca i32
; CHECK-NEXT: store i32 undef, i32* [[F]], !g !0
; CHECK-NEXT: br label [[H:%.*]]
; CHECK: h:
; CHECK-NEXT: call void @c.d.p(i64 8, i8* undef)
; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[F]]
; CHECK-NEXT: [[J:%.*]] = load i32, i32* null
; CHECK-NEXT: [[K:%.*]] = icmp eq i32 [[I]], [[J]]
; CHECK-NEXT: br i1 [[K]], label [[L:%.*]], label [[Q:%.*]]
; CHECK: l:
; CHECK-NEXT: br label [[R:%.*]]
; CHECK: q:
; CHECK-NEXT: [[M:%.*]] = load %struct.a*, %struct.a** null
; CHECK-NEXT: br label [[R]]
; CHECK: r:
; CHECK-NEXT: switch i32 undef, label [[N:%.*]] [
; CHECK-NEXT: i32 0, label [[S:%.*]]
; CHECK-NEXT: ]
; CHECK: s:
; CHECK-NEXT: store i32 undef, i32* [[F]], !g !0
; CHECK-NEXT: br label [[H]]
; CHECK: n:
; CHECK-NEXT: [[O:%.*]] = load %struct.a*, %struct.a** null
; CHECK-NEXT: ret void
;
%f = alloca i32
store i32 undef, i32* %f, !g !0
br label %h
; Loop header: reached from the entry block and from %s.
h: ; preds = %s, %0
call void @c.d.p(i64 8, i8* undef)
%i = load i32, i32* %f
%j = load i32, i32* null
%k = icmp eq i32 %i, %j
br i1 %k, label %l, label %q
l: ; preds = %h
br label %r
q: ; preds = %h
%m = load %struct.a*, %struct.a** null
; %1 has no users.
%1 = bitcast %struct.a* %m to %struct.b*
br label %r
r: ; preds = %q, %l
switch i32 undef, label %n [
i32 0, label %s
]
; Loop latch: stores to %f again before branching back to %h.
s: ; preds = %r
store i32 undef, i32* %f, !g !0
br label %h
n: ; preds = %r
%o = load %struct.a*, %struct.a** null
; %2 has no users.
%2 = bitcast %struct.a* %o to %struct.b*
ret void
}
!0 = !{}