Vendor import of llvm release_40 branch r292732:

https://llvm.org/svn/llvm-project/llvm/branches/release_40@292732
Dimitry Andric 2017-01-22 16:52:30 +00:00
parent 581a6d8501
commit 7c71d32ab5
19 changed files with 462 additions and 175 deletions

View File

@ -462,11 +462,9 @@ function(llvm_add_library name)
if(UNIX AND NOT APPLE AND NOT ARG_SONAME)
set_target_properties(${name}
PROPERTIES
# Concatenate the version numbers since ldconfig expects exactly
# one component indicating the ABI version, while LLVM uses
# major+minor for that.
SOVERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR}
VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX})
# Since 4.0.0, the ABI version is indicated by the major version
SOVERSION ${LLVM_VERSION_MAJOR}
VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX})
endif()
endif()

View File

@ -67,13 +67,46 @@ Non-comprehensive list of changes in this release
Makes programs 10x faster by doing Special New Thing.
Improvements to ThinLTO (-flto=thin)
------------------------------------
* Integration with profile data (PGO). When available, profile data
enables more accurate function importing decisions, as well as
cross-module indirect call promotion.
* Significant build-time and binary-size improvements when compiling with
debug info (-g).
Changes to the LLVM IR
----------------------
Changes to the ARM Backend
Changes to the ARM Targets
--------------------------
During this release ...
**During this release the AArch64 target has:**
* Gained support for ILP32 relocations.
* Gained support for XRay.
* Made even more progress on GlobalISel. There is still some work left before
it is production-ready though.
* Refined the support for Qualcomm's Falkor and Samsung's Exynos CPUs.
* Learned a few new tricks for lowering multiplications by constants, folding
spilled/refilled copies etc.
**During this release the ARM target has:**
* Gained support for ROPI (read-only position independence) and RWPI
(read-write position independence), which can be used to remove the need for
a dynamic linker.
* Gained support for execute-only code, which is placed in pages without read
permissions.
* Gained a machine scheduler for Cortex-R52.
* Gained support for XRay.
* Gained Thumb1 implementations for several compiler-rt builtins. It also
has some support for building the builtins for HF targets.
* Started using the generic bitreverse intrinsic instead of rbit.
* Gained very basic support for GlobalISel.
A lot of work has also been done in LLD for ARM, which now supports more
relocations and TLS.
Changes to the MIPS Target

View File

@ -1,11 +1,6 @@
Overview
========
.. warning::
If you are using a released version of LLVM, see `the download page
<http://llvm.org/releases/>`_ to find your documentation.
The LLVM compiler infrastructure supports a wide range of projects, from
industrial strength compilers to specialized JIT applications to small
research projects.

View File

@ -68,7 +68,10 @@ class AssumptionCache {
AffectedValuesMap AffectedValues;
/// Get the vector of assumptions which affect a value from the cache.
SmallVector<WeakVH, 1> &getAffectedValues(Value *V);
SmallVector<WeakVH, 1> &getOrInsertAffectedValues(Value *V);
/// Copy affected values in the cache for OV to be affected values for NV.
void copyAffectedValuesInCache(Value *OV, Value *NV);
/// \brief Flag tracking whether we have scanned the function yet.
///

View File

@ -24,7 +24,7 @@
using namespace llvm;
using namespace llvm::PatternMatch;
SmallVector<WeakVH, 1> &AssumptionCache::getAffectedValues(Value *V) {
SmallVector<WeakVH, 1> &AssumptionCache::getOrInsertAffectedValues(Value *V) {
// Try using find_as first to avoid creating extra value handles just for the
// purpose of doing the lookup.
auto AVI = AffectedValues.find_as(V);
@ -98,7 +98,7 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) {
}
for (auto &AV : Affected) {
auto &AVV = getAffectedValues(AV);
auto &AVV = getOrInsertAffectedValues(AV);
if (std::find(AVV.begin(), AVV.end(), CI) == AVV.end())
AVV.push_back(CI);
}
@ -111,20 +111,27 @@ void AssumptionCache::AffectedValueCallbackVH::deleted() {
// 'this' now dangles!
}
void AssumptionCache::copyAffectedValuesInCache(Value *OV, Value *NV) {
auto &NAVV = getOrInsertAffectedValues(NV);
auto AVI = AffectedValues.find(OV);
if (AVI == AffectedValues.end())
return;
for (auto &A : AVI->second)
if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end())
NAVV.push_back(A);
}
void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) {
if (!isa<Instruction>(NV) && !isa<Argument>(NV))
return;
// Any assumptions that affected this value now affect the new value.
auto &NAVV = AC->getAffectedValues(NV);
auto AVI = AC->AffectedValues.find(getValPtr());
if (AVI == AC->AffectedValues.end())
return;
for (auto &A : AVI->second)
if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end())
NAVV.push_back(A);
AC->copyAffectedValuesInCache(getValPtr(), NV);
// 'this' now might dangle! If the AffectedValues map was resized to add an
// entry for NV then this object might have been destroyed in favor of some
// copy in the grown map.
}
void AssumptionCache::scanFunction() {
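
The hunks above replace the direct map manipulation in allUsesReplacedWith with a call to copyAffectedValuesInCache precisely because getOrInsertAffectedValues may grow the map that owns the callback handle itself, which is what the "'this' now might dangle" comment warns about. Below is a standalone C++ sketch of that hazard and of the safe pattern; it uses no LLVM types and every name in it is hypothetical.

// Standalone sketch (no LLVM types; every name is hypothetical) of the hazard
// behind the "'this' now might dangle" comment above: the callback handles
// live inside the cache's own container, so an insertion that grows that
// container can relocate or destroy the very handle whose method is running.
// The safe pattern mirrored by copyAffectedValuesInCache is to let the cache
// do the copy and to stop touching members of *this afterwards.
#include <string>
#include <vector>

class Cache;

struct Handle {                 // analogue of AffectedValueCallbackVH
  Cache *C;
  std::string Key;
  std::vector<int> Assumptions;
  void onKeyReplaced(const std::string &NewKey);
};

class Cache {                   // analogue of AssumptionCache
  std::vector<Handle> Handles;  // handles stored by value, like DenseMap keys

public:
  // Analogue of getOrInsertAffectedValues: growing Handles relocates elements
  // and invalidates any pointer or reference into it, including a handle's
  // own `this`.
  Handle &getOrInsert(const std::string &Key) {
    for (Handle &H : Handles)
      if (H.Key == Key)
        return H;
    Handles.push_back(Handle{this, Key, {}});
    return Handles.back();
  }

  // Analogue of copyAffectedValuesInCache: copy the source data out before
  // the insertion that may reallocate, and hold no references across it.
  void copyInCache(const std::string &From, const std::string &To) {
    std::vector<int> SourceCopy;
    for (const Handle &H : Handles)
      if (H.Key == From)
        SourceCopy = H.Assumptions;
    Handle &Dest = getOrInsert(To);  // may reallocate Handles
    Dest.Assumptions.insert(Dest.Assumptions.end(), SourceCopy.begin(),
                            SourceCopy.end());
  }
};

void Handle::onKeyReplaced(const std::string &NewKey) {
  // Delegate to the cache; after this call *this may have been relocated, so
  // no member of the handle is read again.
  C->copyInCache(Key, NewKey);
}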

View File

@ -405,6 +405,7 @@ char ModuleSummaryIndexWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)

View File

@ -768,13 +768,12 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
unsigned ID, PlaceholderQueue &Placeholders) {
assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size());
assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString");
#ifndef NDEBUG
// Lookup first if the metadata hasn't already been loaded.
if (auto *MD = MetadataList.lookup(ID)) {
auto *N = dyn_cast_or_null<MDNode>(MD);
assert(N && N->isTemporary() && "Lazy loading an already loaded metadata");
if (!N->isTemporary())
return;
}
#endif
SmallVector<uint64_t, 64> Record;
StringRef Blob;
IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]);
@ -827,8 +826,22 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
auto getMD = [&](unsigned ID) -> Metadata * {
if (ID < MDStringRef.size())
return lazyLoadOneMDString(ID);
if (!IsDistinct)
if (!IsDistinct) {
if (auto *MD = MetadataList.lookup(ID))
return MD;
// If lazy-loading is enabled, we try recursively to load the operand
// instead of creating a temporary.
if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
// Create a temporary for the node that is referencing the operand we
// will lazy-load. It is needed before recursing in case there are
// uniquing cycles.
MetadataList.getMetadataFwdRef(NextMetadataNo);
lazyLoadOneMetadata(ID, Placeholders);
return MetadataList.lookup(ID);
}
// Return a temporary.
return MetadataList.getMetadataFwdRef(ID);
}
if (auto *MD = MetadataList.getMetadataIfResolved(ID))
return MD;
return &Placeholders.getPlaceholderOp(ID);
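
The new branch above recursively lazy-loads an operand instead of leaving a temporary behind, and it registers a forward reference for the referencing node first so that uniquing cycles terminate. The following is a standalone sketch of that create-a-forward-reference-then-recurse pattern; the names are hypothetical and none of the real bitcode-reader structures are used.

#include <map>
#include <memory>
#include <utility>
#include <vector>

struct Node {
  unsigned ID = 0;
  bool Temporary = true;               // forward reference until fully loaded
  std::vector<Node *> Operands;
};

class LazyLoader {
  std::map<unsigned, std::unique_ptr<Node>> Loaded;        // analogue of MetadataList
  std::map<unsigned, std::vector<unsigned>> OperandIndex;  // operand IDs per node

  Node *getFwdRef(unsigned ID) {
    std::unique_ptr<Node> &Slot = Loaded[ID];
    if (!Slot) {
      Slot = std::make_unique<Node>();
      Slot->ID = ID;
    }
    return Slot.get();
  }

public:
  explicit LazyLoader(std::map<unsigned, std::vector<unsigned>> Index)
      : OperandIndex(std::move(Index)) {}

  Node *loadOne(unsigned ID) {
    // Reuse an existing node: either fully loaded, or a forward reference
    // created earlier on this recursion path. Returning the forward reference
    // is what terminates a uniquing cycle such as A -> B -> A.
    auto It = Loaded.find(ID);
    if (It != Loaded.end())
      return It->second.get();
    // Register the forward reference for ID *before* touching its operands,
    // mirroring the "Create a temporary ... before recursing" comment above.
    Node *N = getFwdRef(ID);
    for (unsigned OpID : OperandIndex[ID])
      N->Operands.push_back(loadOne(OpID));
    N->Temporary = false;              // all operands resolved
    return N;
  }
};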

View File

@ -829,11 +829,22 @@ static std::string writeGeneratedObject(int count, StringRef CacheEntryPath,
// Main entry point for the ThinLTO processing
void ThinLTOCodeGenerator::run() {
// Prepare the resulting object vector
assert(ProducedBinaries.empty() && "The generator should not be reused");
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries.resize(Modules.size());
else {
sys::fs::create_directories(SavedObjectsDirectoryPath);
bool IsDir;
sys::fs::is_directory(SavedObjectsDirectoryPath, IsDir);
if (!IsDir)
report_fatal_error("Unexistent dir: '" + SavedObjectsDirectoryPath + "'");
ProducedBinaryFiles.resize(Modules.size());
}
if (CodeGenOnly) {
// Perform only parallel codegen and return.
ThreadPool Pool;
assert(ProducedBinaries.empty() && "The generator should not be reused");
ProducedBinaries.resize(Modules.size());
int count = 0;
for (auto &ModuleBuffer : Modules) {
Pool.async([&](int count) {
@ -845,7 +856,12 @@ void ThinLTOCodeGenerator::run() {
/*IsImporting*/ false);
// CodeGen
ProducedBinaries[count] = codegen(*TheModule);
auto OutputBuffer = codegen(*TheModule);
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries[count] = std::move(OutputBuffer);
else
ProducedBinaryFiles[count] = writeGeneratedObject(
count, "", SavedObjectsDirectoryPath, *OutputBuffer);
}, count++);
}
@ -866,18 +882,6 @@ void ThinLTOCodeGenerator::run() {
WriteIndexToFile(*Index, OS);
}
// Prepare the resulting object vector
assert(ProducedBinaries.empty() && "The generator should not be reused");
if (SavedObjectsDirectoryPath.empty())
ProducedBinaries.resize(Modules.size());
else {
sys::fs::create_directories(SavedObjectsDirectoryPath);
bool IsDir;
sys::fs::is_directory(SavedObjectsDirectoryPath, IsDir);
if (!IsDir)
report_fatal_error("Unexistent dir: '" + SavedObjectsDirectoryPath + "'");
ProducedBinaryFiles.resize(Modules.size());
}
// Prepare the module map.
auto ModuleMap = generateModuleMap(Modules);
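
The change above hoists the "Prepare the resulting object vector" block, including the SavedObjectsDirectoryPath validation, ahead of the CodeGenOnly early return, so codegen-only runs can also emit object files to the requested directory. A rough standalone sketch of the resulting control flow follows, with hypothetical names and no LLVM API.

// Rough sketch (hypothetical names, not the ThinLTOCodeGenerator API) of the
// control flow after the move above: output containers and the optional
// output directory are prepared once, before the codegen-only early return,
// so both modes can either keep buffers in memory or emit files.
#include <cstddef>
#include <filesystem>
#include <stdexcept>
#include <string>
#include <vector>

struct Generator {
  std::vector<std::string> ProducedBinaries;     // in-memory objects
  std::vector<std::string> ProducedBinaryFiles;  // paths of objects on disk
  std::string SavedObjectsDirectoryPath;
  bool CodeGenOnly = false;

  void run(std::size_t NumModules) {
    // Prepare the resulting object vector (now done for every mode).
    if (SavedObjectsDirectoryPath.empty()) {
      ProducedBinaries.resize(NumModules);
    } else {
      std::filesystem::create_directories(SavedObjectsDirectoryPath);
      if (!std::filesystem::is_directory(SavedObjectsDirectoryPath))
        throw std::runtime_error("not a directory: " + SavedObjectsDirectoryPath);
      ProducedBinaryFiles.resize(NumModules);
    }

    if (CodeGenOnly) {
      // Parallel codegen only; each result lands in whichever container was
      // sized above (buffer in memory, or a file under the directory).
      return;
    }

    // ... full ThinLTO pipeline, with the same per-module choice of destination ...
  }
};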

View File

@ -29455,19 +29455,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// Combine brcond/cmov/setcc/.. based on comparing the result of
/// atomic_load_add to use EFLAGS produced by the addition
/// directly if possible. For example:
///
/// (setcc (cmp (atomic_load_add x, -C) C), COND_E)
/// becomes:
/// (setcc (LADD x, -C), COND_E)
///
/// and
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// becomes:
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
///
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG) {
@ -29482,7 +29474,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (!Cmp.hasOneUse())
return SDValue();
// This applies to variations of the common case:
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
@ -29501,9 +29493,8 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
return SDValue();
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC)
if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
return SDValue();
APInt Comparand = CmpRHSC->getAPIntValue();
const unsigned Opc = CmpLHS.getOpcode();
@ -29519,19 +29510,16 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
if (Comparand == -Addend) {
// No change to CC.
} else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
} else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
} else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
} else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
} else {
else
return SDValue();
}
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
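
The rewritten comment and the added zero-comparand check above narrow this fold to comparisons of the locked RMW result against zero (which is why the compare-against-one tests are deleted further down), reusing the EFLAGS produced by the LOCKed instruction. A small hedged C++ illustration of the source-level shapes this targets; whether the flag-reusing form is actually emitted still depends on the remaining legality checks and the target.

#include <atomic>

// The old value is only compared against zero, matching the
// (cmp (atomic_load_add x, 1), 0) + COND_S shape in the comment above, so the
// condition can be rewritten to COND_LE on the locked addition's flags
// (`lock add` + setle/jle) instead of xadd plus a separate compare.
bool was_negative_before_increment(std::atomic<long> &Counter) {
  return Counter.fetch_add(1, std::memory_order_seq_cst) < 0;
}

// The COND_NS case, which the code above rewrites to COND_G on the locked
// addition's flags.
bool was_nonnegative_before_increment(std::atomic<long> &Counter) {
  return Counter.fetch_add(1, std::memory_order_seq_cst) >= 0;
}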

View File

@ -232,9 +232,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
assert((!isPMULLDSlow() || hasSSE41()) &&
"Feature Slow PMULLD can only be set on a subtarget with SSE4.1");
}
void X86Subtarget::initializeEnvironment() {

View File

@ -3163,6 +3163,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Don't bother if the instruction is in a BB which ends in an EHPad.
if (UseBB->getTerminator()->isEHPad())
continue;
// Don't bother rewriting PHIs in catchswitch blocks.
if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
continue;
// Ignore uses which are part of other SCEV expressions, to avoid
// analyzing them multiple times.
if (SE.isSCEVable(UserInst->getType())) {
@ -4672,7 +4675,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
// is the canonical backedge for this loop, which complicates post-inc
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
!isa<IndirectBrInst>(BB->getTerminator())) {
!isa<IndirectBrInst>(BB->getTerminator()) &&
!isa<CatchSwitchInst>(BB->getTerminator())) {
BasicBlock *Parent = PN->getParent();
Loop *PNLoop = LI.getLoopFor(Parent);
if (!PNLoop || Parent != PNLoop->getHeader()) {

View File

@ -81,6 +81,10 @@ STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same");
STATISTIC(NumGVNMaxIterations,
"Maximum Number of iterations it took to converge GVN");
STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
//===----------------------------------------------------------------------===//
// GVN Pass
@ -139,6 +143,10 @@ struct CongruenceClass {
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
// The most dominating leader after our current leader, because the member set
// is not sorted and is expensive to keep sorted all the time.
std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
explicit CongruenceClass(unsigned ID) : ID(ID) {}
CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
: ID(ID), RepLeader(Leader), DefiningExpr(E) {}
@ -320,8 +328,8 @@ class NewGVN : public FunctionPass {
// Templated to allow them to work both on BB's and BB-edges.
template <class T>
Value *lookupOperandLeader(Value *, const User *, const T &) const;
void performCongruenceFinding(Value *, const Expression *);
void moveValueToNewCongruenceClass(Value *, CongruenceClass *,
void performCongruenceFinding(Instruction *, const Expression *);
void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
CongruenceClass *);
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
@ -1056,20 +1064,43 @@ void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
// Move a value, currently in OldClass, to be part of NewClass
// Update OldClass for the move (including changing leaders, etc)
void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass,
void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
DEBUG(dbgs() << "New congruence class for " << V << " is " << NewClass->ID
DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
<< "\n");
OldClass->Members.erase(V);
NewClass->Members.insert(V);
if (isa<StoreInst>(V)) {
if (I == OldClass->NextLeader.first)
OldClass->NextLeader = {nullptr, ~0U};
// The new instruction and new class leader may either be siblings in the
// dominator tree, or the new class leader should dominate the new member
// instruction. We simply check that the member instruction does not properly
// dominate the new class leader.
assert(
!isa<Instruction>(NewClass->RepLeader) || !NewClass->RepLeader ||
I == NewClass->RepLeader ||
!DT->properlyDominates(
I->getParent(),
cast<Instruction>(NewClass->RepLeader)->getParent()) &&
"New class for instruction should not be dominated by instruction");
if (NewClass->RepLeader != I) {
auto DFSNum = InstrDFS.lookup(I);
if (DFSNum < NewClass->NextLeader.second)
NewClass->NextLeader = {I, DFSNum};
}
OldClass->Members.erase(I);
NewClass->Members.insert(I);
if (isa<StoreInst>(I)) {
--OldClass->StoreCount;
assert(OldClass->StoreCount >= 0);
++NewClass->StoreCount;
assert(NewClass->StoreCount > 0);
}
ValueToClass[V] = NewClass;
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
if (OldClass->Members.empty() && OldClass != InitialClass) {
if (OldClass->DefiningExpr) {
@ -1078,25 +1109,48 @@ void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass,
<< " from table\n");
ExpressionToClass.erase(OldClass->DefiningExpr);
}
} else if (OldClass->RepLeader == V) {
} else if (OldClass->RepLeader == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
OldClass->RepLeader = *(OldClass->Members.begin());
DEBUG(dbgs() << "Leader change!\n");
++NumGVNLeaderChanges;
// We don't need to sort members if there is only 1, and we don't care about
// sorting the initial class because everything either gets out of it or is
// unreachable.
if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
OldClass->RepLeader = *(OldClass->Members.begin());
} else if (OldClass->NextLeader.first) {
++NumGVNAvoidedSortedLeaderChanges;
OldClass->RepLeader = OldClass->NextLeader.first;
OldClass->NextLeader = {nullptr, ~0U};
} else {
++NumGVNSortedLeaderChanges;
// TODO: If this ends up to slow, we can maintain a dual structure for
// member testing/insertion, or keep things mostly sorted, and sort only
// here, or ....
std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
for (const auto X : OldClass->Members) {
auto DFSNum = InstrDFS.lookup(X);
if (DFSNum < MinDFS.second)
MinDFS = {X, DFSNum};
}
OldClass->RepLeader = MinDFS.first;
}
markLeaderChangeTouched(OldClass);
}
}
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
ValueToExpression[V] = E;
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
// INITIAL.
CongruenceClass *VClass = ValueToClass[V];
assert(VClass && "Should have found a vclass");
CongruenceClass *IClass = ValueToClass[I];
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
assert(!VClass->Dead && "Found a dead class");
assert(!IClass->Dead && "Found a dead class");
CongruenceClass *EClass;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
@ -1118,13 +1172,13 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
NewClass->RepLeader =
lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
} else {
NewClass->RepLeader = V;
NewClass->RepLeader = I;
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *V
DEBUG(dbgs() << "Created new congruence class for " << *I
<< " using expression " << *E << " at " << NewClass->ID
<< " and leader " << *(NewClass->RepLeader) << "\n");
DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
@ -1140,36 +1194,31 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
assert(!EClass->Dead && "We accidentally looked up a dead class");
}
}
bool ClassChanged = VClass != EClass;
bool LeaderChanged = LeaderChanges.erase(V);
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
<< "\n");
if (ClassChanged)
moveValueToNewCongruenceClass(V, VClass, EClass);
markUsersTouched(V);
if (auto *I = dyn_cast<Instruction>(V)) {
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
// If this is a MemoryDef, we need to update the equivalence table. If
// we determined the expression is congruent to a different memory
// state, use that different memory state. If we determined it didn't,
// we update that as well. Right now, we only support store
// expressions.
if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
EClass->Members.size() != 1) {
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
} else {
setMemoryAccessEquivTo(MA, nullptr);
}
markMemoryUsersTouched(MA);
moveValueToNewCongruenceClass(I, IClass, EClass);
markUsersTouched(I);
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
// If this is a MemoryDef, we need to update the equivalence table. If
// we determined the expression is congruent to a different memory
// state, use that different memory state. If we determined it didn't,
// we update that as well. Right now, we only support store
// expressions.
if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
EClass->Members.size() != 1) {
auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
} else {
setMemoryAccessEquivTo(MA, nullptr);
}
markMemoryUsersTouched(MA);
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(V)) {
} else if (auto *SI = dyn_cast<StoreInst>(I)) {
// There is, sadly, one complicating thing for stores. Stores do not
// produce values, only consume them. However, in order to make loads and
// stores value number the same, we ignore the value operand of the store.
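
The new NextLeader field and statistics above exist so that when a class leader is moved out, the class can usually promote an already-remembered lowest-DFS member instead of scanning all members (the "sorted leader change" path). A simplified standalone sketch of that bookkeeping follows, with hypothetical names and without the initial-class and single-member special cases handled above.

#include <climits>
#include <unordered_map>
#include <unordered_set>
#include <utility>

struct CongruenceClassSketch {
  int Leader = -1;                                      // member id of the leader
  std::pair<int, unsigned> NextLeader = {-1, UINT_MAX}; // (member id, DFS number)
  std::unordered_set<int> Members;
};

class LeaderTracker {
  std::unordered_map<int, unsigned> DFSNum;             // instruction -> DFS order

public:
  explicit LeaderTracker(std::unordered_map<int, unsigned> DFS)
      : DFSNum(std::move(DFS)) {}

  void addMember(CongruenceClassSketch &C, int Member) {
    C.Members.insert(Member);
    // Remember the best candidate other than the leader itself, so a later
    // leader change is O(1) in the common case.
    if (Member != C.Leader && DFSNum[Member] < C.NextLeader.second)
      C.NextLeader = {Member, DFSNum[Member]};
  }

  void removeMember(CongruenceClassSketch &C, int Member) {
    C.Members.erase(Member);
    if (Member == C.NextLeader.first)
      C.NextLeader = {-1, UINT_MAX};                    // candidate left; forget it
    if (Member != C.Leader)
      return;
    // Leader change: prefer the remembered candidate; otherwise fall back to a
    // full scan for the member with the smallest DFS number (the "sorted
    // leader change" counted by the statistic above).
    if (C.NextLeader.first != -1) {
      C.Leader = C.NextLeader.first;
    } else {
      std::pair<int, unsigned> MinDFS = {-1, UINT_MAX};
      for (int M : C.Members)
        if (DFSNum[M] < MinDFS.second)
          MinDFS = {M, DFSNum[M]};
      C.Leader = MinDFS.first;
    }
    C.NextLeader = {-1, UINT_MAX};
  }
};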

View File

@ -5602,6 +5602,13 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// is consecutive-like, the pointer operand should remain uniform.
else if (hasConsecutiveLikePtrOperand(&I))
ConsecutiveLikePtrs.insert(Ptr);
// Otherwise, if the memory instruction will be vectorized and its
// pointer operand is non-consecutive-like, the memory instruction should
// be a gather or scatter operation. Its pointer operand will be
// non-uniform.
else
PossibleNonUniformPtrs.insert(Ptr);
}
// Add to the Worklist all consecutive and consecutive-like pointers that

View File

@ -192,68 +192,4 @@ entry:
ret i8 %s2
}
define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
; CHECK-LABEL: test_sub_1_setcc_eq:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: lock decq (%rdi)
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
%tmp1 = icmp eq i64 %tmp0, 1
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
define i8 @test_add_5_setcc_ne(i64* %p) #0 {
; CHECK-LABEL: test_add_5_setcc_ne:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: lock addq $5, (%rdi)
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
%tmp1 = icmp ne i64 %tmp0, -5
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
define i8 @test_add_5_setcc_ne_comparand_mismatch(i64* %p) #0 {
; CHECK-LABEL: test_add_5_setcc_ne_comparand_mismatch:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: movl $5, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 5 seq_cst
%tmp1 = icmp ne i64 %tmp0, 0
%tmp2 = zext i1 %tmp1 to i8
ret i8 %tmp2
}
declare void @g()
define zeroext i1 @test_sub_1_setcc_jcc(i64* %p) local_unnamed_addr #0 {
; TODO: It's possible to use "lock dec" here, but both uses of the cmp need to
; be updated.
; CHECK-LABEL: test_sub_1_setcc_jcc:
; CHECK: # BB#0: # %entry
; CHECK: movq $-1, %rax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: cmpq $1, %rax
; CHECK-NEXT: sete %bl
; CHECK-NEXT: jne
entry:
%add = atomicrmw volatile add i64* %p, i64 -1 seq_cst
%cmp = icmp ne i64 %add, 1
%not = xor i1 %cmp, true
br i1 %cmp, label %else, label %then
then:
tail call void @g()
br label %else
else:
ret i1 %not
}
attributes #0 = { nounwind }

View File

@ -4,6 +4,9 @@
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE4-64
; Make sure that the slow-pmulld feature can be used without SSE4.1.
; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-LABEL: foo:
; CHECK32: # BB#0:

View File

@ -17,7 +17,7 @@
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
; NOTLAZY: 58 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 8 bitcode-reader - Number of MDStrings loaded
; NOTLAZY: 6 bitcode-reader - Number of MDStrings loaded
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@ -48,7 +48,7 @@ define void @globalfunc3(i32 %arg) {
!3 = !{!"3"}
!4 = !{!"4"}
!5 = !{!"5"}
!6 = !{!"6"}
!6 = !{!9}
!7 = !{!"7"}
!8 = !{!"8"}
!9 = !{!"9"}
!9 = !{!6}

View File

@ -0,0 +1,58 @@
; RUN: opt -S -loop-reduce < %s | FileCheck %s
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc19.0.24215"
define void @fn3() personality i32 (...)* @__CxxFrameHandler3 {
entry:
%call = invoke i32 @fn2()
to label %for.cond.preheader unwind label %catch.dispatch2
for.cond.preheader: ; preds = %entry
br label %for.cond
for.cond: ; preds = %for.cond.preheader, %for.cond
%b.0 = phi i32 [ %inc, %for.cond ], [ %call, %for.cond.preheader ]
%inc = add nsw i32 %b.0, 1
invoke void @fn1(i32 %inc)
to label %for.cond unwind label %catch.dispatch
; CHECK: %[[add:.*]] = add i32 %call, 1
; CHECK: br label %for.cond
; CHECK: for.cond: ; preds = %for.cond, %for.cond.preheader
; CHECK: %[[lsr_iv:.*]] = phi i32 [ %lsr.iv.next, %for.cond ], [ %[[add]], %for.cond.preheader ]
; CHECK: %[[lsr_iv_next:.*]] = add i32 %lsr.iv, 1
; CHECK: invoke void @fn1(i32 %[[lsr_iv]])
catch.dispatch: ; preds = %for.cond
%0 = catchswitch within none [label %catch] unwind label %catch.dispatch2
catch: ; preds = %catch.dispatch
%1 = catchpad within %0 [i8* null, i32 64, i8* null]
invoke void @_CxxThrowException(i8* null, i8* null) #2 [ "funclet"(token %1) ]
to label %unreachable unwind label %catch.dispatch2
catch.dispatch2: ; preds = %catch.dispatch, %catch, %entry
%a.0 = phi i32 [ undef, %entry ], [ %call, %catch ], [ %call, %catch.dispatch ]
%2 = catchswitch within none [label %catch3] unwind to caller
catch3: ; preds = %catch.dispatch2
%3 = catchpad within %2 [i8* null, i32 64, i8* null]
call void @fn1(i32 %a.0) [ "funclet"(token %3) ]
catchret from %3 to label %try.cont4
try.cont4: ; preds = %catch3
ret void
unreachable: ; preds = %catch
unreachable
}
declare i32 @fn2()
declare i32 @__CxxFrameHandler3(...)
declare void @fn1(i32)
declare void @_CxxThrowException(i8*, i8*)

View File

@ -0,0 +1,56 @@
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: PR31671
;
; Check a pointer in which one of its uses is consecutive-like and another of
; its uses is non-consecutive-like. In the test case below, %tmp3 is the
; pointer operand of an interleaved load, making it consecutive-like. However,
; it is also the pointer operand of a non-interleaved store that will become a
; scatter operation. %tmp3 (and the induction variable) should not be marked
; uniform-after-vectorization.
;
; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
; CHECK: vector.body:
; CHECK: %vec.ind = phi <16 x i64>
; CHECK: %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4
; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]]
; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>*
; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4
; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind
; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}})
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
define void @PR31671(float %x, %data* %d) #0 {
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
%tmp1 = load float, float* %tmp0, align 4
%tmp2 = fmul float %x, %tmp1
%tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
%tmp4 = load float, float* %tmp3, align 4
%tmp5 = fadd float %tmp4, %tmp2
store float %tmp5, float* %tmp3, align 4
%i.next = add nuw nsw i64 %i, 5
%cond = icmp slt i64 %i.next, 32000
br i1 %cond, label %for.body, label %for.end
for.end:
ret void
}
attributes #0 = { "target-cpu"="knl" }

View File

@ -0,0 +1,135 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
;; Both of these tests are tests of phi nodes that end up all equivalent to each other
;; Without proper leader ordering, we will end up cycling the leader between all of them and never converge.
define void @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ 1, [[BB18:%.*]] ]
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: br i1 undef, label [[BB18]], label [[BB7:%.*]]
; CHECK: bb7:
; CHECK-NEXT: br label [[BB9:%.*]]
; CHECK: bb9:
; CHECK-NEXT: br i1 undef, label [[BB2]], label [[BB11:%.*]]
; CHECK: bb11:
; CHECK-NEXT: br i1 undef, label [[BB16:%.*]], label [[BB14:%.*]]
; CHECK: bb14:
; CHECK-NEXT: br label [[BB4]]
; CHECK: bb16:
; CHECK-NEXT: br label [[BB7]]
; CHECK: bb18:
; CHECK-NEXT: br label [[BB1]]
;
bb:
br label %bb1
bb1: ; preds = %bb18, %bb
%tmp = phi i32 [ 0, %bb ], [ 1, %bb18 ]
br label %bb2
bb2: ; preds = %bb9, %bb1
%tmp3 = phi i32 [ %tmp, %bb1 ], [ %tmp8, %bb9 ]
br label %bb4
bb4: ; preds = %bb14, %bb2
%tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp15, %bb14 ]
br i1 undef, label %bb18, label %bb7
bb7: ; preds = %bb16, %bb4
%tmp8 = phi i32 [ %tmp17, %bb16 ], [ %tmp5, %bb4 ]
br label %bb9
bb9: ; preds = %bb7
br i1 undef, label %bb2, label %bb11
bb11: ; preds = %bb9
br i1 undef, label %bb16, label %bb14
bb14: ; preds = %bb11
%tmp15 = phi i32 [ %tmp8, %bb11 ]
br label %bb4
bb16: ; preds = %bb11
%tmp17 = phi i32 [ %tmp8, %bb11 ]
br label %bb7
bb18: ; preds = %bb4
br label %bb1
}
%struct.a = type {}
%struct.b = type {}
declare void @c.d.p(i64, i8*)
define void @e() {
; CHECK-LABEL: @e(
; CHECK-NEXT: [[F:%.*]] = alloca i32
; CHECK-NEXT: store i32 undef, i32* [[F]], !g !0
; CHECK-NEXT: br label [[H:%.*]]
; CHECK: h:
; CHECK-NEXT: call void @c.d.p(i64 8, i8* undef)
; CHECK-NEXT: [[I:%.*]] = load i32, i32* [[F]]
; CHECK-NEXT: [[J:%.*]] = load i32, i32* null
; CHECK-NEXT: [[K:%.*]] = icmp eq i32 [[I]], [[J]]
; CHECK-NEXT: br i1 [[K]], label [[L:%.*]], label [[Q:%.*]]
; CHECK: l:
; CHECK-NEXT: br label [[R:%.*]]
; CHECK: q:
; CHECK-NEXT: [[M:%.*]] = load %struct.a*, %struct.a** null
; CHECK-NEXT: br label [[R]]
; CHECK: r:
; CHECK-NEXT: switch i32 undef, label [[N:%.*]] [
; CHECK-NEXT: i32 0, label [[S:%.*]]
; CHECK-NEXT: ]
; CHECK: s:
; CHECK-NEXT: store i32 undef, i32* [[F]], !g !0
; CHECK-NEXT: br label [[H]]
; CHECK: n:
; CHECK-NEXT: [[O:%.*]] = load %struct.a*, %struct.a** null
; CHECK-NEXT: ret void
;
%f = alloca i32
store i32 undef, i32* %f, !g !0
br label %h
h: ; preds = %s, %0
call void @c.d.p(i64 8, i8* undef)
%i = load i32, i32* %f
%j = load i32, i32* null
%k = icmp eq i32 %i, %j
br i1 %k, label %l, label %q
l: ; preds = %h
br label %r
q: ; preds = %h
%m = load %struct.a*, %struct.a** null
%1 = bitcast %struct.a* %m to %struct.b*
br label %r
r: ; preds = %q, %l
switch i32 undef, label %n [
i32 0, label %s
]
s: ; preds = %r
store i32 undef, i32* %f, !g !0
br label %h
n: ; preds = %r
%o = load %struct.a*, %struct.a** null
%2 = bitcast %struct.a* %o to %struct.b*
ret void
}
!0 = !{}