Vendor import of llvm release_60 branch r323338:

https://llvm.org/svn/llvm-project/llvm/branches/release_60@323338
2018-01-24 20:23:48 +00:00 · 2018-01-24 20:23:48 +00:00 · a096e0bdf6
commit a096e0bdf6
parent d215fd3b74
74 changed files with 2673 additions and 593 deletions
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@ -37,6 +37,8 @@ set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)

 set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@)

+set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@)
+
 set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@)

 set(LLVM_NATIVE_ARCH @LLVM_NATIVE_ARCH@)
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@ -54,6 +54,8 @@ Non-comprehensive list of changes in this release
  ``DIVariables`` to the instructions in a ``Module``. The ``CheckDebugify``
  pass determines how much of the metadata is lost.

+* Significantly improved quality of CodeView debug info for Windows.
+
 * Note..

 .. NOTE
@ -69,10 +71,13 @@ Non-comprehensive list of changes in this release
 Changes to the LLVM IR
 ----------------------

-Changes to the ARM Backend
--------------------------
+Changes to the ARM Target
+-------------------------

- During this release ...
+During this release the ARM target has:
+
+* Got support for enabling SjLj exception handling on platforms where it
+  isn't the default.


 Changes to the MIPS Target
@ -89,7 +94,10 @@ Changes to the PowerPC Target
 Changes to the X86 Target
 -------------------------

- During this release ...
+During this release ...
+
+* Got support for enabling SjLj exception handling on platforms where it
+  isn't the default.

 Changes to the AMDGPU Target
 -----------------------------
@ -116,8 +124,46 @@ Changes to the C API
 External Open Source Projects Using LLVM 6
 ==========================================

-* A project...
+JFS - JIT Fuzzing Solver
+------------------------

+`JFS <https://github.com/delcypher/jfs>`_ is an experimental constraint solver
+designed to investigate using coverage guided fuzzing as an incomplete strategy
+for solving boolean, BitVector, and floating-point constraints.
+It is built on top of LLVM, Clang, LibFuzzer, and Z3.
+
+The solver works by generating a C++ program where the reachability of an
+`abort()` statement is equivalent to finding a satisfying assignment to the
+constraints. This program is then compiled by Clang with `SanitizerCoverage
+<https://releases.llvm.org/6.0.0/tools/clang/docs/SanitizerCoverage.html>`_
+instrumentation and then fuzzed using :doc:`LibFuzzer <LibFuzzer>`.
+
+Zig Programming Language
+------------------------
+
+`Zig <http://ziglang.org>`_  is an open-source programming language designed
+for robustness, optimality, and clarity. It is intended to replace C. It
+provides high level features such as Generics,
+Compile Time Function Execution, and Partial Evaluation, yet exposes low level
+LLVM IR features such as Aliases. Zig uses Clang to provide automatic
+import of .h symbols - even inline functions and macros. Zig uses LLD combined
+with lazily building compiler-rt to provide out-of-the-box cross-compiling for
+all supported targets.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.

 Additional Information
 ======================
--- a/include/llvm/Analysis/RegionInfoImpl.h
+++ b/include/llvm/Analysis/RegionInfoImpl.h
@ -254,23 +254,23 @@ std::string RegionBase<Tr>::getNameStr() const {
 template <class Tr>
 void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
  if (!contains(BB))
-    llvm_unreachable("Broken region found: enumerated BB not in region!");
+    report_fatal_error("Broken region found: enumerated BB not in region!");

  BlockT *entry = getEntry(), *exit = getExit();

  for (BlockT *Succ :
       make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
    if (!contains(Succ) && exit != Succ)
-      llvm_unreachable("Broken region found: edges leaving the region must go "
-                       "to the exit node!");
+      report_fatal_error("Broken region found: edges leaving the region must go "
+                         "to the exit node!");
  }

  if (entry != BB) {
    for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
                                   InvBlockTraits::child_end(BB))) {
      if (!contains(Pred))
-        llvm_unreachable("Broken region found: edges entering the region must "
-                         "go to the entry node!");
+        report_fatal_error("Broken region found: edges entering the region must "
+                           "go to the entry node!");
    }
  }
 }
@ -557,7 +557,7 @@ void RegionInfoBase<Tr>::verifyBBMap(const RegionT *R) const {
    } else {
      BlockT *BB = Element->template getNodeAs<BlockT>();
      if (getRegionFor(BB) != R)
-        llvm_unreachable("BB map does not match region nesting");
+        report_fatal_error("BB map does not match region nesting");
    }
  }
 }
--- a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@ -56,7 +56,7 @@ public:
                      int64_t &Off);

  /// Parses tree in Ptr for base, index, offset addresses.
-  static BaseIndexOffset match(SDValue Ptr, const SelectionDAG &DAG);
+  static BaseIndexOffset match(LSBaseSDNode *N, const SelectionDAG &DAG);
 };

 } // end namespace llvm
--- a/include/llvm/MC/MCCodeView.h
+++ b/include/llvm/MC/MCCodeView.h
@ -177,13 +177,7 @@ public:
                               unsigned IACol);

  /// Retreive the function info if this is a valid function id, or nullptr.
-  MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId) {
-    if (FuncId >= Functions.size())
-      return nullptr;
-    if (Functions[FuncId].isUnallocatedFunctionInfo())
-      return nullptr;
-    return &Functions[FuncId];
-  }
+  MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId);

  /// Saves the information from the currently parsed .cv_loc directive
  /// and sets CVLocSeen.  When the next instruction is assembled an entry
@ -199,50 +193,22 @@ public:
    CurrentCVLoc.setIsStmt(IsStmt);
    CVLocSeen = true;
  }
-  void clearCVLocSeen() { CVLocSeen = false; }

  bool getCVLocSeen() { return CVLocSeen; }
+  void clearCVLocSeen() { CVLocSeen = false; }
+
  const MCCVLoc &getCurrentCVLoc() { return CurrentCVLoc; }

  bool isValidCVFileNumber(unsigned FileNumber);

  /// \brief Add a line entry.
-  void addLineEntry(const MCCVLineEntry &LineEntry) {
-    size_t Offset = MCCVLines.size();
-    auto I = MCCVLineStartStop.insert(
-        {LineEntry.getFunctionId(), {Offset, Offset + 1}});
-    if (!I.second)
-      I.first->second.second = Offset + 1;
-    MCCVLines.push_back(LineEntry);
-  }
+  void addLineEntry(const MCCVLineEntry &LineEntry);

-  std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId) {
-    std::vector<MCCVLineEntry> FilteredLines;
+  std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId);

-    auto I = MCCVLineStartStop.find(FuncId);
-    if (I != MCCVLineStartStop.end())
-      for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
-           ++Idx)
-        if (MCCVLines[Idx].getFunctionId() == FuncId)
-          FilteredLines.push_back(MCCVLines[Idx]);
-    return FilteredLines;
-  }
+  std::pair<size_t, size_t> getLineExtent(unsigned FuncId);

-  std::pair<size_t, size_t> getLineExtent(unsigned FuncId) {
-    auto I = MCCVLineStartStop.find(FuncId);
-    // Return an empty extent if there are no cv_locs for this function id.
-    if (I == MCCVLineStartStop.end())
-      return {~0ULL, 0};
-    return I->second;
-  }
-
-  ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R) {
-    if (R <= L)
-      return None;
-    if (L >= MCCVLines.size())
-      return None;
-    return makeArrayRef(&MCCVLines[L], R - L);
-  }
+  ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R);

  /// Emits a line table substream.
  void emitLineTableForFunction(MCObjectStreamer &OS, unsigned FuncId,
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@ -628,7 +628,7 @@ struct SemiNCAInfo {
        DecreasingLevel>
        Bucket;  // Queue of tree nodes sorted by level in descending order.
    SmallDenseSet<TreeNodePtr, 8> Affected;
-    SmallDenseSet<TreeNodePtr, 8> Visited;
+    SmallDenseMap<TreeNodePtr, unsigned, 8> Visited;
    SmallVector<TreeNodePtr, 8> AffectedQueue;
    SmallVector<TreeNodePtr, 8> VisitedNotAffectedQueue;
  };
@ -706,7 +706,7 @@ struct SemiNCAInfo {
      // algorithm does not really know or use the set of roots and can make a
      // different (implicit) decision about which nodes within an infinite loop
      // becomes a root.
-      if (DT.isVirtualRoot(TN->getIDom())) {
+      if (TN && !DT.isVirtualRoot(TN->getIDom())) {
        DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
                     << " is not virtual root's child\n"
                     << "The entire tree needs to be rebuilt\n");
@ -753,14 +753,16 @@ struct SemiNCAInfo {

    while (!II.Bucket.empty()) {
      const TreeNodePtr CurrentNode = II.Bucket.top().second;
+      const unsigned  CurrentLevel = CurrentNode->getLevel();
      II.Bucket.pop();
      DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
                   << BlockNamePrinter(CurrentNode) << "\n");
-      II.Visited.insert(CurrentNode);
+
+      II.Visited.insert({CurrentNode, CurrentLevel});
      II.AffectedQueue.push_back(CurrentNode);

      // Discover and collect affected successors of the current node.
-      VisitInsertion(DT, BUI, CurrentNode, CurrentNode->getLevel(), NCD, II);
+      VisitInsertion(DT, BUI, CurrentNode, CurrentLevel, NCD, II);
    }

    // Finish by updating immediate dominators and levels.
@ -772,13 +774,17 @@ struct SemiNCAInfo {
                             const TreeNodePtr TN, const unsigned RootLevel,
                             const TreeNodePtr NCD, InsertionInfo &II) {
    const unsigned NCDLevel = NCD->getLevel();
-    DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n");
+    DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ",  RootLevel "
+                 << RootLevel << "\n");

    SmallVector<TreeNodePtr, 8> Stack = {TN};
    assert(TN->getBlock() && II.Visited.count(TN) && "Preconditions!");

+    SmallPtrSet<TreeNodePtr, 8> Processed;
+
    do {
      TreeNodePtr Next = Stack.pop_back_val();
+      DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");

      for (const NodePtr Succ :
           ChildrenGetter<IsPostDom>::Get(Next->getBlock(), BUI)) {
@ -786,19 +792,31 @@ struct SemiNCAInfo {
        assert(SuccTN && "Unreachable successor found at reachable insertion");
        const unsigned SuccLevel = SuccTN->getLevel();

-        DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ)
-                     << ", level = " << SuccLevel << "\n");
+        DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) << ", level = "
+                     << SuccLevel << "\n");
+
+        // Do not process the same node multiple times.
+        if (Processed.count(Next) > 0)
+          continue;

        // Succ dominated by subtree From -- not affected.
        // (Based on the lemma 2.5 from the second paper.)
        if (SuccLevel > RootLevel) {
          DEBUG(dbgs() << "\t\tDominated by subtree From\n");
-          if (II.Visited.count(SuccTN) != 0)
-            continue;
+          if (II.Visited.count(SuccTN) != 0) {
+            DEBUG(dbgs() << "\t\t\talready visited at level "
+                         << II.Visited[SuccTN] << "\n\t\t\tcurrent level "
+                         << RootLevel << ")\n");
+
+            // A node can be necessary to visit again if we see it again at
+            // a lower level than before.
+            if (II.Visited[SuccTN] >= RootLevel)
+              continue;
+          }

          DEBUG(dbgs() << "\t\tMarking visited not affected "
                       << BlockNamePrinter(Succ) << "\n");
-          II.Visited.insert(SuccTN);
+          II.Visited.insert({SuccTN, RootLevel});
          II.VisitedNotAffectedQueue.push_back(SuccTN);
          Stack.push_back(SuccTN);
        } else if ((SuccLevel > NCDLevel + 1) &&
@ -809,6 +827,8 @@ struct SemiNCAInfo {
          II.Bucket.push({SuccLevel, SuccTN});
        }
      }
+
+      Processed.insert(Next);
    } while (!Stack.empty());
  }

@ -920,21 +940,21 @@ struct SemiNCAInfo {
    const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To);
    const TreeNodePtr NCD = DT.getNode(NCDBlock);

-    // To dominates From -- nothing to do.
-    if (ToTN == NCD) return;
+    // If To dominates From -- nothing to do.
+    if (ToTN != NCD) {
+      DT.DFSInfoValid = false;

-    DT.DFSInfoValid = false;
+      const TreeNodePtr ToIDom = ToTN->getIDom();
+      DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
+                   << BlockNamePrinter(ToIDom) << "\n");

-    const TreeNodePtr ToIDom = ToTN->getIDom();
-    DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
-                 << BlockNamePrinter(ToIDom) << "\n");
-
-    // To remains reachable after deletion.
-    // (Based on the caption under Figure 4. from the second paper.)
-    if (FromTN != ToIDom || HasProperSupport(DT, BUI, ToTN))
-      DeleteReachable(DT, BUI, FromTN, ToTN);
-    else
-      DeleteUnreachable(DT, BUI, ToTN);
+      // To remains reachable after deletion.
+      // (Based on the caption under Figure 4. from the second paper.)
+      if (FromTN != ToIDom || HasProperSupport(DT, BUI, ToTN))
+        DeleteReachable(DT, BUI, FromTN, ToTN);
+      else
+        DeleteUnreachable(DT, BUI, ToTN);
+    }

    if (IsPostDom) UpdateRootsAfterUpdate(DT, BUI);
  }
--- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@ -95,14 +95,9 @@ private:
  bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);

  /// \brief Try to vectorize a list of operands.
-  /// \@param BuildVector A list of users to ignore for the purpose of
-  ///                     scheduling and cost estimation when NeedExtraction
-  ///                     is false.
  /// \returns true if a value was vectorized.
  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          ArrayRef<Value *> BuildVector = None,
-                          bool AllowReorder = false,
-                          bool NeedExtraction = false);
+                          bool AllowReorder = false);

  /// \brief Try to vectorize a chain that may start at the operands of \p I.
  bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@ -2700,8 +2700,13 @@ public:
    // we still need to collect it due to original value is different.
    // And later we will need all original values as anchors during
    // finding the common Phi node.
+    // We also must reject the case when base offset is different and
+    // scale reg is not null, we cannot handle this case due to merge of
+    // different offsets will be used as ScaleReg.
    if (DifferentField != ExtAddrMode::MultipleFields &&
-        DifferentField != ExtAddrMode::ScaleField) {
+        DifferentField != ExtAddrMode::ScaleField &&
+        (DifferentField != ExtAddrMode::BaseOffsField ||
+         !NewAddrMode.ScaledReg)) {
      AddrModes.emplace_back(NewAddrMode);
      return true;
    }
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@ -577,7 +577,8 @@ bool GlobalMerge::doInitialization(Module &M) {
  for (auto &GV : M.globals()) {
    // Merge is safe for "normal" internal or external globals only
    if (GV.isDeclaration() || GV.isThreadLocal() ||
-        GV.hasSection() || GV.hasImplicitSection())
+        GV.hasSection() || GV.hasImplicitSection() ||
+        GV.hasDLLExportStorageClass())
      continue;

    // It's not safe to merge globals that may be preempted
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@ -719,15 +719,14 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
    CurSrcPair = Pair;
    ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
                            !DisableAdvCopyOpt, TII);
-    ValueTrackerResult Res;
-    bool ShouldRewrite = false;

-    do {
-      // Follow the chain of copies until we reach the top of the use-def chain
-      // or find a more suitable source.
-      Res = ValTracker.getNextSource();
+    // Follow the chain of copies until we find a more suitable source, a phi
+    // or have to abort.
+    while (true) {
+      ValueTrackerResult Res = ValTracker.getNextSource();
+      // Abort at the end of a chain (without finding a suitable source).
      if (!Res.isValid())
-        break;
+        return false;

      // Insert the Def -> Use entry for the recently found source.
      ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
@ -763,24 +762,19 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
      if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
        return false;

+      // Keep following the chain if the value isn't any better yet.
      const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
-      ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
-                                                CurSrcPair.SubReg);
-    } while (!ShouldRewrite);
+      if (!TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC, CurSrcPair.SubReg))
+        continue;

-    // Continue looking for new sources...
-    if (Res.isValid())
-      continue;
+      // We currently cannot deal with subreg operands on PHI instructions
+      // (see insertPHI()).
+      if (PHICount > 0 && CurSrcPair.SubReg != 0)
+        continue;

-    // Do not continue searching for a new source if the there's at least
-    // one use-def which cannot be rewritten.
-    if (!ShouldRewrite)
-      return false;
-  }
-
-  if (PHICount >= RewritePHILimit) {
-    DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
-    return false;
+      // We found a suitable source, and are done with this chain.
+      break;
+    }
  }

  // If we did not find a more suitable source, there is nothing to optimize.
@ -799,6 +793,9 @@ insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
  assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");

  const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+  // NewRC is only correct if no subregisters are involved. findNextSource()
+  // should have rejected those cases already.
+  assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
  unsigned NewVR = MRI->createVirtualRegister(NewRC);
  MachineBasicBlock *MBB = OrigPHI->getParent();
  MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -3842,9 +3842,16 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
      EVT ExtVT;
      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
          isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
-        // Only add this load if we can make it more narrow.
-        if (ExtVT.bitsLT(Load->getMemoryVT()))
+
+        // ZEXTLOAD is already small enough.
+        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+            ExtVT.bitsGE(Load->getMemoryVT()))
+          continue;
+
+        // Use LE to convert equal sized loads to zext.
+        if (ExtVT.bitsLE(Load->getMemoryVT()))
          Loads.insert(Load);
+
        continue;
      }
      return false;
@ -3899,11 +3906,13 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
    if (Loads.size() == 0)
      return false;

+    DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
    SDValue MaskOp = N->getOperand(1);

    // If it exists, fixup the single node we allow in the tree that needs
    // masking.
    if (FixupNode) {
+      DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                                FixupNode->getValueType(0),
                                SDValue(FixupNode, 0), MaskOp);
@ -3914,14 +3923,21 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {

    // Narrow any constants that need it.
    for (auto *LogicN : NodesWithConsts) {
-      auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
-      SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
-                                SDValue(C, 0), MaskOp);
-      DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
+      SDValue Op0 = LogicN->getOperand(0);
+      SDValue Op1 = LogicN->getOperand(1);
+
+      if (isa<ConstantSDNode>(Op0))
+          std::swap(Op0, Op1);
+
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
+                                Op1, MaskOp);
+
+      DAG.UpdateNodeOperands(LogicN, Op0, And);
    }

    // Create narrow loads.
    for (auto *Load : Loads) {
+      DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
      SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                                SDValue(Load, 0), MaskOp);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
@ -5209,7 +5225,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
      return SDValue();

    // Loads must share the same base address
-    BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+    BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
    int64_t ByteOffsetFromBase = 0;
    if (!Base)
      Base = Ptr;
@ -12928,7 +12944,7 @@ void DAGCombiner::getStoreMergeCandidates(
    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
  EVT MemVT = St->getMemoryVT();

  SDValue Val = peekThroughBitcast(St->getValue());
@ -12949,7 +12965,7 @@ void DAGCombiner::getStoreMergeCandidates(
  EVT LoadVT;
  if (IsLoadSrc) {
    auto *Ld = cast<LoadSDNode>(Val);
-    LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+    LBasePtr = BaseIndexOffset::match(Ld, DAG);
    LoadVT = Ld->getMemoryVT();
    // Load and store should be the same type.
    if (MemVT != LoadVT)
@ -12968,7 +12984,7 @@ void DAGCombiner::getStoreMergeCandidates(
        return false;
      // The Load's Base Ptr must also match
      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
-        auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+        auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
        if (LoadVT != OtherLd->getMemoryVT())
          return false;
        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
@ -12992,7 +13008,7 @@ void DAGCombiner::getStoreMergeCandidates(
          Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
        return false;
    }
-    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+    Ptr = BaseIndexOffset::match(Other, DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };

@ -13365,7 +13381,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
      if (Ld->getMemoryVT() != MemVT)
        break;

-      BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+      BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
      // If this is not the first ptr that we check.
      int64_t LdOffset = 0;
      if (LdBasePtr.getBase().getNode()) {
@ -17432,44 +17448,46 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
  unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();

  // Check for BaseIndexOffset matching.
-  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
-  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
+  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
  int64_t PtrDiff;
-  if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
-    return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
+  if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
+    if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
+      return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));

-  // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
-  // able to calculate their relative offset if at least one arises
-  // from an alloca. However, these allocas cannot overlap and we
-  // can infer there is no alias.
-  if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
-    if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
-      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-      // If the base are the same frame index but the we couldn't find a
-      // constant offset, (indices are different) be conservative.
-      if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
-                     !MFI.isFixedObjectIndex(B->getIndex())))
-        return false;
-    }
+    // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+    // able to calculate their relative offset if at least one arises
+    // from an alloca. However, these allocas cannot overlap and we
+    // can infer there is no alias.
+    if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
+      if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
+        MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+        // If the base are the same frame index but the we couldn't find a
+        // constant offset, (indices are different) be conservative.
+        if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
+                       !MFI.isFixedObjectIndex(B->getIndex())))
+          return false;
+      }

-  bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
-  bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
-  bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
-  bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
-  bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
-  bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+    bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
+    bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
+    bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
+    bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
+    bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
+    bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());

-  // If of mismatched base types or checkable indices we can check
-  // they do not alias.
-  if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
-       (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
-      (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
-    return false;
+    // If of mismatched base types or checkable indices we can check
+    // they do not alias.
+    if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
+         (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
+        (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
+      return false;
+  }

-  // If we know required SrcValue1 and SrcValue2 have relatively large alignment
-  // compared to the size and offset of the access, we may be able to prove they
-  // do not alias. This check is conservative for now to catch cases created by
-  // splitting vector types.
+  // If we know required SrcValue1 and SrcValue2 have relatively large
+  // alignment compared to the size and offset of the access, we may be able
+  // to prove they do not alias. This check is conservative for now to catch
+  // cases created by splitting vector types.
  int64_t SrcValOffset0 = Op0->getSrcValueOffset();
  int64_t SrcValOffset1 = Op1->getSrcValueOffset();
  unsigned OrigAlignment0 = Op0->getOriginalAlignment();
@ -17479,8 +17497,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
    int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
    int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;

-    // There is no overlap between these relatively aligned accesses of similar
-    // size. Return no alias.
+    // There is no overlap between these relatively aligned accesses of
+    // similar size. Return no alias.
    if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
        (OffAlign1 + NumBytes1) <= OffAlign0)
      return false;
@ -17643,7 +17661,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {

  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
@ -17669,7 +17687,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
      break;

    // Find the base pointer and offset for this memory node.
-    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG);

    // Check that the base pointer is the same as the original one.
    if (!BasePtr.equalBaseIndex(Ptr, DAG))
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -2965,12 +2965,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
    case ISD::ZERO_EXTEND:
      LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
                        DAG.getValueType(AtomicType));
-      RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+      RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
      ExtRes = LHS;
      break;
    case ISD::ANY_EXTEND:
      LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
-      RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+      RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
      break;
    default:
      llvm_unreachable("Invalid atomic op extension");
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -7947,11 +7947,8 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
  if (VT.getSizeInBits() / 8 != Bytes)
    return false;

-  SDValue Loc = LD->getOperand(1);
-  SDValue BaseLoc = Base->getOperand(1);
-
-  auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
-  auto LocDecomp = BaseIndexOffset::match(Loc, *this);
+  auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
+  auto LocDecomp = BaseIndexOffset::match(LD, *this);

  int64_t Offset = 0;
  if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@ -21,6 +21,9 @@ using namespace llvm;

 bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
                                     const SelectionDAG &DAG, int64_t &Off) {
+  // Conservatively fail if we a match failed..
+  if (!Base.getNode() || !Other.Base.getNode())
+    return false;
  // Initial Offset difference.
  Off = Other.Offset - Offset;

@ -72,13 +75,29 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 }

 /// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
+                                       const SelectionDAG &DAG) {
+  SDValue Ptr = N->getBasePtr();
+
  // (((B + I*M) + c)) + c ...
  SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
  SDValue Index = SDValue();
  int64_t Offset = 0;
  bool IsIndexSignExt = false;

+  // pre-inc/pre-dec ops are components of EA.
+  if (N->getAddressingMode() == ISD::PRE_INC) {
+    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+      Offset += C->getSExtValue();
+    else // If unknown, give up now.
+      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+    if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+      Offset -= C->getSExtValue();
+    else // If unknown, give up now.
+      return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+  }
+
  // Consume constant adds & ors with appropriate masking.
  while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
    if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@ -132,9 +132,18 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
    setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
    setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");

-    // Darwin 10 and higher has an optimized __bzero.
-    if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) {
-      setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero");
+    // Some darwins have an optimized __bzero/bzero function.
+    switch (TT.getArch()) {
+    case Triple::x86:
+    case Triple::x86_64:
+      if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
+        setLibcallName(RTLIB::BZERO, "__bzero");
+      break;
+    case Triple::aarch64:
+      setLibcallName(RTLIB::BZERO, "bzero");
+      break;
+    default:
+      break;
    }

    if (darwinHasSinCos(TT)) {
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@ -954,7 +954,12 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
    NewGV->setLinkage(GlobalValue::InternalLinkage);

  Constant *C = NewGV;
-  if (DGV)
+  // Only create a bitcast if necessary. In particular, with
+  // DebugTypeODRUniquing we may reach metadata in the destination module
+  // containing a GV from the source module, in which case SGV will be
+  // the same as DGV and NewGV, and TypeMap.get() will assert since it
+  // assumes it is being invoked on a type in the source module.
+  if (DGV && NewGV != SGV)
    C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType()));

  if (DGV && NewGV != DGV) {
--- a/lib/MC/MCCodeView.cpp
+++ b/lib/MC/MCCodeView.cpp
@ -76,6 +76,14 @@ bool CodeViewContext::addFile(MCStreamer &OS, unsigned FileNumber,
  return true;
 }

+MCCVFunctionInfo *CodeViewContext::getCVFunctionInfo(unsigned FuncId) {
+  if (FuncId >= Functions.size())
+    return nullptr;
+  if (Functions[FuncId].isUnallocatedFunctionInfo())
+    return nullptr;
+  return &Functions[FuncId];
+}
+
 bool CodeViewContext::recordFunctionId(unsigned FuncId) {
  if (FuncId >= Functions.size())
    Functions.resize(FuncId + 1);
@ -247,6 +255,67 @@ void CodeViewContext::emitFileChecksumOffset(MCObjectStreamer &OS,
  OS.EmitValueImpl(SRE, 4);
 }

+void CodeViewContext::addLineEntry(const MCCVLineEntry &LineEntry) {
+  size_t Offset = MCCVLines.size();
+  auto I = MCCVLineStartStop.insert(
+      {LineEntry.getFunctionId(), {Offset, Offset + 1}});
+  if (!I.second)
+    I.first->second.second = Offset + 1;
+  MCCVLines.push_back(LineEntry);
+}
+
+std::vector<MCCVLineEntry>
+CodeViewContext::getFunctionLineEntries(unsigned FuncId) {
+  std::vector<MCCVLineEntry> FilteredLines;
+  auto I = MCCVLineStartStop.find(FuncId);
+  if (I != MCCVLineStartStop.end()) {
+    MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(FuncId);
+    for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
+         ++Idx) {
+      unsigned LocationFuncId = MCCVLines[Idx].getFunctionId();
+      if (LocationFuncId == FuncId) {
+        // This was a .cv_loc directly for FuncId, so record it.
+        FilteredLines.push_back(MCCVLines[Idx]);
+      } else {
+        // Check if the current location is inlined in this function. If it is,
+        // synthesize a statement .cv_loc at the original inlined call site.
+        auto I = SiteInfo->InlinedAtMap.find(LocationFuncId);
+        if (I != SiteInfo->InlinedAtMap.end()) {
+          MCCVFunctionInfo::LineInfo &IA = I->second;
+          // Only add the location if it differs from the previous location.
+          // Large inlined calls will have many .cv_loc entries and we only need
+          // one line table entry in the parent function.
+          if (FilteredLines.empty() ||
+              FilteredLines.back().getFileNum() != IA.File ||
+              FilteredLines.back().getLine() != IA.Line ||
+              FilteredLines.back().getColumn() != IA.Col) {
+            FilteredLines.push_back(MCCVLineEntry(
+                MCCVLines[Idx].getLabel(),
+                MCCVLoc(FuncId, IA.File, IA.Line, IA.Col, false, false)));
+          }
+        }
+      }
+    }
+  }
+  return FilteredLines;
+}
+
+std::pair<size_t, size_t> CodeViewContext::getLineExtent(unsigned FuncId) {
+  auto I = MCCVLineStartStop.find(FuncId);
+  // Return an empty extent if there are no cv_locs for this function id.
+  if (I == MCCVLineStartStop.end())
+    return {~0ULL, 0};
+  return I->second;
+}
+
+ArrayRef<MCCVLineEntry> CodeViewContext::getLinesForExtent(size_t L, size_t R) {
+  if (R <= L)
+    return None;
+  if (L >= MCCVLines.size())
+    return None;
+  return makeArrayRef(&MCCVLines[L], R - L);
+}
+
 void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
                                               unsigned FuncId,
                                               const MCSymbol *FuncBegin,
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@ -868,6 +868,40 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    if (OpFlags & AArch64II::MO_GOT) {
      I.setDesc(TII.get(AArch64::LOADgot));
      I.getOperand(1).setTargetFlags(OpFlags);
+    } else if (TM.getCodeModel() == CodeModel::Large) {
+      // Materialize the global using movz/movk instructions.
+      unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+      auto InsertPt = std::next(I.getIterator());
+      auto MovZ =
+          BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi))
+              .addDef(MovZDstReg);
+      MovZ->addOperand(MF, I.getOperand(1));
+      MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
+                                         AArch64II::MO_NC);
+      MovZ->addOperand(MF, MachineOperand::CreateImm(0));
+      constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
+
+      auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags,
+                           unsigned Offset, unsigned ForceDstReg) {
+        unsigned DstReg =
+            ForceDstReg ? ForceDstReg
+                        : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+        auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(),
+                            TII.get(AArch64::MOVKXi))
+                        .addDef(DstReg)
+                        .addReg(SrcReg);
+        MovI->addOperand(MF, MachineOperand::CreateGA(
+                                 GV, MovZ->getOperand(1).getOffset(), Flags));
+        MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
+        constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
+        return DstReg;
+      };
+      unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
+                                  AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
+      DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
+      BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
+      I.eraseFromParent();
+      return true;
    } else {
      I.setDesc(TII.get(AArch64::MOVaddr));
      I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@ -821,7 +821,6 @@ namespace llvm {
                MutableArrayRef<int> NewMask, unsigned Options = None);
    OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
                MutableArrayRef<int> NewMask);
-    OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
    OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
                ResultStack &Results);
    OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@ -1139,25 +1138,6 @@ OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
  return concat(Out[0], Out[1], Results);
 }

-OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
-  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
-
-  int VecLen = SM.Mask.size();
-  SmallVector<uint8_t,128> UsedBytes(VecLen);
-  bool HasUnused = false;
-  for (int I = 0; I != VecLen; ++I) {
-    if (SM.Mask[I] != -1)
-      UsedBytes[I] = 0xFF;
-    else
-      HasUnused = true;
-  }
-  if (!HasUnused)
-    return Va;
-  SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
-  Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
-  return OpRef::res(Results.top());
-}
-
 OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
                         ResultStack &Results) {
  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@ -142,6 +142,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

+  // Sub-word ATOMIC_CMP_SWAP need to ensure that the input is zero-extended.
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@ -1154,6 +1157,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  case PPCISD::Hi:              return "PPCISD::Hi";
  case PPCISD::Lo:              return "PPCISD::Lo";
  case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
+  case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
  case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
  case PPCISD::DYNAREAOFFSET:   return "PPCISD::DYNAREAOFFSET";
  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
@ -8834,6 +8839,42 @@ SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
  return Op;
 }

+// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
+// compared to a value that is atomically loaded (atomic loads zero-extend).
+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
+         "Expecting an atomic compare-and-swap here.");
+  SDLoc dl(Op);
+  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
+  EVT MemVT = AtomicNode->getMemoryVT();
+  if (MemVT.getSizeInBits() >= 32)
+    return Op;
+
+  SDValue CmpOp = Op.getOperand(2);
+  // If this is already correctly zero-extended, leave it alone.
+  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
+  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
+    return Op;
+
+  // Clear the high bits of the compare operand.
+  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
+  SDValue NewCmpOp =
+    DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
+                DAG.getConstant(MaskVal, dl, MVT::i32));
+
+  // Replace the existing compare operand with the properly zero-extended one.
+  SmallVector<SDValue, 4> Ops;
+  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
+    Ops.push_back(AtomicNode->getOperand(i));
+  Ops[2] = NewCmpOp;
+  MachineMemOperand *MMO = AtomicNode->getMemOperand();
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
+  auto NodeTy =
+    (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
+  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
+}
+
 SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc dl(Op);
@ -9325,6 +9366,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    return LowerREM(Op, DAG);
  case ISD::BSWAP:
    return LowerBSWAP(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP:
+    return LowerATOMIC_CMP_SWAP(Op, DAG);
  }
 }

--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@ -430,6 +430,11 @@ namespace llvm {
      /// The 4xf32 load used for v4i1 constants.
      QVLFSb,

+      /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
+      /// except they ensure that the compare input is zero-extended for
+      /// sub-word versions because the atomic loads zero-extend.
+      ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
+
      /// GPRC = TOC_ENTRY GA, TOC
      /// Loads the entry for GA from the TOC, where the TOC base is given by
      /// the last operand.
@ -955,6 +960,7 @@ namespace llvm {
    SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@ -257,6 +257,13 @@ def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
 def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
                           [SDNPHasChain, SDNPOptInGlue]>;

+// PPC-specific atomic operations.
+def PPCatomicCmpSwap_8 :
+  SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3,
+         [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def PPCatomicCmpSwap_16 :
+  SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3,
+         [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
 def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
                           [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
@ -1710,6 +1717,11 @@ let usesCustomInserter = 1 in {
  }
 }

+def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
+        (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
+        (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;
+
 // Instructions to support atomic operations
 let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
 def LBARX : XForm_1<31,  52, (outs gprc:$rD), (ins memrr:$src),
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@ -2375,6 +2375,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
            .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
            .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
    Flags |= Prefix;
+    if (getLexer().is(AsmToken::EndOfStatement)) {
+      // We don't have real instr with the given prefix
+      //  let's use the prefix as the instr.
+      // TODO: there could be several prefixes one after another
+      Flags = X86::IP_NO_PREFIX;
+      break;
+    }
    Name = Parser.getTok().getString();
    Parser.Lex(); // eat the prefix
    // Hack: we could have something like "rep # some comment" or
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -7893,8 +7893,14 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
                                 VT.getVectorNumElements());
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
-  return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
-                     SDLoc(V), VT, IndicesVec, SrcVec);
+  if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
+    SrcVec =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
+                    SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
+  }
+  if (VT == MVT::v16i8)
+    return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
+  return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
 }

 SDValue
@ -18262,6 +18268,18 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

+  // For v64i1 without 64-bit support we need to split and rejoin.
+  if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+    assert(Subtarget.hasBWI() && "Expected BWI to be legal");
+    SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
+    SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
+    SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
+    SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
+    SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
+    SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+  }
+
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
@ -28652,13 +28670,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      }
    }

+    SDValue NewV1 = V1; // Save operand in case early exit happens.
    if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
-                                V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
-                                ShuffleVT) &&
+                                NewV1, DL, DAG, Subtarget, Shuffle,
+                                ShuffleSrcVT, ShuffleVT) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
-      Res = DAG.getBitcast(ShuffleSrcVT, V1);
+      Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
      DCI.AddToWorklist(Res.getNode());
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
      DCI.AddToWorklist(Res.getNode());
@ -28680,33 +28699,36 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    }
  }

+  SDValue NewV1 = V1; // Save operands in case early exit happens.
+  SDValue NewV2 = V2;
  if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
-                               V1, V2, DL, DAG, Subtarget, Shuffle,
+                               NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
                               ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return SDValue(); // Nothing to do!
-    V1 = DAG.getBitcast(ShuffleSrcVT, V1);
-    DCI.AddToWorklist(V1.getNode());
-    V2 = DAG.getBitcast(ShuffleSrcVT, V2);
-    DCI.AddToWorklist(V2.getNode());
-    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
+    NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
+    DCI.AddToWorklist(NewV1.getNode());
+    NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
+    DCI.AddToWorklist(NewV2.getNode());
+    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
    DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }

-  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
-                                      AllowIntDomain, V1, V2, DL, DAG,
-                                      Subtarget, Shuffle, ShuffleVT,
-                                      PermuteImm) &&
+  NewV1 = V1; // Save operands in case early exit happens.
+  NewV2 = V2;
+  if (matchBinaryPermuteVectorShuffle(
+          MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
+          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return SDValue(); // Nothing to do!
-    V1 = DAG.getBitcast(ShuffleVT, V1);
-    DCI.AddToWorklist(V1.getNode());
-    V2 = DAG.getBitcast(ShuffleVT, V2);
-    DCI.AddToWorklist(V2.getNode());
-    Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+    NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
+    DCI.AddToWorklist(NewV1.getNode());
+    NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
+    DCI.AddToWorklist(NewV2.getNode());
+    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
                      DAG.getConstant(PermuteImm, DL, MVT::i8));
    DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@ -754,7 +754,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
  // type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
-    if (LegalVT.getVectorElementType().getSizeInBits() ==
+    if (LegalVT.isVector() &&
+        LegalVT.getVectorElementType().getSizeInBits() ==
            Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {

--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@ -648,7 +648,7 @@ private:
          // track in a CHI. In the PDom walk, there can be values in the
          // stack which are not control dependent e.g., nested loop.
          if (si != RenameStack.end() && si->second.size() &&
-              DT->dominates(Pred, si->second.back()->getParent())) {
+              DT->properlyDominates(Pred, si->second.back()->getParent())) {
            C.Dest = BB;                     // Assign the edge
            C.I = si->second.pop_back_val(); // Assign the argument
            DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@ -14,7 +14,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@ -177,9 +176,8 @@ class StructurizeCFG : public RegionPass {
  Region *ParentRegion;

  DominatorTree *DT;
-  LoopInfo *LI;

-  SmallVector<RegionNode *, 8> Order;
+  std::deque<RegionNode *> Order;
  BBSet Visited;

  BBPhiMap DeletedPhis;
@ -204,7 +202,7 @@ class StructurizeCFG : public RegionPass {

  void gatherPredicates(RegionNode *N);

-  void collectInfos();
+  void analyzeNode(RegionNode *N);

  void insertConditions(bool Loops);

@ -258,7 +256,6 @@ public:
      AU.addRequired<DivergenceAnalysis>();
    AU.addRequiredID(LowerSwitchID);
    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<DominatorTreeWrapperPass>();
    RegionPass::getAnalysisUsage(AU);
@ -292,55 +289,17 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {

 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
-  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
+  assert(Visited.empty());
+  assert(Predicates.empty());
+  assert(Loops.empty());
+  assert(LoopPreds.empty());

-  // The reverse post-order traversal of the list gives us an ordering close
-  // to what we want.  The only problem with it is that sometimes backedges
-  // for outer loops will be visited before backedges for inner loops.
-  for (RegionNode *RN : RPOT) {
-    BasicBlock *BB = RN->getEntry();
-    Loop *Loop = LI->getLoopFor(BB);
-    ++LoopBlocks[Loop];
+  // This must be RPO order for the back edge detection to work
+  for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
+    // FIXME: Is there a better order to use for structurization?
+    Order.push_back(RN);
+    analyzeNode(RN);
  }
-
-  unsigned CurrentLoopDepth = 0;
-  Loop *CurrentLoop = nullptr;
-  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    BasicBlock *BB = (*I)->getEntry();
-    unsigned LoopDepth = LI->getLoopDepth(BB);
-
-    if (is_contained(Order, *I))
-      continue;
-
-    if (LoopDepth < CurrentLoopDepth) {
-      // Make sure we have visited all blocks in this loop before moving back to
-      // the outer loop.
-
-      auto LoopI = I;
-      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
-        LoopI++;
-        BasicBlock *LoopBB = (*LoopI)->getEntry();
-        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
-          --BlockCount;
-          Order.push_back(*LoopI);
-        }
-      }
-    }
-
-    CurrentLoop = LI->getLoopFor(BB);
-    if (CurrentLoop)
-      LoopBlocks[CurrentLoop]--;
-
-    CurrentLoopDepth = LoopDepth;
-    Order.push_back(*I);
-  }
-
-  // This pass originally used a post-order traversal and then operated on
-  // the list in reverse. Now that we are using a reverse post-order traversal
-  // rather than re-working the whole pass to operate on the list in order,
-  // we just reverse the list and continue to operate on it in reverse.
-  std::reverse(Order.begin(), Order.end());
 }

 /// \brief Determine the end of the loops
@ -466,32 +425,19 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 }

 /// \brief Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
-  // Reset predicate
-  Predicates.clear();
+void StructurizeCFG::analyzeNode(RegionNode *RN) {
+  DEBUG(dbgs() << "Visiting: "
+        << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+        << RN->getEntry()->getName() << '\n');

-  // and loop infos
-  Loops.clear();
-  LoopPreds.clear();
+  // Analyze all the conditions leading to a node
+  gatherPredicates(RN);

-  // Reset the visited nodes
-  Visited.clear();
+  // Remember that we've seen this node
+  Visited.insert(RN->getEntry());

-  for (RegionNode *RN : reverse(Order)) {
-    DEBUG(dbgs() << "Visiting: "
-                 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-                 << RN->getEntry()->getName() << " Loop Depth: "
-                 << LI->getLoopDepth(RN->getEntry()) << "\n");
-
-    // Analyze all the conditions leading to a node
-    gatherPredicates(RN);
-
-    // Remember that we've seen this node
-    Visited.insert(RN->getEntry());
-
-    // Find the last back edges
-    analyzeLoops(RN);
-  }
+  // Find the last back edges
+  analyzeLoops(RN);
 }

 /// \brief Insert the missing branch conditions
@ -664,7 +610,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
 BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
  LLVMContext &Context = Func->getContext();
  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
-                       Order.back()->getEntry();
+                       Order.front()->getEntry();
  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
                                        Func, Insert);
  DT->addNewBlock(Flow, Dominator);
@ -744,7 +690,8 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
 /// Take one node from the order vector and wire it up
 void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                              BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.pop_back_val();
+  RegionNode *Node = Order.front();
+  Order.pop_front();
  Visited.insert(Node->getEntry());

  if (isPredictableTrue(Node)) {
@ -768,7 +715,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

    PrevNode = Node;
    while (!Order.empty() && !Visited.count(LoopEnd) &&
-           dominatesPredicates(Entry, Order.back())) {
+           dominatesPredicates(Entry, Order.front())) {
      handleLoops(false, LoopEnd);
    }

@ -779,7 +726,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

 void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                 BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.back();
+  RegionNode *Node = Order.front();
  BasicBlock *LoopStart = Node->getEntry();

  if (!Loops.count(LoopStart)) {
@ -924,10 +871,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
  ParentRegion = R;

  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  orderNodes();
-  collectInfos();
+
  createFlow();
  insertConditions(false);
  insertConditions(true);
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -2630,9 +2630,12 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
-    recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
+    else
+      recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
  }
@ -2754,15 +2757,17 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
+  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
-      recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
+      else
+        recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
    }
  }

--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -1347,7 +1347,6 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, Lane);
-        continue;
      }
      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@ -4417,13 +4416,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  return tryToVectorizeList(VL, R, true);
 }

 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           ArrayRef<Value *> BuildVector,
-                                           bool AllowReorder,
-                                           bool NeedExtraction) {
+                                           bool AllowReorder) {
  if (VL.size() < 2)
    return false;

@ -4517,12 +4514,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                   << "\n");
      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

-      ArrayRef<Value *> EmptyArray;
-      ArrayRef<Value *> BuildVectorSlice;
-      if (!BuildVector.empty())
-        BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
-      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+      R.buildTree(Ops);
      // TODO: check if we can allow reordering for more cases.
      if (AllowReorder && R.shouldReorder()) {
        // Conceptually, there is nothing actually preventing us from trying to
@ -4530,7 +4522,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
        // reductions. However, at this point, we only expect to get here when
        // there are exactly two operations.
        assert(Ops.size() == 2);
-        assert(BuildVectorSlice.empty());
        Value *ReorderedOps[] = {Ops[1], Ops[0]};
        R.buildTree(ReorderedOps, None);
      }
@ -4550,31 +4541,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                 << " and with tree size "
                                 << ore::NV("TreeSize", R.getTreeSize()));

-        Value *VectorizedRoot = R.vectorizeTree();
-
-        // Reconstruct the build vector by extracting the vectorized root. This
-        // way we handle the case where some elements of the vector are
-        // undefined.
-        //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
-        if (!BuildVectorSlice.empty()) {
-          // The insert point is the last build vector instruction. The
-          // vectorized root will precede it. This guarantees that we get an
-          // instruction. The vectorized tree could have been constant folded.
-          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
-          unsigned VecIdx = 0;
-          for (auto &V : BuildVectorSlice) {
-            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                        ++BasicBlock::iterator(InsertAfter));
-            Instruction *I = cast<Instruction>(V);
-            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
-            Instruction *Extract =
-                cast<Instruction>(Builder.CreateExtractElement(
-                    VectorizedRoot, Builder.getInt32(VecIdx++)));
-            I->setOperand(1, Extract);
-            I->moveAfter(Extract);
-            InsertAfter = I;
-          }
-        }
+        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
@ -5495,11 +5462,9 @@ private:
 ///
 /// Returns true if it matches
 static bool findBuildVector(InsertElementInst *LastInsertElem,
-                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  Value *V = nullptr;
  do {
-    BuildVector.push_back(LastInsertElem);
    BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
    V = LastInsertElem->getOperand(0);
    if (isa<UndefValue>(V))
@ -5508,7 +5473,6 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
    if (!LastInsertElem || !LastInsertElem->hasOneUse())
      return false;
  } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  return true;
 }
@ -5517,11 +5481,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
 ///
 /// \return true if it matches.
 static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVector,
                               SmallVectorImpl<Value *> &BuildVectorOpds) {
  Value *V;
  do {
-    BuildVector.push_back(IV);
    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
    V = IV->getAggregateOperand();
    if (isa<UndefValue>(V))
@ -5530,7 +5492,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
    if (!IV || !IV->hasOneUse())
      return false;
  } while (true);
-  std::reverse(BuildVector.begin(), BuildVector.end());
  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  return true;
 }
@ -5706,27 +5667,25 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
  if (!R.canMapToVector(IVI->getType(), DL))
    return false;

-  SmallVector<Value *, 16> BuildVector;
  SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+  if (!findBuildAggregate(IVI, BuildVectorOpds))
    return false;

  DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register, we need to
  // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+  return tryToVectorizeList(BuildVectorOpds, R);
 }

 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R) {
-  SmallVector<Value *, 16> BuildVector;
  SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+  if (!findBuildVector(IEI, BuildVectorOpds))
    return false;

  // Vectorize starting with the build vector operands ignoring the BuildVector
  // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+  return tryToVectorizeList(BuildVectorOpds, R);
 }

 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@ -5804,8 +5763,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // is done when there are exactly two elements since tryToVectorizeList
      // asserts that there are only two values when AllowReorder is true.
      bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            None, AllowReorder)) {
+      if (NumElts > 1 &&
+          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
        // Success start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
--- a/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir
@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+  @foo1 = common global [1073741824 x i32] zeroinitializer, align 4
+  @foo2 = common global [1073741824 x i32] zeroinitializer, align 4
+
+  define i32 @gv_large() {
+  entry:
+    %retval = alloca i32, align 4
+    store i32 0, i32* %retval, align 4
+    %0 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0), align 4
+    %1 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0), align 4
+    %add = add nsw i32 %0, %1
+    ret i32 %add
+  }
+
+...
+---
+name:            gv_large
+legalized:       true
+regBankSelected: true
+stack:
+  - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: gv_large
+    ; CHECK: [[MOVZXi:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo1, 0
+    ; CHECK: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi]], target-flags(aarch64-g1, aarch64-nc) @foo1, 16
+    ; CHECK: [[MOVKXi1:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi]], target-flags(aarch64-g2, aarch64-nc) @foo1, 32
+    ; CHECK: [[MOVKXi2:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi1]], target-flags(aarch64-g3) @foo1, 48
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY [[MOVKXi2]]
+    ; CHECK: [[MOVZXi1:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo2, 0
+    ; CHECK: [[MOVKXi3:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi1]], target-flags(aarch64-g1, aarch64-nc) @foo2, 16
+    ; CHECK: [[MOVKXi4:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi3]], target-flags(aarch64-g2, aarch64-nc) @foo2, 32
+    ; CHECK: [[MOVKXi5:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi4]], target-flags(aarch64-g3) @foo2, 48
+    ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[MOVKXi5]]
+    ; CHECK: STRWui %wzr, %stack.0.retval, 0 :: (store 4 into %ir.retval)
+    ; CHECK: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
+    ; CHECK: [[LDRWui1:%[0-9]+]]:gpr32 = LDRWui [[COPY1]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
+    ; CHECK: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRWui]], [[LDRWui1]]
+    ; CHECK: %w0 = COPY [[ADDWrr]]
+    ; CHECK: RET_ReallyLR implicit %w0
+    %1:gpr(s32) = G_CONSTANT i32 0
+    %4:gpr(p0) = G_GLOBAL_VALUE @foo1
+    %3:gpr(p0) = COPY %4(p0)
+    %7:gpr(p0) = G_GLOBAL_VALUE @foo2
+    %6:gpr(p0) = COPY %7(p0)
+    %0:gpr(p0) = G_FRAME_INDEX %stack.0.retval
+    G_STORE %1(s32), %0(p0) :: (store 4 into %ir.retval)
+    %2:gpr(s32) = G_LOAD %3(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
+    %5:gpr(s32) = G_LOAD %6(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
+    %8:gpr(s32) = G_ADD %2, %5
+    %w0 = COPY %8(s32)
+    RET_ReallyLR implicit %w0
+
+...
--- a/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/test/CodeGen/AArch64/atomic-ops-lse.ll
@ -629,14 +629,29 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {

 ; CHECK-NOT: dmb
 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
-
-; CHECK: casab w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
-; CHECK-NOT: dmb
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
+; CHECK-NEXT: casab w0, w1, [x[[ADDR]]]
+; CHECK-NEXT: ret

   ret i8 %old
 }

+define i1 @test_atomic_cmpxchg_i8_1(i8 %wanted, i8 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i8_1:
+   %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+   %success = extractvalue { i8, i1 } %pair, 1
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
+
+; CHECK: casab w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
+; CHECK-NEXT: cmp w[[NEW]], w0, uxtb
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+   ret i1 %success
+}
+
 define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; CHECK-LABEL: test_atomic_cmpxchg_i16:
   %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
@ -644,14 +659,30 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {

 ; CHECK-NOT: dmb
 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
-
-; CHECK: casah w0, w1, [x[[ADDR]]]
-; CHECK-NOT: dmb
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
+; CHECK-NEXT: casah w0, w1, [x[[ADDR]]]
+; CHECK-NEXT: ret

   ret i16 %old
 }

+define i1 @test_atomic_cmpxchg_i16_1(i16 %wanted, i16 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i16_1:
+   %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
+   %success = extractvalue { i16, i1 } %pair, 1
+
+; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
+
+; CHECK: casah w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
+; CHECK-NEXT: cmp w[[NEW]], w0, uxth
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+
+   ret i1 %success
+}
+
 define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
 ; CHECK-LABEL: test_atomic_cmpxchg_i32:
   %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new acquire acquire
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@ -66,9 +66,10 @@ ENDIF:                                            ; preds = %LOOP

 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
 ; OPT: llvm.amdgcn.break
+; OPT: llvm.amdgcn.break
+; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.loop
-; OPT: llvm.amdgcn.if.break
-; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.end.cf

 ; GCN-LABEL: {{^}}multi_if_break_loop:
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@ -124,55 +124,100 @@ bb23:                                             ; preds = %bb10
 ; Earlier version of above, before a run of the structurizer.
 ; IR-LABEL: @nested_loop_conditions(

-; IR: Flow7:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
-; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
-; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
-; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
+; IR:   br i1 %tmp1235, label %bb14.lr.ph, label %Flow

-; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
-; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
-; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
-; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
-; IR-NEXT: br i1 %18, label %Flow7, label %bb14
-
-; IR: Flow2:
-; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
-; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
-; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
-; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
-; IR-NEXT: br i1 %25, label %bb21, label %Flow3
-
-; IR: bb21:
-; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = xor i1 %tmp12, true
-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
-; IR-NEXT: br label %Flow3
+; IR: bb14.lr.ph:
+; IR: br label %bb14

 ; IR: Flow3:
-; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
-; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+; IR:   call void @llvm.amdgcn.end.cf(i64 %18)
+; IR:   %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
+; IR:   %1 = extractvalue { i1, i64 } %0, 0
+; IR:   %2 = extractvalue { i1, i64 } %0, 1
+; IR:   br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
+
+; IR: bb4.bb13_crit_edge:
+; IR:   br label %Flow4
+
+; IR: Flow4:
+; IR:   %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
+; IR:   call void @llvm.amdgcn.end.cf(i64 %2)
+; IR:   br label %Flow
+
+; IR: bb13:
+; IR:   br label %bb31
+
+; IR: Flow:
+; IR:   %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
+; IR:   %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
+; IR:   %6 = extractvalue { i1, i64 } %5, 0
+; IR:   %7 = extractvalue { i1, i64 } %5, 1
+; IR:   br i1 %6, label %bb13, label %bb31
+
+; IR: bb14:
+; IR:   %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
+; IR:   %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
+; IR:   %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
+; IR:   %tmp15 = icmp eq i32 %tmp1037, 1
+; IR:   %8 = xor i1 %tmp15, true
+; IR:   %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR:   %10 = extractvalue { i1, i64 } %9, 0
+; IR:   %11 = extractvalue { i1, i64 } %9, 1
+; IR:   br i1 %10, label %bb31.loopexit, label %Flow1
+
+; IR: Flow1:
+; IR:   %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
+; IR:   %13 = extractvalue { i1, i64 } %12, 0
+; IR:   %14 = extractvalue { i1, i64 } %12, 1
+; IR:   br i1 %13, label %bb16, label %Flow2
+
+; IR: bb16:
+; IR:   %tmp17 = bitcast i64 %tmp3 to <2 x i32>
+; IR:   br label %bb18
+
+; IR: Flow2:
+; IR:   %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
+; IR:   %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
+; IR:   %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
+; IR:   %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
+; IR:   %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
+; IR:   call void @llvm.amdgcn.end.cf(i64 %14)
+; IR:   %19 = call i1 @llvm.amdgcn.loop(i64 %18)
+; IR:   br i1 %19, label %Flow3, label %bb14
+
+; IR: bb18:
+; IR:   %tmp19 = load volatile i32, i32 addrspace(1)* undef
+; IR:   %tmp20 = icmp slt i32 %tmp19, 9
+; IR:   br i1 %tmp20, label %bb21, label %bb18
+
+; IR: bb21:
+; IR:   %tmp22 = extractelement <2 x i32> %tmp17, i64 1
+; IR:   %tmp23 = lshr i32 %tmp22, 16
+; IR:   %tmp24 = select i1 undef, i32 undef, i32 %tmp23
+; IR:   %tmp25 = uitofp i32 %tmp24 to float
+; IR:   %tmp26 = fmul float %tmp25, 0x3EF0001000000000
+; IR:   %tmp27 = fsub float %tmp26, undef
+; IR:   %tmp28 = fcmp olt float %tmp27, 5.000000e-01
+; IR:   %tmp29 = select i1 %tmp28, i64 1, i64 2
+; IR:   %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
+; IR:   %tmp7 = zext i32 %tmp30 to i64
+; IR:   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
+; IR:   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
+; IR:   %tmp10 = extractelement <4 x i32> %tmp9, i64 0
+; IR:   %tmp11 = load volatile i32, i32 addrspace(1)* undef
+; IR:   %tmp12 = icmp slt i32 %tmp11, 9
+; IR:   %20 = xor i1 %tmp12, true
+; IR:   %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
+; IR:   br label %Flow2
+
+; IR: bb31.loopexit:
+; IR:   br label %Flow1

 ; IR: bb31:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
-; IR-NEXT: ret void
+; IR:   call void @llvm.amdgcn.end.cf(i64 %7)
+; IR:   store volatile i32 0, i32 addrspace(1)* undef
+; IR:   ret void


 ; GCN-LABEL: {{^}}nested_loop_conditions:
--- a/test/CodeGen/ARM/and-load-combine.ll
+++ b/test/CodeGen/ARM/and-load-combine.ll
@ -852,8 +852,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARM:       @ %bb.0: @ %entry
 ; ARM-NEXT:    ldrb r0, [r0]
 ; ARM-NEXT:    uxtb r2, r2
-; ARM-NEXT:    and r0, r0, r1
-; ARM-NEXT:    uxtb r1, r0
+; ARM-NEXT:    and r1, r0, r1
 ; ARM-NEXT:    mov r0, #0
 ; ARM-NEXT:    cmp r1, r2
 ; ARM-NEXT:    movweq r0, #1
@ -863,8 +862,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARMEB:       @ %bb.0: @ %entry
 ; ARMEB-NEXT:    ldrb r0, [r0]
 ; ARMEB-NEXT:    uxtb r2, r2
-; ARMEB-NEXT:    and r0, r0, r1
-; ARMEB-NEXT:    uxtb r1, r0
+; ARMEB-NEXT:    and r1, r0, r1
 ; ARMEB-NEXT:    mov r0, #0
 ; ARMEB-NEXT:    cmp r1, r2
 ; ARMEB-NEXT:    movweq r0, #1
@ -872,9 +870,8 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ;
 ; THUMB1-LABEL: test6:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    ldrb r0, [r0]
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    uxtb r3, r0
+; THUMB1-NEXT:    ldrb r3, [r0]
+; THUMB1-NEXT:    ands r3, r1
 ; THUMB1-NEXT:    uxtb r2, r2
 ; THUMB1-NEXT:    movs r0, #1
 ; THUMB1-NEXT:    movs r1, #0
@ -889,8 +886,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    ldrb r0, [r0]
 ; THUMB2-NEXT:    uxtb r2, r2
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    uxtb r1, r0
+; THUMB2-NEXT:    ands r1, r0
 ; THUMB2-NEXT:    movs r0, #0
 ; THUMB2-NEXT:    cmp r1, r2
 ; THUMB2-NEXT:    it eq
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@ -49,9 +49,10 @@ entry:
 ; CHECK-THUMBV6:       mov [[EXPECTED:r[0-9]+]], r1
 ; CHECK-THUMBV6-NEXT:  bl __sync_val_compare_and_swap_1
 ; CHECK-THUMBV6-NEXT:  mov [[RES:r[0-9]+]], r0
+; CHECK-THUMBV6-NEXT:  uxtb [[EXPECTED_ZEXT:r[0-9]+]], [[EXPECTED]]
 ; CHECK-THUMBV6-NEXT:  movs r0, #1
 ; CHECK-THUMBV6-NEXT:  movs [[ZERO:r[0-9]+]], #0
-; CHECK-THUMBV6-NEXT:  cmp [[RES]], [[EXPECTED]]
+; CHECK-THUMBV6-NEXT:  cmp [[RES]], [[EXPECTED_ZEXT]]
 ; CHECK-THUMBV6-NEXT:  beq [[END:.LBB[0-9_]+]]
 ; CHECK-THUMBV6-NEXT:  mov r0, [[ZERO]]
 ; CHECK-THUMBV6-NEXT: [[END]]:
--- a/test/CodeGen/ARM/cmpxchg-O0.ll
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@ -17,7 +17,8 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
 ; CHECK:     cmp{{(\.w)?}} [[STATUS]], #0
 ; CHECK:     bne [[RETRY]]
 ; CHECK: [[DONE]]:
-; CHECK:     cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK:     uxtb [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
+; CHECK:     cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
 ; CHECK:     {{moveq|movweq}} {{r[0-9]+}}, #1
 ; CHECK:     dmb ish
  %res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
@ -36,7 +37,8 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind
 ; CHECK:     cmp{{(\.w)?}} [[STATUS]], #0
 ; CHECK:     bne [[RETRY]]
 ; CHECK: [[DONE]]:
-; CHECK:     cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK:     uxth [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
+; CHECK:     cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
 ; CHECK:     {{moveq|movweq}} {{r[0-9]+}}, #1
 ; CHECK:     dmb ish
  %res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
--- a/test/CodeGen/ARM/global-merge-dllexport.ll
+++ b/test/CodeGen/ARM/global-merge-dllexport.ll
@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge | FileCheck %s
+
+@x = global i32 0, align 4
+@y = dllexport global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK: f1:
+; CHECK: movw [[REG1:r[0-9]+]], :lower16:x
+; CHECK: movt [[REG1]], :upper16:x
+  store i32 %a1, i32* @x, align 4
+  store i32 %a2, i32* @y, align 4
+  ret void
+}
+
+; CHECK-NOT: .L_MergedGlobals
--- a/test/CodeGen/ARM/global-merge-external.ll
+++ b/test/CodeGen/ARM/global-merge-external.ll
@ -1,8 +1,9 @@
-; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge                                 | FileCheck %s --check-prefix=CHECK-MERGE
-; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -global-merge-on-external=true  | FileCheck %s --check-prefix=CHECK-MERGE
-; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefix=CHECK-NO-MERGE
-; RUN: llc < %s -mtriple=arm-macho -arm-global-merge                                 | FileCheck %s --check-prefix=CHECK-NO-MERGE
-; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -relocation-model=pic           | FileCheck %s --check-prefix=CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge                                 | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
+; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -global-merge-on-external=true  | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
+; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-macho -arm-global-merge                                 | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-eabi  -arm-global-merge -relocation-model=pic           | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge                             | FileCheck %s --check-prefixes=CHECK-WIN32

@x = global i32 0, align 4
@y = global i32 0, align 4
@ -10,10 +11,13 @@

 define void @f1(i32 %a1, i32 %a2) {
 ;CHECK:          f1:
-;CHECK:          ldr {{r[0-9]+}}, [[LABEL1:\.LCPI[0-9]+_[0-9]]]
+;CHECK:          ldr {{r[0-9]+}}, [[LABEL1:\.?LCPI[0-9]+_[0-9]]]
 ;CHECK:          [[LABEL1]]:
 ;CHECK-MERGE:    .long .L_MergedGlobals
 ;CHECK-NO-MERGE: .long {{_?x}}
+;CHECK-WIN32:    f1:
+;CHECK-WIN32:    movw [[REG1:r[0-9]+]], :lower16:.L_MergedGlobals
+;CHECK-WIN32:    movt [[REG1]], :upper16:.L_MergedGlobals
  store i32 %a1, i32* @x, align 4
  store i32 %a2, i32* @y, align 4
  ret void
@ -21,10 +25,13 @@ define void @f1(i32 %a1, i32 %a2) {

 define void @g1(i32 %a1, i32 %a2) {
 ;CHECK:          g1:
-;CHECK:          ldr {{r[0-9]+}}, [[LABEL2:\.LCPI[0-9]+_[0-9]]]
+;CHECK:          ldr {{r[0-9]+}}, [[LABEL2:\.?LCPI[0-9]+_[0-9]]]
 ;CHECK:          [[LABEL2]]:
 ;CHECK-MERGE:    .long .L_MergedGlobals
 ;CHECK-NO-MERGE: .long {{_?y}}
+;CHECK-WIN32:    g1:
+;CHECK-WIN32:    movw [[REG2:r[0-9]+]], :lower16:.L_MergedGlobals
+;CHECK-WIN32:    movt [[REG2]], :upper16:.L_MergedGlobals
  store i32 %a1, i32* @y, align 4
  store i32 %a2, i32* @z, align 4
  ret void
@ -35,6 +42,7 @@ define void @g1(i32 %a1, i32 %a2) {
 ;CHECK-MERGE:	.type	.L_MergedGlobals,%object
 ;CHECK-MERGE:	.local	.L_MergedGlobals
 ;CHECK-MERGE:	.comm	.L_MergedGlobals,12,4
+;CHECK-WIN32:	.lcomm	.L_MergedGlobals,12,4

 ;CHECK-MERGE:	.globl	x
 ;CHECK-MERGE: x = .L_MergedGlobals
@ -45,3 +53,10 @@ define void @g1(i32 %a1, i32 %a2) {
 ;CHECK-MERGE:	.globl	z
 ;CHECK-MERGE: z = .L_MergedGlobals+8
 ;CHECK-MERGE: .size z, 4
+
+;CHECK-WIN32:	.globl	x
+;CHECK-WIN32: x = .L_MergedGlobals
+;CHECK-WIN32:	.globl	y
+;CHECK-WIN32: y = .L_MergedGlobals+4
+;CHECK-WIN32:	.globl	z
+;CHECK-WIN32: z = .L_MergedGlobals+8
--- a/test/CodeGen/ARM/peephole-phi.mir
+++ b/test/CodeGen/ARM/peephole-phi.mir
@ -0,0 +1,67 @@
+# RUN: llc -o - %s -mtriple=armv7-- -verify-machineinstrs -run-pass=peephole-opt | FileCheck %s
+#
+# Make sure we do not crash on this input.
+# Note that this input could in principle be optimized, but right now we don't
+# have this case implemented so the output should simply be unchanged.
+#
+# CHECK-LABEL: name: func
+# CHECK: body: |
+# CHECK:   bb.0:
+# CHECK:     Bcc %bb.2, 1, undef %cpsr
+#
+# CHECK:   bb.1:
+# CHECK:     %0:dpr = IMPLICIT_DEF
+# CHECK:     %1:gpr, %2:gpr = VMOVRRD %0, 14, %noreg
+# CHECK:     B %bb.3
+#
+# CHECK:   bb.2:
+# CHECK:     %3:spr = IMPLICIT_DEF
+# CHECK:     %4:gpr = VMOVRS %3, 14, %noreg
+#
+# CHECK:   bb.3:
+# CHECK:     %5:gpr = PHI %1, %bb.1, %4, %bb.2
+# CHECK:     %6:spr = VMOVSR %5, 14, %noreg
+---
+name: func0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    Bcc %bb.2, 1, undef %cpsr
+
+  bb.1:
+    %0:dpr = IMPLICIT_DEF
+    %1:gpr, %2:gpr = VMOVRRD %0:dpr, 14, %noreg
+    B %bb.3
+
+  bb.2:
+    %3:spr = IMPLICIT_DEF
+    %4:gpr = VMOVRS %3:spr, 14, %noreg
+
+  bb.3:
+    %5:gpr = PHI %1, %bb.1, %4, %bb.2
+    %6:spr = VMOVSR %5, 14, %noreg
+...
+
+# CHECK-LABEL: name: func1
+# CHECK:    %6:spr = PHI %0, %bb.1, %2, %bb.2
+# CHEKC:    %7:spr = COPY %6
+---
+name: func1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    Bcc %bb.2, 1, undef %cpsr
+
+  bb.1:
+    %1:spr = IMPLICIT_DEF
+    %0:gpr = VMOVRS %1, 14, %noreg
+    B %bb.3
+
+  bb.2:
+    %3:spr = IMPLICIT_DEF
+    %2:gpr = VMOVRS %3:spr, 14, %noreg
+
+  bb.3:
+    %4:gpr = PHI %0, %bb.1, %2, %bb.2
+    %5:spr = VMOVSR %4, 14, %noreg
+...
--- a/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
+++ b/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Make sure that a negative value for the compare-and-swap is zero extended
+; from i8/i16 to i32 since it will be compared for equality.
+; RUN: llc -mtriple=powerpc64le-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-P7
+
+@str = private unnamed_addr constant [46 x i8] c"FAILED: __atomic_compare_exchange_n() failed.\00"
+@str.1 = private unnamed_addr constant [59 x i8] c"FAILED: __atomic_compare_exchange_n() set the wrong value.\00"
+@str.2 = private unnamed_addr constant [7 x i8] c"PASSED\00"
+
+define signext i32 @main() {
+; CHECK-LABEL: main:
+; CHECK:    li 3, -32477
+; CHECK:    lis 12, 0
+; CHECK:    li 6, 234
+; CHECK:    sth 3, 46(1)
+; CHECK:    ori 4, 12, 33059
+; CHECK:    sync
+; CHECK:  .LBB0_1: # %L.entry
+; CHECK:    lharx 3, 0, 5
+; CHECK:    cmpw 4, 3
+; CHECK:    bne 0, .LBB0_3
+; CHECK:    sthcx. 6, 0, 5
+; CHECK:    bne 0, .LBB0_1
+; CHECK:    b .LBB0_4
+; CHECK:  .LBB0_3: # %L.entry
+; CHECK:    sthcx. 3, 0, 5
+; CHECK:  .LBB0_4: # %L.entry
+; CHECK:    cmplwi 3, 33059
+; CHECK:    lwsync
+; CHECK:    lhz 3, 46(1)
+; CHECK:    cmplwi 3, 234
+;
+; CHECK-P7-LABEL: main:
+; CHECK-P7:    lis 4, 0
+; CHECK-P7:    li 7, 0
+; CHECK-P7:    li 3, -32477
+; CHECK-P7:    sth 3, 46(1)
+; CHECK-P7:    li 5, 234
+; CHECK-P7:    ori 4, 4, 33059
+; CHECK-P7:    rlwinm 3, 6, 3, 27, 27
+; CHECK-P7:    ori 7, 7, 65535
+; CHECK-P7:    sync
+; CHECK-P7:    slw 8, 5, 3
+; CHECK-P7:    slw 5, 7, 3
+; CHECK-P7:    slw 9, 4, 3
+; CHECK-P7:    and 7, 8, 5
+; CHECK-P7:    rldicr 4, 6, 0, 61
+; CHECK-P7:    and 8, 9, 5
+; CHECK-P7:  .LBB0_1: # %L.entry
+; CHECK-P7:    lwarx 9, 0, 4
+; CHECK-P7:    and 6, 9, 5
+; CHECK-P7:    cmpw 0, 6, 8
+; CHECK-P7:    bne 0, .LBB0_3
+; CHECK-P7:    andc 9, 9, 5
+; CHECK-P7:    or 9, 9, 7
+; CHECK-P7:    stwcx. 9, 0, 4
+; CHECK-P7:    bne 0, .LBB0_1
+; CHECK-P7:    b .LBB0_4
+; CHECK-P7:  .LBB0_3: # %L.entry
+; CHECK-P7:    stwcx. 9, 0, 4
+; CHECK-P7:  .LBB0_4: # %L.entry
+; CHECK-P7:    srw 3, 6, 3
+; CHECK-P7:    lwsync
+; CHECK-P7:    cmplwi 3, 33059
+; CHECK-P7:    lhz 3, 46(1)
+; CHECK-P7:    cmplwi 3, 234
+L.entry:
+  %value.addr = alloca i16, align 2
+  store i16 -32477, i16* %value.addr, align 2
+  %0 = cmpxchg i16* %value.addr, i16 -32477, i16 234 seq_cst seq_cst
+  %1 = extractvalue { i16, i1 } %0, 1
+  br i1 %1, label %L.B0000, label %L.B0003
+
+L.B0003:                                          ; preds = %L.entry
+  %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @str, i64 0, i64 0))
+  ret i32 1
+
+L.B0000:                                          ; preds = %L.entry
+  %2 = load i16, i16* %value.addr, align 2
+  %3 = icmp eq i16 %2, 234
+  br i1 %3, label %L.B0001, label %L.B0005
+
+L.B0005:                                          ; preds = %L.B0000
+  %puts1 = call i32 @puts(i8* getelementptr inbounds ([59 x i8], [59 x i8]* @str.1, i64 0, i64 0))
+  ret i32 1
+
+L.B0001:                                          ; preds = %L.B0000
+  %puts2 = call i32 @puts(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @str.2, i64 0, i64 0))
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #0
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@ -404,6 +404,7 @@ define void @test39() {
 define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test40:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    b .LBB40_2
 ; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB40_1:
@ -423,6 +424,7 @@ define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test41:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:  .LBB41_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -444,6 +446,7 @@ define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test42:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:  .LBB42_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -465,6 +468,7 @@ define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test43:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB43_2
 ; PPC64LE-NEXT:    .p2align 5
@ -485,6 +489,7 @@ define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test44:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB44_2
 ; PPC64LE-NEXT:    .p2align 5
@ -505,6 +510,7 @@ define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test45:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB45_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -527,6 +533,7 @@ define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test46:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB46_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -549,6 +556,7 @@ define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test47:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB47_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -571,6 +579,7 @@ define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test48:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB48_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -593,6 +602,7 @@ define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test49:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB49_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -615,6 +625,7 @@ define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test50:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    b .LBB50_2
 ; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB50_1:
@ -634,6 +645,7 @@ define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test51:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:  .LBB51_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -655,6 +667,7 @@ define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test52:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:  .LBB52_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -676,6 +689,7 @@ define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test53:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB53_2
 ; PPC64LE-NEXT:    .p2align 5
@ -696,6 +710,7 @@ define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test54:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB54_2
 ; PPC64LE-NEXT:    .p2align 5
@ -716,6 +731,7 @@ define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test55:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB55_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -738,6 +754,7 @@ define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test56:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB56_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -760,6 +777,7 @@ define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test57:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB57_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -782,6 +800,7 @@ define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test58:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB58_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -804,6 +823,7 @@ define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test59(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test59:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB59_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -1248,6 +1268,7 @@ define void @test79(i64* %ptr, i64 %cmp, i64 %val) {
 define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test80:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    b .LBB80_2
 ; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB80_1:
@ -1267,6 +1288,7 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test81:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:  .LBB81_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -1288,6 +1310,7 @@ define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test82:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:  .LBB82_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -1309,6 +1332,7 @@ define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test83:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB83_2
 ; PPC64LE-NEXT:    .p2align 5
@ -1329,6 +1353,7 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test84:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB84_2
 ; PPC64LE-NEXT:    .p2align 5
@ -1349,6 +1374,7 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test85:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB85_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -1371,6 +1397,7 @@ define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test86:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB86_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -1393,6 +1420,7 @@ define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test87:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB87_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -1415,6 +1443,7 @@ define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test88:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB88_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -1437,6 +1466,7 @@ define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test89:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB89_1:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
@ -1459,6 +1489,7 @@ define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
 define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test90:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    b .LBB90_2
 ; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB90_1:
@ -1478,6 +1509,7 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test91:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:  .LBB91_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -1499,6 +1531,7 @@ define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test92:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:  .LBB92_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
@ -1520,6 +1553,7 @@ define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test93:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB93_2
 ; PPC64LE-NEXT:    .p2align 5
@ -1540,6 +1574,7 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test94:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    b .LBB94_2
 ; PPC64LE-NEXT:    .p2align 5
@ -1560,6 +1595,7 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test95:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB95_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -1582,6 +1618,7 @@ define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test96:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:  .LBB96_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -1604,6 +1641,7 @@ define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test97:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB97_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -1626,6 +1664,7 @@ define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test98:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB98_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
@ -1648,6 +1687,7 @@ define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test99:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:  .LBB99_1:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@ -4780,3 +4780,42 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x doub
  ret <2 x double> %res
 }

+; PR35977
+define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
+; CHECK-LABEL: test_zext_v8i8_to_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
+  %tmp2 = load <8 x i8>, <8 x i8>* %tmp
+  %tmp3 = extractelement <8 x i8> %tmp2, i32 0
+  %tmp4 = zext i8 %tmp3 to i16
+  %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
+  %tmp6 = extractelement <8 x i8> %tmp2, i32 1
+  %tmp7 = zext i8 %tmp6 to i16
+  %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
+  %tmp9 = extractelement <8 x i8> %tmp2, i32 2
+  %tmp10 = zext i8 %tmp9 to i16
+  %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
+  %tmp12 = extractelement <8 x i8> %tmp2, i32 3
+  %tmp13 = zext i8 %tmp12 to i16
+  %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
+  %tmp15 = extractelement <8 x i8> %tmp2, i32 4
+  %tmp16 = zext i8 %tmp15 to i16
+  %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
+  %tmp18 = extractelement <8 x i8> %tmp2, i32 5
+  %tmp19 = zext i8 %tmp18 to i16
+  %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
+  %tmp21 = extractelement <8 x i8> %tmp2, i32 6
+  %tmp22 = zext i8 %tmp21 to i16
+  %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
+  %tmp24 = extractelement <8 x i8> %tmp2, i32 7
+  %tmp25 = zext i8 %tmp24 to i16
+  %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
+  %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
+  store <8 x i16> %tmp27, <8 x i16>* %tmp28
+  ret void
+}
--- a/test/CodeGen/X86/darwin-bzero.ll
+++ b/test/CodeGen/X86/darwin-bzero.ll
@ -1,10 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck -check-prefixes=CHECK,NOBZERO %s
+; RUN: llc < %s -mtriple=x86_64-apple-ios10.0-simulator | FileCheck -check-prefixes=CHECK,NOBZERO %s

 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind

 ; CHECK-LABEL: foo:
-; CHECK: {{calll|callq}} ___bzero
+; BZERO: {{calll|callq}} ___bzero
+; NOBZERO-NOT: bzero
 define void @foo(i8* %p, i32 %len) {
  call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
  ret void
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@ -19,7 +19,8 @@ entry:
  %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
  ret { i64, i64 } %.fca.1.insert
 }
-; CHECK: lock cmpxchg16b
+; CHECK: lock
+; CHECK-NEXT: cmpxchg16b

 attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
--- a/test/CodeGen/X86/pr35761.ll
+++ b/test/CodeGen/X86/pr35761.ll
@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux %s -o - | FileCheck %s
+
+@x = global i8 0, align 1
+@y = global i32 0, align 4
+@z = global i24 0, align 4
+
+define void @PR35761(i32 %call) {
+; CHECK-LABEL: PR35761:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl {{.*}}(%rip), %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    movzbl {{.*}}(%rip), %ecx
+; CHECK-NEXT:    xorl $255, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    movw %cx, {{.*}}(%rip)
+; CHECK-NEXT:    movb $0, z+{{.*}}(%rip)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i8, i8* @x, align 1
+  %tobool = trunc i8 %0 to i1
+  %conv = zext i1 %tobool to i32
+  %or = or i32 32767, %call
+  %neg = xor i32 %or, -1
+  %neg1 = xor i32 %neg, -1
+  %1 = load i32, i32* @y, align 4
+  %xor = xor i32 %neg1, %1
+  %or2 = or i32 %conv, %xor
+  %conv3 = trunc i32 %or2 to i8
+  %bf.load = load i24, i24* @z, align 4
+  %2 = zext i8 %conv3 to i24
+  %bf.value = and i24 %2, 4194303
+  store i24 %bf.value, i24* @z, align 2
+  ret void
+}
+
--- a/test/CodeGen/X86/pr35972.ll
+++ b/test/CodeGen/X86/pr35972.ll
@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu %s -o - -mattr=avx512bw | FileCheck %s
+
+define void @test3(i32 %c, <64 x i1>* %ptr) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    sbbl %ecx, %ecx
+; CHECK-NEXT:    kmovd %ecx, %k0
+; CHECK-NEXT:    kunpckdq %k0, %k0, %k0
+; CHECK-NEXT:    kmovq %k0, (%eax)
+; CHECK-NEXT:    retl
+  %cmp = icmp eq i32 %c, 0
+  %insert = insertelement <64 x i1> undef, i1 %cmp, i32 0
+  %shuf = shufflevector <64 x i1> %insert, <64 x i1> undef, <64 x i32> zeroinitializer
+  store <64 x i1> %shuf, <64 x i1>* %ptr
+  ret void
+}
+
--- a/test/CodeGen/X86/pr37563.ll
+++ b/test/CodeGen/X86/pr37563.ll
@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+%struct.S = type <{ i16, i24, [5 x i8], i8, i16, [2 x i8] }>
+
+@z = global { i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] } { i16 -724, i8 94, i8 -18, i8 5, i8 undef, i8 96, i8 104, i8 -24, i8 10, i8 0, [5 x i8] undef }, align 8
+@tf_3_var_136 = global i64 0, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%llu\0A\00", align 1
+
+define void @PR35763() {
+; CHECK-LABEL: PR35763:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzwl {{.*}}(%rip), %eax
+; CHECK-NEXT:    movzwl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, {{.*}}(%rip)
+; CHECK-NEXT:    movl z+{{.*}}(%rip), %eax
+; CHECK-NEXT:    movzbl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
+; CHECK-NEXT:    movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
+; CHECK-NEXT:    andq %rcx, %rax
+; CHECK-NEXT:    movl %eax, z+{{.*}}(%rip)
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movb %al, z+{{.*}}(%rip)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i16, i16* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 0), align 8
+  %conv = sext i16 %0 to i32
+  %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 1) to i32*), align 2
+  %bf.clear = and i32 %bf.load, 2097151
+  %bf.cast = zext i32 %bf.clear to i64
+  %conv1 = trunc i64 %bf.cast to i32
+  %or = or i32 %conv, %conv1
+  %conv2 = trunc i32 %or to i16
+  %conv3 = zext i16 %conv2 to i64
+  store i64 %conv3, i64* @tf_3_var_136, align 8
+  %bf.load4 = load i40, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  %bf.clear5 = and i40 %bf.load4, -8589869057
+  store i40 %bf.clear5, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+  ret void
+}
--- a/test/CodeGen/X86/var-permute-128.ll
+++ b/test/CodeGen/X86/var-permute-128.ll
@ -207,13 +207,12 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
 define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
 ; SSSE3-LABEL: var_shuffle_v16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufb %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@ -1277,3 +1277,183 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
 }
+
+define <8 x i32> @pr35820(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
+; AVX1-LABEL: pr35820:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    movq %r8, %r10
+; AVX1-NEXT:    shrq $30, %r10
+; AVX1-NEXT:    vmovq %xmm1, %r9
+; AVX1-NEXT:    movq %r9, %rsi
+; AVX1-NEXT:    shrq $30, %rsi
+; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andl $3, %r9d
+; AVX1-NEXT:    andl $12, %esi
+; AVX1-NEXT:    andl $3, %r8d
+; AVX1-NEXT:    andl $12, %r10d
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    shrq $30, %rdi
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rdx
+; AVX1-NEXT:    shrq $30, %rdx
+; AVX1-NEXT:    andl $3, %ecx
+; AVX1-NEXT:    andl $12, %edx
+; AVX1-NEXT:    andl $3, %eax
+; AVX1-NEXT:    andl $12, %edi
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; INT256-LABEL: pr35820:
+; INT256:       # %bb.0: # %entry
+; INT256-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT:    retq
+entry:
+  %tmp1 = extractelement <8 x i32> %indices, i32 0
+  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
+  %tmp2 = extractelement <8 x i32> %indices, i32 1
+  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
+  %tmp3 = extractelement <8 x i32> %indices, i32 2
+  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
+  %tmp4 = extractelement <8 x i32> %indices, i32 3
+  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
+  %tmp5 = extractelement <8 x i32> %indices, i32 4
+  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
+  %tmp6 = extractelement <8 x i32> %indices, i32 5
+  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
+  %tmp7 = extractelement <8 x i32> %indices, i32 6
+  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
+  %tmp8 = extractelement <8 x i32> %indices, i32 7
+  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
+  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
+  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
+  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
+  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
+  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
+  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
+  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
+  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
+  ret <8 x i32> %tmp16
+}
+
+define <8 x float> @pr35820_float(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
+; AVX1-LABEL: pr35820_float:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    movq %r8, %r10
+; AVX1-NEXT:    shrq $30, %r10
+; AVX1-NEXT:    vmovq %xmm1, %r9
+; AVX1-NEXT:    movq %r9, %rdx
+; AVX1-NEXT:    shrq $30, %rdx
+; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andl $3, %r9d
+; AVX1-NEXT:    andl $12, %edx
+; AVX1-NEXT:    andl $3, %r8d
+; AVX1-NEXT:    andl $12, %r10d
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    shrq $30, %rdi
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rsi
+; AVX1-NEXT:    shrq $30, %rsi
+; AVX1-NEXT:    andl $3, %ecx
+; AVX1-NEXT:    andl $12, %esi
+; AVX1-NEXT:    andl $3, %eax
+; AVX1-NEXT:    andl $12, %edi
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; INT256-LABEL: pr35820_float:
+; INT256:       # %bb.0: # %entry
+; INT256-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT:    retq
+entry:
+  %tmp1 = extractelement <8 x i32> %indices, i32 0
+  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
+  %tmp2 = extractelement <8 x i32> %indices, i32 1
+  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
+  %tmp3 = extractelement <8 x i32> %indices, i32 2
+  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
+  %tmp4 = extractelement <8 x i32> %indices, i32 3
+  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
+  %tmp5 = extractelement <8 x i32> %indices, i32 4
+  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
+  %tmp6 = extractelement <8 x i32> %indices, i32 5
+  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
+  %tmp7 = extractelement <8 x i32> %indices, i32 6
+  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
+  %tmp8 = extractelement <8 x i32> %indices, i32 7
+  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
+  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
+  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
+  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
+  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
+  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
+  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
+  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
+  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
+  ret <8 x float> %tmp16
+}
+
+define <4 x i32> @big_source(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
+; AVX-LABEL: big_source:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    movq %rsp, %rbp
+; AVX-NEXT:    andq $-32, %rsp
+; AVX-NEXT:    subq $64, %rsp
+; AVX-NEXT:    vmovq %xmm1, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    shrq $30, %rcx
+; AVX-NEXT:    andl $28, %ecx
+; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX-NEXT:    movq %rdx, %rsi
+; AVX-NEXT:    sarq $32, %rsi
+; AVX-NEXT:    andl $7, %eax
+; AVX-NEXT:    andl $7, %edx
+; AVX-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX-NEXT:    andl $7, %esi
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT:    movq %rbp, %rsp
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+entry:
+  %tmp1 = extractelement <4 x i32> %indices, i32 0
+  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
+  %tmp2 = extractelement <4 x i32> %indices, i32 1
+  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
+  %tmp3 = extractelement <4 x i32> %indices, i32 2
+  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
+  %tmp4 = extractelement <4 x i32> %indices, i32 3
+  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
+  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
+  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
+  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
+  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
+  ret <4 x i32> %tmp12
+}
--- a/test/MC/COFF/cv-inline-linetable.s
+++ b/test/MC/COFF/cv-inline-linetable.s
@ -135,3 +135,29 @@ Ltmp1:
 	.cv_filechecksums               # File index to string table offset subsection
 	.cv_stringtable                 # String table

+# CHECK-LABEL:  FunctionLineTable [
+# CHECK:    LinkageName: ?baz@@YAXXZ
+# CHECK:    Flags: 0x1
+# CHECK:    CodeSize: 0x3D
+# CHECK:    FilenameSegment [
+# CHECK:      Filename: D:\src\llvm\build\t.cpp (0x0)
+# CHECK:      +0x0 [
+# CHECK:        LineNumberStart: 13
+# CHECK:      ]
+# CHECK:      +0x1 [
+# CHECK:        LineNumberStart: 14
+# CHECK:      ]
+# CHECK:      +0x8 [
+# CHECK:        LineNumberStart: 15
+# CHECK:      ]
+#	There shouldn't be any other line number entries because all the other
+#	.cv_locs are on line 15 where the top-level inline call site is.
+# CHECK-NOT: LineNumberStart
+# CHECK:      +0x34 [
+# CHECK:        LineNumberStart: 16
+# CHECK:      ]
+# CHECK:      +0x3B [
+# CHECK:        LineNumberStart: 17
+# CHECK:      ]
+# CHECK:    ]
+# CHECK:  ]
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@ -99,7 +99,8 @@
 // CHECK: shll $2, %eax
        sall $2, %eax

-// CHECK: rep movsb
+// CHECK: rep
+// CHECK-NEXT: movsb
 rep     # comment
 movsb

@ -1557,3 +1558,38 @@ ptwriteq 0xdeadbeef(%rbx,%rcx,8)
 // CHECK: ptwriteq %rax
 // CHECK:  encoding: [0xf3,0x48,0x0f,0xae,0xe0]
 ptwriteq %rax
+
+//  __asm __volatile(
+//    "pushf        \n\t"
+//    "popf       \n\t"
+//    "rep        \n\t"
+//    ".byte  0x0f, 0xa7, 0xd0"
+//  );
+// CHECK: pushfq
+// CHECK-NEXT: popfq
+// CHECK-NEXT: rep
+// CHECK-NEXT: .byte 15
+// CHECK-NEXT: .byte 167
+// CHECK-NEXT: .byte 208
+pushfq
+popfq
+rep
+.byte 15
+.byte 167
+.byte 208
+
+// CHECK: lock
+// CHECK: cmpxchgl
+        cmp $0, %edx
+        je 1f
+        lock
+1:      cmpxchgl %ecx,(%rdi)
+
+// CHECK: rep
+// CHECK-NEXT: byte
+rep
+.byte 0xa4      # movsb
+
+// CHECK: lock
+// This line has to be the last one in the file
+lock
--- a/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll
+++ b/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll
@ -0,0 +1,46 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%struct.CFVS = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.S = type { i8 }
+
+define void @_ZN4CFVSD2Ev(%struct.CFVS* %this) unnamed_addr align 2 !dbg !8 {
+entry:
+  %this.addr = alloca %struct.CFVS*, align 8
+  store %struct.CFVS* %this, %struct.CFVS** %this.addr, align 8
+  %this1 = load %struct.CFVS*, %struct.CFVS** %this.addr, align 8
+  %m_val = getelementptr inbounds %struct.CFVS, %struct.CFVS* %this1, i32 0, i32 0
+  ret void
+}
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-b.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~CFVS", linkageName: "_ZN4CFVSD2Ev", scope: !9, file: !1, line: 2, type: !28, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !27, variables: !2)
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !10, line: 7, size: 8, elements: !11, identifier: "_ZTS4CFVS")
+!10 = !DIFile(filename: "./bz188598.h", directory: "")
+!11 = !{!12, !27}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !9, file: !10, line: 9, baseType: !13, size: 8)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !10, line: 4, size: 8, elements: !14, templateParams: !19, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
+!14 = !{!35}
+!19 = !{!20}
+!20 = !DITemplateValueParameter(name: "F", type: !21, value: %struct.S* ()* @_Z3Getv)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64)
+!22 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !10, line: 2, baseType: !23)
+!23 = !DISubroutineType(types: !24)
+!24 = !{!35}
+!27 = !DISubprogram(name: "~CFVS", scope: !9, file: !10, line: 8, type: !28, isLocal: false, isDefinition: false, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false)
+!28 = !DISubroutineType(types: !29)
+!29 = !{null, !30}
+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!35 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
--- a/test/ThinLTO/X86/dicompositetype-unique2.ll
+++ b/test/ThinLTO/X86/dicompositetype-unique2.ll
@ -0,0 +1,69 @@
+; RUN: opt -module-summary -o %t1.bc %s
+; RUN: opt -module-summary -o %t2.bc %S/Inputs/dicompositetype-unique2.ll
+; RUN: llvm-lto --thinlto-action=run %t1.bc %t2.bc -thinlto-save-temps=%t3.
+; RUN: llvm-dis %t3.0.3.imported.bc -o - | FileCheck %s
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t --save-temps \
+; RUN: -r %t1.bc,_ZN1CD2Ev,pl \
+; RUN: -r %t1.bc,_ZN4CFVSD2Ev,l \
+; RUN: -r %t1.bc,_Z3Getv,l \
+; RUN: -r %t2.bc,_ZN4CFVSD2Ev,pl \
+; RUN: -r %t2.bc,_Z3Getv,l
+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s
+
+; Only llvm-lto2 adds the dso_local keyword, hence the {{.*}}
+; CHECK: define available_externally{{.*}} void @_ZN4CFVSD2Ev
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%class.C = type <{ i32 (...)**, %class.A, %struct.CFVS, [6 x i8] }>
+%class.A = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.CFVS = type { %struct.Vec }
+%struct.S = type { i8 }
+
+define void @_ZN1CD2Ev(%class.C* %this) unnamed_addr align 2 !dbg !8 {
+entry:
+  %this.addr = alloca %class.C*, align 8
+  %this1 = load %class.C*, %class.C** %this.addr, align 8
+  %m = getelementptr inbounds %class.C, %class.C* %this1, i32 0, i32 2
+  call void @_ZN4CFVSD2Ev(%struct.CFVS* %m), !dbg !50
+  ret void
+}
+
+declare void @_ZN4CFVSD2Ev(%struct.CFVS*) unnamed_addr
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-a.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~C", linkageName: "_ZN1CD2Ev", scope: !9, file: !1, line: 9, type: !47, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !46, variables: !2)
+!9 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C", file: !1, line: 5, size: 128, elements: !10, vtableHolder: !9, identifier: "_ZTS1C")
+!10 = !{!38, !46}
+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !16, line: 4, size: 8, elements: !17, templateParams: !22, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
+!16 = !DIFile(filename: "./bz188598.h", directory: ".")
+!17 = !{!55}
+!22 = !{!23}
+!23 = !DITemplateValueParameter(name: "F", type: !24, value: %struct.S* ()* @_Z3Getv)
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !25, size: 64)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !16, line: 2, baseType: !26)
+!26 = !DISubroutineType(types: !27)
+!27 = !{!55}
+!38 = !DIDerivedType(tag: DW_TAG_member, name: "m", scope: !9, file: !1, line: 7, baseType: !39, size: 8, offset: 72)
+!39 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !16, line: 7, size: 8, elements: !40, identifier: "_ZTS4CFVS")
+!40 = !{!41}
+!41 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !39, file: !16, line: 9, baseType: !15, size: 8)
+!46 = !DISubprogram(name: "~C", scope: !9, file: !1, line: 6, type: !47, isLocal: false, isDefinition: false, scopeLine: 6, containingType: !9, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: false)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!55}
+!50 = !DILocation(line: 9, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9)
+!55 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
--- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
@ -0,0 +1,19 @@
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-select=true  %s | FileCheck %s --check-prefix=CHECK
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Select when both offset and scale reg are present.
+define i64 @test1(i1 %c, i64* %b, i64 %scale) {
+; CHECK-LABEL: @test1
+entry:
+; CHECK-LABEL: entry:
+  %g = getelementptr inbounds i64, i64* %b, i64 %scale
+  %g1 = getelementptr inbounds i64, i64* %g, i64 8
+  %g2 = getelementptr inbounds i64, i64* %g, i64 16
+  %s = select i1 %c, i64* %g1, i64* %g2
+; CHECK-NOT: sunkaddr
+  %v = load i64 , i64* %s, align 8
+  ret i64 %v
+}
+
--- a/test/Transforms/GVNHoist/pr35222-hoist-load.ll
+++ b/test/Transforms/GVNHoist/pr35222-hoist-load.ll
@ -1,4 +1,5 @@
 ; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+; CHECK-LABEL: build_tree
 ; CHECK: load
 ; CHECK: load
 ; Check that the load is not hoisted because the call can potentially
@ -23,3 +24,47 @@ do.end:                                           ; preds = %do.body
 }

 declare i1 @pqdownheap(i32)
+
+@i = external hidden unnamed_addr global i32, align 4
+@j = external hidden unnamed_addr global [573 x i32], align 4
+@v = external global i1
+
+; CHECK-LABEL: test
+; CHECK-LABEL: do.end
+; CHECK: load
+; Check that the load is not hoisted because the call can potentially
+; modify the global
+
+define i32 @test() {
+entry:
+  br label %for.cond
+
+for.cond:
+  %a3 = load volatile i1, i1* @v
+  br i1 %a3, label %for.body, label %while.end
+
+for.body:
+  br label %if.then
+
+if.then:
+  %tmp4 = load i32, i32* @i, align 4
+  br label %for.cond
+
+while.end:
+  br label %do.body
+
+do.body:
+  %tmp9 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
+i32 0, i32 1), align 4
+  %tmp10 = load i32, i32* @i, align 4
+  call void @fn()
+  %a1 = load volatile i1, i1* @v
+  br i1 %a1, label %do.body, label %do.end
+
+do.end:
+  %tmp20 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
+i32 0, i32 1), align 4
+  ret i32 %tmp20
+}
+
+declare void @fn()
--- a/test/Transforms/JumpThreading/ddt-crash3.ll
+++ b/test/Transforms/JumpThreading/ddt-crash3.ll
@ -0,0 +1,43 @@
+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external local_unnamed_addr global i64, align 8
+@global.1 = external local_unnamed_addr global i64, align 8
+@global.2 = external local_unnamed_addr global i64, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @hoge() local_unnamed_addr #0 {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb26, %bb
+  %tmp = load i64, i64* @global, align 8, !tbaa !1
+  %tmp2 = icmp eq i64 %tmp, 0
+  br i1 %tmp2, label %bb27, label %bb3
+
+bb3:                                              ; preds = %bb1
+  %tmp4 = load i64, i64* @global.1, align 8, !tbaa !1
+  %tmp5 = icmp eq i64 %tmp4, 0
+  br i1 %tmp5, label %bb23, label %bb23
+
+bb23:                                             ; preds = %bb3, %bb3
+  br label %bb26
+
+bb26:                                             ; preds = %bb27, %bb23
+  br label %bb1
+
+bb27:                                             ; preds = %bb1
+  br label %bb26
+}
+
+attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 7.0.0 "}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"long", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
--- a/test/Transforms/JumpThreading/ddt-crash4.ll
+++ b/test/Transforms/JumpThreading/ddt-crash4.ll
@ -0,0 +1,75 @@
+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
+@global = external global i64, align 8
+
+define void @f() {
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = load i64, i64* @global, align 8
+  %tmp2 = icmp eq i64 %tmp, 0
+  br i1 %tmp2, label %bb27, label %bb3
+
+bb3:
+  %tmp4 = load i64, i64* @global, align 8
+  %tmp5 = icmp eq i64 %tmp4, 0
+  br i1 %tmp5, label %bb6, label %bb7
+
+bb6:
+  br label %bb7
+
+bb7:
+  %tmp8 = phi i1 [ true, %bb3 ], [ undef, %bb6 ]
+  %tmp9 = select i1 %tmp8, i64 %tmp4, i64 0
+  br i1 false, label %bb10, label %bb23
+
+bb10:
+  %tmp11 = load i64, i64* @global, align 8
+  %tmp12 = icmp slt i64 %tmp11, 5
+  br i1 %tmp12, label %bb13, label %bb17
+
+bb13:
+  br label %bb14
+
+bb14:
+  br i1 undef, label %bb15, label %bb16
+
+bb15:
+  unreachable
+
+bb16:
+  br label %bb10
+
+bb17:
+  br label %bb18
+
+bb18:
+  br i1 undef, label %bb22, label %bb13
+
+bb19:
+  br i1 undef, label %bb20, label %bb21
+
+bb20:
+  unreachable
+
+bb21:
+  br label %bb18
+
+bb22:
+  br label %bb23
+
+bb23:
+  br i1 undef, label %bb24, label %bb13
+
+bb24:
+  br i1 undef, label %bb26, label %bb25
+
+bb25:
+  br label %bb19
+
+bb26:
+  br label %bb1
+
+bb27:
+  br label %bb24
+}
--- a/test/Transforms/LoopVectorize/pr35773.ll
+++ b/test/Transforms/LoopVectorize/pr35773.ll
@ -0,0 +1,53 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@a = common local_unnamed_addr global i32 0, align 4
+@b = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit1() local_unnamed_addr{
+entry:
+  br label %for.body
+
+for.body:
+  %main.iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+
+  %i8.iv = phi i8 [ 0, %entry ], [ %i8.add, %for.body ]
+  %i32.iv = phi i32 [ 0, %entry ], [ %i32.add, %for.body ]
+
+  %trunc.to.be.converted.to.new.iv = trunc i32 %i32.iv to i8
+  %i8.add = add i8 %i8.iv, %trunc.to.be.converted.to.new.iv
+
+  %noop.conv.under.pse = and i32 %i32.iv, 255
+  %i32.add = add nuw nsw i32 %noop.conv.under.pse, 9
+
+  %inc = add i32 %main.iv, 1
+  %tobool = icmp eq i32 %inc, 16
+  br i1 %tobool, label %for.cond.for.end_crit_edge, label %for.body
+
+; CHECK-LABEL: @doit1(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT:    [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
+
+; CHECK-NEXT:    [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
+
+; CHECK-NEXT:    [[MAIN_IV_NEXT]] = add i32 [[MAIN_IV]], 4
+; CHECK-NEXT:    [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
+; CHECK-NEXT:    [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+
+for.cond.for.end_crit_edge:
+  store i8 %i8.add, i8* @b, align 1
+  br label %for.end
+
+for.end:
+  ret void
+}
--- a/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @mainTest(i32* %ptr) #0  {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 1, undef
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], undef
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], undef
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], [[TMP6]]
+; CHECK-NEXT:    [[OP_EXTRA5]] = add i32 [[OP_EXTRA4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP15]], undef
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       bail_out:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32* %ptr, null
+  br i1 %cmp, label %loop, label %bail_out
+
+loop:
+  %dummy_phi = phi i32 [ 1, %entry ], [ %18, %loop ]
+  %0 = load i32, i32 * %ptr , align 4
+  %1 = mul i32 %0, %0
+  %2 = add i32 1, %1
+  %3 = getelementptr inbounds i32, i32 * %ptr, i64 1
+  %4 = load i32, i32 * %3 , align 4
+  %5 = mul i32 %4, %4
+  %6 = add i32 %2, %4
+  %7 = add i32 %6, %5
+  %8 = getelementptr inbounds i32, i32 *%ptr, i64 2
+  %9 = load i32, i32 * %8 , align 4
+  %10 = mul i32 %9, %9
+  %11 = add i32 %7, %9
+  %12 = add i32 %11, %10
+  %13 = sext i32 %9 to i64
+  %14 = getelementptr inbounds i32, i32 *%ptr, i64 3
+  %15 = load i32, i32 * %14 , align 4
+  %16 = mul i32 %15, %15
+  %17 = add i32 %12, %15
+  %18 = add i32 %17, %16
+  br label %loop
+
+bail_out:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="westmere" }
+
--- a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @test() #0 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP6:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DUMMY_ADD:%.*]] = add i16 0, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 3, i64 2, i64 1, i64 0>, [[TMP4]]
+; CHECK-NEXT:    [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[SUM1:%.*]] = add i64 undef, undef
+; CHECK-NEXT:    [[SUM2:%.*]] = add i64 [[SUM1]], undef
+; CHECK-NEXT:    [[ZSUM:%.*]] = add i64 [[SUM2]], 0
+; CHECK-NEXT:    [[JOIN:%.*]] = add i64 undef, [[ZSUM]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NEXT:    [[OP_EXTRA3]] = add i64 [[OP_EXTRA]], [[TMP6]]
+; CHECK-NEXT:    [[LAST:%.*]] = add i64 [[JOIN]], undef
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %dummy_phi = phi i64 [ 1, %entry ], [ %last, %loop ]
+  %0 = phi i64 [ 2, %entry ], [ %fork, %loop ]
+  %inc1 = add i64 %0, 1
+  %inc2 = add i64 %0, 2
+  %inc11 = add i64 1, %inc1
+  %exact1 = ashr exact i64 %inc11, 32
+  %inc3 = add i64 %0, 3
+  %dummy_add = add i16 0, 0
+  %inc12 = add i64 1, %inc2
+  %exact2 = ashr exact i64 %inc12, 32
+  %dummy_shl = shl i64 %inc3, 32
+  %inc13 = add i64 1, %inc3
+  %exact3 = ashr exact i64 %inc13, 32
+  %fork = add i64 %0, 0
+  %sum1 = add i64 %exact3, %exact2
+  %sum2 = add i64 %sum1, %exact1
+  %zsum = add i64 %sum2, 0
+  %sext22 = add i64 1, %fork
+  %exact4 = ashr exact i64 %sext22, 32
+  %join = add i64 %fork, %zsum
+  %last = add i64 %join, %exact4
+  br label %loop
+}
+
--- a/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35777.ll
@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s
+
+@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16
+
+define { i64, i64 } @patatino(double %arg) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
+; CHECK-NEXT:    ret { i64, i64 } [[TMP17]]
+;
+bb:
+  %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
+  %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
+  %tmp2 = fmul double %tmp1, %arg
+  %tmp3 = fadd double %tmp, %tmp2
+  %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
+  %tmp5 = fadd double %tmp4, %tmp3
+  %tmp6 = fptosi double %tmp5 to i32
+  %tmp7 = sext i32 %tmp6 to i64
+  %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
+  %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
+  %tmp10 = fmul double %tmp9, %arg
+  %tmp11 = fadd double %tmp8, %tmp10
+  %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
+  %tmp13 = fadd double %tmp12, %tmp11
+  %tmp14 = fptosi double %tmp13 to i32
+  %tmp15 = sext i32 %tmp14 to i64
+  %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
+  %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
+  ret { i64, i64 } %tmp17
+}
--- a/test/Transforms/SLPVectorizer/X86/PR35865.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35865.ll
@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
+; CHECK-NEXT:    [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
+; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT:    [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
+; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <16 x half> undef, i32 4
+  %conv.i.4.i = fpext half %0 to float
+  %1 = bitcast float %conv.i.4.i to i32
+  %vecins.i.4.i = insertelement <8 x i32> undef, i32 %1, i32 4
+  %2 = extractelement <16 x half> undef, i32 5
+  %conv.i.5.i = fpext half %2 to float
+  %3 = bitcast float %conv.i.5.i to i32
+  %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
+  ret void
+}
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@ -7,8 +7,8 @@ target triple = "x86_64-apple-macosx10.8.0"

 define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -20,8 +20,8 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -64,18 +64,18 @@ declare void @llvm.assume(i1) nounwind
 ; This entire tree is ephemeral, don't vectorize any of it.
 define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_eph(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
 ; CHECK-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@ -100,18 +100,18 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; CHECK-NEXT:    ret <4 x float> undef
 ;
 ; ZEROTHRESH-LABEL: @simple_select_eph(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
 ; ZEROTHRESH-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
 ; ZEROTHRESH-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
 ; ZEROTHRESH-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@ -175,8 +175,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; doesn't matter
 define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_insert_out_of_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -188,8 +188,8 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select_insert_out_of_order(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -233,8 +233,8 @@ declare void @f32_user(float) #0
 ; Multiple users of the final constructed vector
 define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_users(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -247,8 +247,8 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select_users(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@ -291,18 +291,18 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
 ; Unused insertelement
 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_no_users(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@ -330,18 +330,18 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; CHECK-NEXT:    ret <4 x float> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select_no_users(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
 ; ZEROTHRESH-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
 ; ZEROTHRESH-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
 ; ZEROTHRESH-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@ -387,25 +387,25 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; to do this backwards this backwards
 define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
 ; CHECK-LABEL: @reconstruct(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0
+; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1
+; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2
+; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3
 ; CHECK-NEXT:    ret <4 x i32> [[RD]]
 ;
 ; ZEROTHRESH-LABEL: @reconstruct(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
-; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
-; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
+; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT:    [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
+; ZEROTHRESH-NEXT:    [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; ZEROTHRESH-NEXT:    [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; ZEROTHRESH-NEXT:    [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
 ; ZEROTHRESH-NEXT:    ret <4 x i32> [[RD]]
 ;
  %c0 = extractelement <4 x i32> %c, i32 0
@ -421,8 +421,8 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {

 define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_v2(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> %c, zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> %a, <2 x float> %b
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
@ -430,12 +430,12 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
 ; CHECK-NEXT:    ret <2 x float> [[RB]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select_v2(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <2 x i32> %c, i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <2 x i32> %c, i32 1
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <2 x float> %a, i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <2 x float> %a, i32 1
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <2 x float> %b, i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <2 x float> %b, i32 1
+; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
 ; ZEROTHRESH-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
 ; ZEROTHRESH-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
 ; ZEROTHRESH-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
@ -464,12 +464,12 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
 ; (low cost threshold needed to force this to happen)
 define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select_partial_vector(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@ -485,12 +485,12 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
 ; CHECK-NEXT:    ret <4 x float> [[RB]]
 ;
 ; ZEROTHRESH-LABEL: @simple_select_partial_vector(
-; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> %b, i32 1
+; ZEROTHRESH-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
 ; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@ -530,7 +530,7 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
 ; must be rescheduled. The case here is from compiling Julia.
 define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @reschedule_extract(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@ -542,7 +542,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
 ; ZEROTHRESH-LABEL: @reschedule_extract(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; ZEROTHRESH-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@ -576,7 +576,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; instructions that are erased.
 define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @take_credit(
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@ -588,7 +588,7 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
 ; ZEROTHRESH-LABEL: @take_credit(
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; ZEROTHRESH-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@ -622,10 +622,10 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-LABEL: @multi_tree(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
@ -640,10 +640,10 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ;
 ; ZEROTHRESH-LABEL: @multi_tree(
 ; ZEROTHRESH-NEXT:  entry:
-; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
-; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
-; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
-; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
 ; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
 ; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
@ -675,7 +675,7 @@ entry:
 define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: @_vadd256(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
@ -696,7 +696,7 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
 ;
 ; ZEROTHRESH-LABEL: @_vadd256(
 ; ZEROTHRESH-NEXT:  entry:
-; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; ZEROTHRESH-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
 ; ZEROTHRESH-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
 ; ZEROTHRESH-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
--- a/test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ b/test/Transforms/SLPVectorizer/X86/insertvalue.ll
@ -1,11 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s

-; CHECK-LABEL: julia_2xdouble
-; CHECK: load <2 x double>
-; CHECK: load <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK: fadd <2 x double>
 define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+; CHECK-LABEL: @julia_2xdouble(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
+; CHECK-NEXT:    store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
  %x0 = load double, double* %px0, align 4
@ -29,12 +48,40 @@ top:
  ret void
 }

-; CHECK-LABEL: julia_4xfloat
-; CHECK: load <4 x float>
-; CHECK: load <4 x float>
-; CHECK: fmul <4 x float>
-; CHECK: fadd <4 x float>
 define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+; CHECK-LABEL: @julia_4xfloat(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PX1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT:    [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
+; CHECK-NEXT:    [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
+; CHECK-NEXT:    [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT:    [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT:    [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
+; CHECK-NEXT:    [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
+; CHECK-NEXT:    store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
  %x0 = load float, float* %px0, align 4
@ -76,9 +123,27 @@ top:
  ret void
 }

-; CHECK-LABEL: julia_load_array_of_float
-; CHECK: fsub <4 x float>
 define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+; CHECK-LABEL: @julia_load_array_of_float(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x float], [4 x float]* [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x float], [4 x float]* [[B]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
+; CHECK-NEXT:    store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %a_arr = load [4 x float], [4 x float]* %a, align 4
  %a0 = extractvalue [4 x float] %a_arr, 0
@ -102,11 +167,27 @@ top:
  ret void
 }

-; CHECK-LABEL: julia_load_array_of_i32
-; CHECK: load <4 x i32>
-; CHECK: load <4 x i32>
-; CHECK: sub <4 x i32>
 define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i32(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x i32], [4 x i32]* [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x i32], [4 x i32]* [[B]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
+; CHECK-NEXT:    store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %a_arr = load [4 x i32], [4 x i32]* %a, align 4
  %a0 = extractvalue [4 x i32] %a_arr, 0
@ -132,9 +213,30 @@ top:

 ; Almost identical to previous test, but for type that should NOT be vectorized.
 ;
-; CHECK-LABEL: julia_load_array_of_i16
-; CHECK-NOT: i2>
 define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i16(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
+; CHECK-NEXT:    [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
+; CHECK-NEXT:    [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
+; CHECK-NEXT:    [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
+; CHECK-NEXT:    [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
+; CHECK-NEXT:    [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
+; CHECK-NEXT:    [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
+; CHECK-NEXT:    [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
+; CHECK-NEXT:    [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
+; CHECK-NEXT:    [[C1:%.*]] = sub i16 [[A1]], [[B1]]
+; CHECK-NEXT:    [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
+; CHECK-NEXT:    [[C0:%.*]] = sub i16 [[A0]], [[B0]]
+; CHECK-NEXT:    [[C2:%.*]] = sub i16 [[A2]], [[B2]]
+; CHECK-NEXT:    [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
+; CHECK-NEXT:    [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
+; CHECK-NEXT:    [[C3:%.*]] = sub i16 [[A3]], [[B3]]
+; CHECK-NEXT:    [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
+; CHECK-NEXT:    [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
+; CHECK-NEXT:    store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %a_arr = load [4 x i16], [4 x i16]* %a, align 4
  %a0 = extractvalue [4 x i16] %a_arr, 0
@ -160,11 +262,27 @@ top:

 %pseudovec = type { float, float, float, float }

-; CHECK-LABEL: julia_load_struct_of_float
-; CHECK: load <4 x float>
-; CHECK: load <4 x float>
-; CHECK: fsub <4 x float>
 define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+; CHECK-LABEL: @julia_load_struct_of_float(
+; CHECK-NEXT:  top:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[A_STRUCT:%.*]] = load [[PSEUDOVEC:%.*]], %pseudovec* [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[B_STRUCT:%.*]] = load [[PSEUDOVEC]], %pseudovec* [[B]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT:    [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct0, float [[TMP6]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT:    [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct1, float [[TMP7]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT:    [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct2, float [[TMP8]], 3
+; CHECK-NEXT:    store [[PSEUDOVEC]] %c_struct3, %pseudovec* [[C:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 top:
  %a_struct = load %pseudovec, %pseudovec* %a, align 4
  %a0 = extractvalue %pseudovec %a_struct, 0
--- a/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@ -1,15 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-grtev3-linux-gnu"

 ; We used to crash on this example because we were building a constant
 ; expression during vectorization and the vectorizer expects instructions
 ; as elements of the vectorized tree.
-; CHECK-LABEL: @test
 ; PR19621

 define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  bb279:
+; CHECK-NEXT:    br label [[BB283:%.*]]
+; CHECK:       bb283:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ]
+; CHECK-NEXT:    br label [[BB284:%.*]]
+; CHECK:       bb284:
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
+; CHECK-NEXT:    br label [[BB21_I:%.*]]
+; CHECK:       bb21.i:
+; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
+; CHECK:       bb22.i:
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
+; CHECK-NEXT:    br label [[BB32_I:%.*]]
+; CHECK:       bb32.i:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]]
+; CHECK-NEXT:    [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
+; CHECK-NEXT:    [[TMP317:%.*]] = fptrunc double undef to float
+; CHECK-NEXT:    [[TMP319:%.*]] = fptrunc double undef to float
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0
+; CHECK-NEXT:    [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1
+; CHECK-NEXT:    br label [[BB283]]
+;
 bb279:
  br label %bb283

@ -62,6 +93,12 @@ exit:
 ; vectorizer starts at the type (%t2, %t3) and wil constant fold the tree.
 ; The code that handles insertelement instructions must handle this.
 define <4 x double> @constant_folding() {
+; CHECK-LABEL: @constant_folding(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
+; CHECK-NEXT:    ret <4 x double> [[I2]]
+;
 entry:
  %t0 = fadd double 1.000000e+00 , 0.000000e+00
  %t1 = fadd double 1.000000e+00 , 1.000000e+00
@ -71,10 +108,3 @@ entry:
  %i2 = insertelement <4 x double> %i1, double %t3, i32 0
  ret <4 x double> %i2
 }
-
-; CHECK-LABEL: @constant_folding
-; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0
-; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1
-; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1
-; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0
-; CHECK: ret <4 x double> %[[V3]]
--- a/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll
+++ b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll
@ -0,0 +1,77 @@
+; XFAIL: *
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s
+
+; FIXME: Merge into backedge-id-bug
+; Variant which has an issue with region construction
+
+define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
+entry:
+  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+  br label %LOOP.HEADER
+
+LOOP.HEADER:
+  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+  %tmp12 = zext i32 %i to i64
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+  %tmp16 = and i32 %tmp15, 65535
+  %tmp17 = icmp eq i32 %tmp16, 1
+  br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+  %tmp19 = extractelement <2 x i32> %tmp, i64 0
+  %tmp22 = lshr i32 %tmp19, 16
+  %tmp24 = urem i32 %tmp22, 52
+  %tmp25 = mul nuw nsw i32 %tmp24, 52
+  br label %INNER_LOOP
+
+INNER_LOOP:
+  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+  call void asm sideeffect "; inner loop body", ""() #0
+  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+  br label %END_ELSE_BLOCK
+
+bb62:
+  %load13 = icmp ult i32 %tmp16, 271
+  ;br i1 %load13, label %bb64, label %INCREMENT_I
+  ; branching directly to the return avoids the bug
+  br i1 %load13, label %RETURN, label %INCREMENT_I
+
+
+bb64:
+  call void asm sideeffect "s_nop 42", "~{memory}"() #0
+  br label %RETURN
+
+INCREMENT_I:
+  %inc.i = add i32 %i, 1
+  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+  br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+  %cmp.end.else.block = icmp eq i32 %i.final, -1
+  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }
--- a/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
+++ b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
+
+; StructurizeCFG::orderNodes used an arbitrary and nonsensical sorting
+; function which broke the basic backedge identification algorithm. It
+; would use RPO order, but then do a weird partial sort by the loop
+; depth assuming blocks are sorted by loop. However a block can appear
+; in between blocks of a loop that is not part of a loop, breaking the
+; assumption of the sort.
+;
+; The collectInfos must be done in RPO order. The actual
+; structurization order I think is less important, but unless the loop
+; headers are identified in RPO order, it finds the wrong set of back
+; edges.
+
+define amdgpu_kernel void @loop_backedge_misidentified(i32 addrspace(1)* %arg0) #0 {
+; CHECK-LABEL: @loop_backedge_misidentified(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+; CHECK-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
+; CHECK-NEXT:    [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       LOOP.HEADER:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[FLOW4:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x100b
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       bb18:
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = urem i32 [[TMP22]], 52
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
+; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW4]]
+; CHECK:       INNER_LOOP:
+; CHECK-NEXT:    [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
+; CHECK-NEXT:    [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
+; CHECK-NEXT:    br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
+; CHECK:       INNER_LOOP_BREAK:
+; CHECK-NEXT:    [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT:    call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+; CHECK-NEXT:    br label [[FLOW3:%.*]]
+; CHECK:       bb62:
+; CHECK-NEXT:    [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i1 [[LOAD13]], true
+; CHECK-NEXT:    br i1 [[TMP3]], label [[INCREMENT_I:%.*]], label [[FLOW1:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I]] ], [ undef, [[BB62]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[BB64:%.*]], label [[FLOW2:%.*]]
+; CHECK:       bb64:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 42", "~{memory}"() #0
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP7]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB18]], label [[FLOW3]]
+; CHECK:       INCREMENT_I:
+; CHECK-NEXT:    [[INC_I]] = add i32 [[I]], 1
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1336
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       END_ELSE_BLOCK:
+; CHECK-NEXT:    [[I_FINAL:%.*]] = phi i32 [ [[TMP1]], [[FLOW3]] ]
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x1337
+; CHECK-NEXT:    [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP10]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW3]] ]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[RETURN:%.*]], label [[LOOP_HEADER]]
+; CHECK:       RETURN:
+; CHECK-NEXT:    call void asm sideeffect "s_nop 0x99
+; CHECK-NEXT:    store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+  %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+  %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+  br label %LOOP.HEADER
+
+LOOP.HEADER:
+  %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+  call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+  %tmp12 = zext i32 %i to i64
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+  %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+  %tmp16 = and i32 %tmp15, 65535
+  %tmp17 = icmp eq i32 %tmp16, 1
+  br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+  %tmp19 = extractelement <2 x i32> %tmp, i64 0
+  %tmp22 = lshr i32 %tmp19, 16
+  %tmp24 = urem i32 %tmp22, 52
+  %tmp25 = mul nuw nsw i32 %tmp24, 52
+  br label %INNER_LOOP
+
+INNER_LOOP:
+  %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+  call void asm sideeffect "; inner loop body", ""() #0
+  %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+  %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+  br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+  %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+  call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+  br label %END_ELSE_BLOCK
+
+bb62:
+  %load13 = icmp ult i32 %tmp16, 271
+  br i1 %load13, label %bb64, label %INCREMENT_I
+
+bb64:
+  call void asm sideeffect "s_nop 42", "~{memory}"() #0
+  br label %RETURN
+
+INCREMENT_I:
+  %inc.i = add i32 %i, 1
+  call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+  br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+  %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+  call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+  %cmp.end.else.block = icmp eq i32 %i.final, -1
+  br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+  call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+  store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+  ret void
+}
+
+; The same function, except break to return block goes directly to the
+; return, which managed to hide the bug.
+; FIXME: Merge variant from backedge-id-bug-xfail
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }
--- a/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
+++ b/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
--- a/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/test/Transforms/StructurizeCFG/nested-loop-order.ll
@ -1,32 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -structurizecfg %s -o - | FileCheck %s

 define void @main(float addrspace(1)* %out) {
-
-; CHECK: main_body:
-; CHECK: br label %LOOP.outer
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    br label [[LOOP_OUTER:%.*]]
+; CHECK:       LOOP.outer:
+; CHECK-NEXT:    [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT:    [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       LOOP:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
+; CHECK-NEXT:    [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP22]], true
+; CHECK-NEXT:    br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
+; CHECK:       ENDLOOP:
+; CHECK-NEXT:    [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
+; CHECK-NEXT:    [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
+; CHECK-NEXT:    store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       ENDIF:
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT:    br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
+; CHECK:       IF29:
+; CHECK-NEXT:    [[TMP32]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
+; CHECK-NEXT:    [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
+; CHECK:       ENDIF28:
+; CHECK-NEXT:    [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
+; CHECK-NEXT:    [[TMP36]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT:    br label [[FLOW1]]
+;
 main_body:
  br label %LOOP.outer

-; CHECK: LOOP.outer:
-; CHECK: br label %LOOP
 LOOP.outer:                                       ; preds = %ENDIF28, %main_body
  %temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
  %temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
  br label %LOOP

-; CHECK: LOOP:
-; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
 LOOP:                                             ; preds = %IF29, %LOOP.outer
  %temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
  %tmp20 = add i32 %temp4.0, 1
  %tmp22 = icmp sgt i32 %tmp20, 3
  br i1 %tmp22, label %ENDLOOP, label %ENDIF

-; CHECK: Flow3
-; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
-
-; CHECK: ENDLOOP:
-; CHECK: ret void
 ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LOOP
  %temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
  %tmp23 = icmp eq i32 %tmp20, 3
@ -34,29 +78,14 @@ ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LO
  store float %.45, float addrspace(1)* %out
  ret void

-; CHECK: ENDIF:
-; CHECK: br i1 %tmp31, label %IF29, label %Flow1
 ENDIF:                                            ; preds = %LOOP
  %tmp31 = icmp sgt i32 %tmp20, 1
  br i1 %tmp31, label %IF29, label %ENDIF28

-; CHECK: Flow:
-; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
-
-; CHECK: IF29:
-; CHECK: br label %Flow1
 IF29:                                             ; preds = %ENDIF
  %tmp32 = icmp sgt i32 %tmp20, 2
  br i1 %tmp32, label %ENDLOOP, label %LOOP

-; CHECK: Flow1:
-; CHECK: br label %Flow
-
-; CHECK: Flow2:
-; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
-
-; CHECK: ENDIF28:
-; CHECK: br label %Flow3
 ENDIF28:                                          ; preds = %ENDIF
  %tmp35 = fadd float %temp8.0.ph, 1.0
  %tmp36 = icmp sgt i32 %tmp20, 2
--- a/test/tools/llvm-readobj/macho-needed-libs.test
+++ b/test/tools/llvm-readobj/macho-needed-libs.test
@ -0,0 +1,26 @@
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-readobj -needed-libs %t.o | FileCheck %s
+
+# CHECK:      NeededLibraries [
+# CHECK-NEXT:   /usr/lib/libSystem.B.dylib
+# CHECK-NEXT: ]
+
+!mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x00000001
+  ncmds:           1
+  sizeofcmds:      56
+  flags:           0x00002000
+  reserved:        0x00000000
+LoadCommands:
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 81985536
+      compatibility_version: 65536
+    PayloadString:   /usr/lib/libSystem.B.dylib
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp
@ -39,6 +39,8 @@ public:
  void printUnwindInfo() override;
  void printStackMap() const override;

+  void printNeededLibraries() override;
+
  // MachO-specific.
  void printMachODataInCode() override;
  void printMachOVersionMin() override;
@ -675,6 +677,34 @@ void MachODumper::printStackMap() const {
                         StackMapV2Parser<support::big>(StackMapContentsArray));
 }

+void MachODumper::printNeededLibraries() {
+  ListScope D(W, "NeededLibraries");
+
+  using LibsTy = std::vector<StringRef>;
+  LibsTy Libs;
+
+  for (const auto &Command : Obj->load_commands()) {
+    if (Command.C.cmd == MachO::LC_LOAD_DYLIB ||
+        Command.C.cmd == MachO::LC_ID_DYLIB ||
+        Command.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
+        Command.C.cmd == MachO::LC_REEXPORT_DYLIB ||
+        Command.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
+        Command.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) {
+      MachO::dylib_command Dl = Obj->getDylibIDLoadCommand(Command);
+      if (Dl.dylib.name < Dl.cmdsize) {
+        auto *P = static_cast<const char*>(Command.Ptr) + Dl.dylib.name;
+        Libs.push_back(P);
+      }
+    }
+  }
+
+  std::stable_sort(Libs.begin(), Libs.end());
+
+  for (const auto &L : Libs) {
+    outs() << "  " << L << "\n";
+  }
+}
+
 void MachODumper::printMachODataInCode() {
  for (const auto &Load : Obj->load_commands()) {
    if (Load.C.cmd  == MachO::LC_DATA_IN_CODE) {
--- a/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
+++ b/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
@ -258,3 +258,98 @@ TEST(DominatorTreeBatchUpdates, InsertDeleteExhaustive) {
    EXPECT_TRUE(PDT.verify());
  }
 }
+
+// These are some odd flowgraphs, usually generated from csmith cases,
+// which are difficult on post dom trees.
+TEST(DominatorTreeBatchUpdates, InfiniteLoop) {
+  std::vector<CFGBuilder::Arc> Arcs = {
+      {"1", "2"},
+      {"2", "3"},
+      {"3", "6"}, {"3", "5"},
+      {"4", "5"},
+      {"5", "2"},
+      {"6", "3"}, {"6", "4"}};
+
+  // SplitBlock on 3 -> 5
+  std::vector<CFGBuilder::Update> Updates = {
+      {CFGInsert, {"N", "5"}},  {CFGInsert, {"3", "N"}}, {CFGDelete, {"3", "5"}}};
+
+  CFGHolder Holder;
+  CFGBuilder B(Holder.F, Arcs, Updates);
+  DominatorTree DT(*Holder.F);
+  EXPECT_TRUE(DT.verify());
+  PostDomTree PDT(*Holder.F);
+  EXPECT_TRUE(PDT.verify());
+
+  while (B.applyUpdate())
+    ;
+
+  auto DomUpdates = ToDomUpdates(B, Updates);
+  DT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(DT.verify());
+  PDT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(PDT.verify());
+}
+
+TEST(DominatorTreeBatchUpdates, DeadBlocks) {
+  std::vector<CFGBuilder::Arc> Arcs = {
+      {"1", "2"},
+      {"2", "3"},
+      {"3", "4"}, {"3", "7"},
+      {"4", "4"},
+      {"5", "6"}, {"5", "7"},
+      {"6", "7"},
+      {"7", "2"}, {"7", "8"}};
+
+  // Remove dead 5 and 7,
+  // plus SplitBlock on 7 -> 8
+  std::vector<CFGBuilder::Update> Updates = {
+      {CFGDelete, {"6", "7"}},  {CFGDelete, {"5", "7"}}, {CFGDelete, {"5", "6"}},
+      {CFGInsert, {"N", "8"}},  {CFGInsert, {"7", "N"}}, {CFGDelete, {"7", "8"}}};
+
+  CFGHolder Holder;
+  CFGBuilder B(Holder.F, Arcs, Updates);
+  DominatorTree DT(*Holder.F);
+  EXPECT_TRUE(DT.verify());
+  PostDomTree PDT(*Holder.F);
+  EXPECT_TRUE(PDT.verify());
+
+  while (B.applyUpdate())
+    ;
+
+  auto DomUpdates = ToDomUpdates(B, Updates);
+  DT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(DT.verify());
+  PDT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(PDT.verify());
+}
+
+TEST(DominatorTreeBatchUpdates, InfiniteLoop2) {
+  std::vector<CFGBuilder::Arc> Arcs = {
+      {"1", "2"},
+      {"2", "6"}, {"2", "3"},
+      {"3", "4"},
+      {"4", "5"}, {"4", "6"},
+      {"5", "4"},
+      {"6", "2"}};
+
+  // SplitBlock on 4 -> 6
+  std::vector<CFGBuilder::Update> Updates = {
+      {CFGInsert, {"N", "6"}},  {CFGInsert, {"4", "N"}}, {CFGDelete, {"4", "6"}}};
+
+  CFGHolder Holder;
+  CFGBuilder B(Holder.F, Arcs, Updates);
+  DominatorTree DT(*Holder.F);
+  EXPECT_TRUE(DT.verify());
+  PostDomTree PDT(*Holder.F);
+  EXPECT_TRUE(PDT.verify());
+
+  while (B.applyUpdate())
+    ;
+
+  auto DomUpdates = ToDomUpdates(B, Updates);
+  DT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(DT.verify());
+  PDT.applyUpdates(DomUpdates);
+  EXPECT_TRUE(PDT.verify());
+}
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@ -925,3 +925,28 @@ TEST(DominatorTree, InsertDeleteExhaustive) {
    }
  }
 }
+
+TEST(DominatorTree, InsertIntoIrreducible) {
+  std::vector<CFGBuilder::Arc> Arcs = {
+      {"0", "1"},
+      {"1", "27"}, {"1", "7"},
+      {"10", "18"},
+      {"13", "10"},
+      {"18", "13"}, {"18", "23"},
+      {"23", "13"}, {"23", "24"},
+      {"24", "1"}, {"24", "18"},
+      {"27", "24"}};
+
+  CFGHolder Holder;
+  CFGBuilder B(Holder.F, Arcs, {{Insert, {"7", "23"}}});
+  DominatorTree DT(*Holder.F);
+  EXPECT_TRUE(DT.verify());
+
+  B.applyUpdate();
+  BasicBlock *From = B.getOrAddBlock("7");
+  BasicBlock *To = B.getOrAddBlock("23");
+  DT.insertEdge(From, To);
+
+  EXPECT_TRUE(DT.verify());
+}
+
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@ -33,6 +33,7 @@ do_asserts="no"
 do_compare="yes"
 do_rt="yes"
 do_libs="yes"
+do_libcxxabi="yes"
 do_libunwind="yes"
 do_test_suite="yes"
 do_openmp="yes"
@ -62,6 +63,7 @@ function usage() {
    echo "                      For example -svn-path trunk or -svn-path branches/release_37"
    echo " -no-rt               Disable check-out & build Compiler-RT"
    echo " -no-libs             Disable check-out & build libcxx/libcxxabi/libunwind"
+    echo " -no-libcxxabi        Disable check-out & build libcxxabi"
    echo " -no-libunwind        Disable check-out & build libunwind"
    echo " -no-test-suite       Disable check-out & build test-suite"
    echo " -no-openmp           Disable check-out & build libomp"
@ -135,6 +137,9 @@ while [ $# -gt 0 ]; do
        -no-libs )
            do_libs="no"
            ;;
+        -no-libcxxabi )
+            do_libcxxabi="no"
+            ;;
        -no-libunwind )
            do_libunwind="no"
            ;;
@ -206,7 +211,10 @@ if [ $do_rt = "yes" ]; then
  projects="$projects compiler-rt"
 fi
 if [ $do_libs = "yes" ]; then
-  projects="$projects libcxx libcxxabi"
+  projects="$projects libcxx"
+  if [ $do_libcxxabi = "yes" ]; then
+    projects="$projects libcxxabi"
+  fi
  if [ $do_libunwind = "yes" ]; then
    projects="$projects libunwind"
  fi