Vendor import of llvm release_60 branch r323948:

https://llvm.org/svn/llvm-project/llvm/branches/release_60@323948
2018-02-01 21:07:55 +00:00 · 2018-02-01 21:07:55 +00:00 · 4a6a1ccbec
commit 4a6a1ccbec
parent a096e0bdf6
46 changed files with 754 additions and 313 deletions
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@ -15,7 +15,7 @@ Introduction
 ============

 This document contains the release notes for the LLVM Compiler Infrastructure,
-release 5.0.0.  Here we describe the status of LLVM, including major improvements
+release 6.0.0.  Here we describe the status of LLVM, including major improvements
 from the previous release, improvements in various subprojects of LLVM, and
 some of the current users of the code.  All LLVM releases may be downloaded
 from the `LLVM releases web site <http://llvm.org/releases/>`_.
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@ -508,7 +508,8 @@ class Value;
  /// -> LHS = %a, RHS = i32 4, *CastOp = Instruction::SExt
  ///
  SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
-                                         Instruction::CastOps *CastOp = nullptr);
+                                         Instruction::CastOps *CastOp = nullptr,
+                                         unsigned Depth = 0);
  inline SelectPatternResult
  matchSelectPattern(const Value *V, const Value *&LHS, const Value *&RHS,
                     Instruction::CastOps *CastOp = nullptr) {
--- a/include/llvm/MC/MCFragment.h
+++ b/include/llvm/MC/MCFragment.h
@ -422,14 +422,21 @@ class MCFillFragment : public MCFragment {
  uint8_t Value;

  /// The number of bytes to insert.
-  uint64_t Size;
+  const MCExpr &Size;
+
+  /// Source location of the directive that this fragment was created for.
+  SMLoc Loc;

 public:
-  MCFillFragment(uint8_t Value, uint64_t Size, MCSection *Sec = nullptr)
-      : MCFragment(FT_Fill, false, 0, Sec), Value(Value), Size(Size) {}
+  MCFillFragment(uint8_t Value, const MCExpr &Size, SMLoc Loc,
+                 MCSection *Sec = nullptr)
+      : MCFragment(FT_Fill, false, 0, Sec), Value(Value), Size(Size), Loc(Loc) {
+  }

  uint8_t getValue() const { return Value; }
-  uint64_t getSize() const { return Size; }
+  const MCExpr &getSize() const { return Size; }
+
+  SMLoc getLoc() const { return Loc; }

  static bool classof(const MCFragment *F) {
    return F->getKind() == MCFragment::FT_Fill;
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@ -161,7 +161,6 @@ public:
  bool EmitRelocDirective(const MCExpr &Offset, StringRef Name,
                          const MCExpr *Expr, SMLoc Loc) override;
  using MCStreamer::emitFill;
-  void emitFill(uint64_t NumBytes, uint8_t FillValue) override;
  void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
                SMLoc Loc = SMLoc()) override;
  void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@ -662,7 +662,7 @@ public:

  /// \brief Emit NumBytes bytes worth of the value specified by FillValue.
  /// This implements directives such as '.space'.
-  virtual void emitFill(uint64_t NumBytes, uint8_t FillValue);
+  void emitFill(uint64_t NumBytes, uint8_t FillValue);

  /// \brief Emit \p Size bytes worth of the value specified by \p FillValue.
  ///
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@ -4165,17 +4165,18 @@ static SelectPatternResult matchClamp(CmpInst::Predicate Pred,
 ///   a < c ? min(a,b) : min(b,c) ==> min(min(a,b),min(b,c))
 static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
                                               Value *CmpLHS, Value *CmpRHS,
-                                               Value *TrueVal, Value *FalseVal) {
+                                               Value *TVal, Value *FVal,
+                                               unsigned Depth) {
  // TODO: Allow FP min/max with nnan/nsz.
  assert(CmpInst::isIntPredicate(Pred) && "Expected integer comparison");

  Value *A, *B;
-  SelectPatternResult L = matchSelectPattern(TrueVal, A, B);
+  SelectPatternResult L = matchSelectPattern(TVal, A, B, nullptr, Depth + 1);
  if (!SelectPatternResult::isMinOrMax(L.Flavor))
    return {SPF_UNKNOWN, SPNB_NA, false};

  Value *C, *D;
-  SelectPatternResult R = matchSelectPattern(FalseVal, C, D);
+  SelectPatternResult R = matchSelectPattern(FVal, C, D, nullptr, Depth + 1);
  if (L.Flavor != R.Flavor)
    return {SPF_UNKNOWN, SPNB_NA, false};

@ -4214,7 +4215,7 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
      break;
    return {SPF_UNKNOWN, SPNB_NA, false};
  default:
-    llvm_unreachable("Bad flavor while matching min/max");
+    return {SPF_UNKNOWN, SPNB_NA, false};
  }

  // a pred c ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b))
@ -4240,7 +4241,8 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
 static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
                                       Value *CmpLHS, Value *CmpRHS,
                                       Value *TrueVal, Value *FalseVal,
-                                       Value *&LHS, Value *&RHS) {
+                                       Value *&LHS, Value *&RHS,
+                                       unsigned Depth) {
  // Assume success. If there's no match, callers should not use these anyway.
  LHS = TrueVal;
  RHS = FalseVal;
@ -4249,7 +4251,7 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
  if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN)
    return SPR;

-  SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal);
+  SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, Depth);
  if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN)
    return SPR;
  
@ -4313,7 +4315,8 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                              FastMathFlags FMF,
                                              Value *CmpLHS, Value *CmpRHS,
                                              Value *TrueVal, Value *FalseVal,
-                                              Value *&LHS, Value *&RHS) {
+                                              Value *&LHS, Value *&RHS,
+                                              unsigned Depth) {
  LHS = CmpLHS;
  RHS = CmpRHS;

@ -4429,7 +4432,7 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
  }

  if (CmpInst::isIntPredicate(Pred))
-    return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS);
+    return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS, Depth);

  // According to (IEEE 754-2008 5.3.1), minNum(0.0, -0.0) and similar
  // may return either -0.0 or 0.0, so fcmp/select pair has stricter
@ -4550,7 +4553,11 @@ static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
 }

 SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
-                                             Instruction::CastOps *CastOp) {
+                                             Instruction::CastOps *CastOp,
+                                             unsigned Depth) {
+  if (Depth >= MaxDepth)
+    return {SPF_UNKNOWN, SPNB_NA, false};
+
  SelectInst *SI = dyn_cast<SelectInst>(V);
  if (!SI) return {SPF_UNKNOWN, SPNB_NA, false};

@ -4579,7 +4586,7 @@ SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
        FMF.setNoSignedZeros();
      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
                                  cast<CastInst>(TrueVal)->getOperand(0), C,
-                                  LHS, RHS);
+                                  LHS, RHS, Depth);
    }
    if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) {
      // If this is a potential fmin/fmax with a cast to integer, then ignore
@ -4588,11 +4595,11 @@ SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
        FMF.setNoSignedZeros();
      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
                                  C, cast<CastInst>(FalseVal)->getOperand(0),
-                                  LHS, RHS);
+                                  LHS, RHS, Depth);
    }
  }
  return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal,
-                              LHS, RHS);
+                              LHS, RHS, Depth);
 }

 /// Return true if "icmp Pred LHS RHS" is always true.
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@ -812,6 +812,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
  auto TII = MF->getTarget().getIntrinsicInfo();
  const Function *F = CI.getCalledFunction();

+  // FIXME: support Windows dllimport function calls.
+  if (F && F->hasDLLImportStorageClass())
+    return false;
+
  if (CI.isInlineAsm())
    return translateInlineAsm(CI, MIRBuilder);

--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@ -661,7 +661,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
  }
  case TargetOpcode::G_FCONSTANT: {
    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
-    MIRBuilder.buildFConstant(DstExt, *MI.getOperand(1).getFPImm());
+    const ConstantFP *CFP = MI.getOperand(1).getFPImm();
+    APFloat Val = CFP->getValueAPF();
+    LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
+    auto LLT2Sem = [](LLT Ty) {
+      switch (Ty.getSizeInBits()) {
+      case 32:
+        return &APFloat::IEEEsingle();
+        break;
+      case 64:
+        return &APFloat::IEEEdouble();
+        break;
+      default:
+        llvm_unreachable("Unhandled fp widen type");
+      }
+    };
+    bool LosesInfo;
+    Val.convert(*LLT2Sem(WideTy), APFloat::rmTowardZero, &LosesInfo);
+    MIRBuilder.buildFConstant(DstExt, *ConstantFP::get(Ctx, Val));
    MIRBuilder.buildFPTrunc(MI.getOperand(0).getReg(), DstExt);
    MI.eraseFromParent();
    return Legalized;
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@ -193,9 +193,10 @@ namespace {
    void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);

    void usePhysReg(MachineOperand &MO);
-    void definePhysReg(MachineInstr &MI, MCPhysReg PhysReg, RegState NewState);
+    void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg,
+                       RegState NewState);
    unsigned calcSpillCost(MCPhysReg PhysReg) const;
-    void assignVirtToPhysReg(LiveReg&, MCPhysReg PhysReg);
+    void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);

    LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
      return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
@ -434,8 +435,8 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
 /// Mark PhysReg as reserved or free after spilling any virtregs. This is very
 /// similar to defineVirtReg except the physreg is reserved instead of
 /// allocated.
-void RegAllocFast::definePhysReg(MachineInstr &MI, MCPhysReg PhysReg,
-                                 RegState NewState) {
+void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
+                                 MCPhysReg PhysReg, RegState NewState) {
  markRegUsedInInstr(PhysReg);
  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
  case regDisabled:
@ -857,7 +858,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
  // Add live-in registers as live.
  for (const MachineBasicBlock::RegisterMaskPair LI : MBB.liveins())
    if (MRI->isAllocatable(LI.PhysReg))
-      definePhysReg(*MII, LI.PhysReg, regReserved);
+      definePhysReg(MII, LI.PhysReg, regReserved);

  VirtDead.clear();
  Coalesced.clear();
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@ -1380,8 +1380,10 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
  FastISelFailed = false;
  // Initialize the Fast-ISel state, if needed.
  FastISel *FastIS = nullptr;
-  if (TM.Options.EnableFastISel)
+  if (TM.Options.EnableFastISel) {
+    DEBUG(dbgs() << "Enabling fast-isel\n");
    FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
+  }

  setupSwiftErrorVals(Fn, TLI, FuncInfo);

--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@ -717,6 +717,8 @@ bool TargetPassConfig::addCoreISelPasses() {
  if (EnableGlobalISel == cl::BOU_TRUE ||
      (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled() &&
       EnableFastISelOption != cl::BOU_TRUE)) {
+    TM->setFastISel(false);
+
    if (addIRTranslator())
      return true;

--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@ -192,9 +192,6 @@ public:

  void EmitGPRel32Value(const MCExpr *Value) override;

-
-  void emitFill(uint64_t NumBytes, uint8_t FillValue) override;
-
  void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
                SMLoc Loc = SMLoc()) override;

@ -965,17 +962,12 @@ void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
  EmitEOL();
 }

-/// emitFill - Emit NumBytes bytes worth of the value specified by
-/// FillValue.  This implements directives such as '.space'.
-void MCAsmStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
-  if (NumBytes == 0) return;
-
-  const MCExpr *E = MCConstantExpr::create(NumBytes, getContext());
-  emitFill(*E, FillValue);
-}
-
 void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
                             SMLoc Loc) {
+  int64_t IntNumBytes;
+  if (NumBytes.evaluateAsAbsolute(IntNumBytes) && IntNumBytes == 0)
+    return;
+
  if (const char *ZeroDirective = MAI->getZeroDirective()) {
    // FIXME: Emit location directives
    OS << ZeroDirective;
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@ -281,8 +281,18 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
    return cast<MCRelaxableFragment>(F).getContents().size();
  case MCFragment::FT_CompactEncodedInst:
    return cast<MCCompactEncodedInstFragment>(F).getContents().size();
-  case MCFragment::FT_Fill:
-    return cast<MCFillFragment>(F).getSize();
+  case MCFragment::FT_Fill: {
+    auto &FF = cast<MCFillFragment>(F);
+    int64_t Size = 0;
+    if (!FF.getSize().evaluateAsAbsolute(Size, Layout))
+      getContext().reportError(FF.getLoc(),
+                               "expected assembly-time absolute expression");
+    if (Size < 0) {
+      getContext().reportError(FF.getLoc(), "invalid number of bytes");
+      return 0;
+    }
+    return Size;
+  }

  case MCFragment::FT_LEB:
    return cast<MCLEBFragment>(F).getContents().size();
@ -540,7 +550,7 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
    for (unsigned I = 1; I < MaxChunkSize; ++I)
      Data[I] = Data[0];

-    uint64_t Size = FF.getSize();
+    uint64_t Size = FragmentSize;
    for (unsigned ChunkSize = MaxChunkSize; ChunkSize; ChunkSize /= 2) {
      StringRef Ref(Data, ChunkSize);
      for (uint64_t I = 0, E = Size / ChunkSize; I != E; ++I)
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@ -411,29 +411,19 @@ void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,

 void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
                                   uint64_t Size, unsigned ByteAlignment) {
-  getAssembler().registerSection(*Section);
-
-  // The symbol may not be present, which only creates the section.
-  if (!Symbol)
-    return;
-
  // On darwin all virtual sections have zerofill type.
  assert(Section->isVirtualSection() && "Section does not have zerofill type!");

-  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  PushSection();
+  SwitchSection(Section);

-  getAssembler().registerSymbol(*Symbol);
-
-  // Emit an align fragment if necessary.
-  if (ByteAlignment != 1)
-    new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, Section);
-
-  MCFragment *F = new MCFillFragment(0, Size, Section);
-  Symbol->setFragment(F);
-
-  // Update the maximum alignment on the zero fill section if necessary.
-  if (ByteAlignment > Section->getAlignment())
-    Section->setAlignment(ByteAlignment);
+  // The symbol may not be present, which only creates the section.
+  if (Symbol) {
+    EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+    EmitLabel(Symbol);
+    EmitZeros(Size);
+  }
+  PopSection();
 }

 // This should always be called with the thread local bss section.  Like the
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@ -577,28 +577,13 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
  return false;
 }

-void MCObjectStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
-  assert(getCurrentSectionOnly() && "need a section");
-  insert(new MCFillFragment(FillValue, NumBytes));
-}
-
 void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
                                SMLoc Loc) {
  MCDataFragment *DF = getOrCreateDataFragment();
  flushPendingLabels(DF, DF->getContents().size());

-  int64_t IntNumBytes;
-  if (!NumBytes.evaluateAsAbsolute(IntNumBytes, getAssembler())) {
-    getContext().reportError(Loc, "expected absolute expression");
-    return;
-  }
-
-  if (IntNumBytes <= 0) {
-    getContext().reportError(Loc, "invalid number of bytes");
-    return;
-  }
-
-  emitFill(IntNumBytes, FillValue);
+  assert(getCurrentSectionOnly() && "need a section");
+  insert(new MCFillFragment(FillValue, NumBytes, Loc));
 }

 void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@ -184,8 +184,7 @@ void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
 /// Emit NumBytes bytes worth of the value specified by FillValue.
 /// This implements directives such as '.space'.
 void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
-  for (uint64_t i = 0, e = NumBytes; i != e; ++i)
-    EmitIntValue(FillValue, 1);
+  emitFill(*MCConstantExpr::create(NumBytes, getContext()), FillValue);
 }

 void MCStreamer::emitFill(uint64_t NumValues, int64_t Size, int64_t Expr) {
--- a/lib/MC/MCWinCOFFStreamer.cpp
+++ b/lib/MC/MCWinCOFFStreamer.cpp
@ -257,20 +257,13 @@ void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
  auto *Symbol = cast<MCSymbolCOFF>(S);

  MCSection *Section = getContext().getObjectFileInfo()->getBSSSection();
-  getAssembler().registerSection(*Section);
-  if (Section->getAlignment() < ByteAlignment)
-    Section->setAlignment(ByteAlignment);
-
-  getAssembler().registerSymbol(*Symbol);
+  PushSection();
+  SwitchSection(Section);
+  EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+  EmitLabel(Symbol);
  Symbol->setExternal(false);
-
-  if (ByteAlignment != 1)
-    new MCAlignFragment(ByteAlignment, /*Value=*/0, /*ValueSize=*/0,
-                        ByteAlignment, Section);
-
-  MCFillFragment *Fragment = new MCFillFragment(
-      /*Value=*/0, Size, Section);
-  Symbol->setFragment(Fragment);
+  EmitZeros(Size);
+  PopSection();
 }

 void MCWinCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@ -528,7 +528,10 @@ static void addData(SmallVectorImpl<char> &DataBytes,
                                             Align->getMaxBytesToEmit());
      DataBytes.resize(Size, Value);
    } else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) {
-      DataBytes.insert(DataBytes.end(), Fill->getSize(), Fill->getValue());
+      int64_t Size;
+      if (!Fill->getSize().evaluateAsAbsolute(Size))
+        llvm_unreachable("The fill should be an assembler constant");
+      DataBytes.insert(DataBytes.end(), Size, Fill->getValue());
    } else {
      const auto &DataFrag = cast<MCDataFragment>(Frag);
      const SmallVectorImpl<char> &Contents = DataFrag.getContents();
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@ -476,26 +476,27 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
    // ADRP + LDRX
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
            ADRPReg)
-      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);

    ResultReg = createResultReg(&AArch64::GPR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
            ResultReg)
-      .addReg(ADRPReg)
-      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
-                        AArch64II::MO_NC);
+        .addReg(ADRPReg)
+        .addGlobalAddress(GV, 0,
+                          AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags);
  } else {
    // ADRP + ADDX
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
            ADRPReg)
-      .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags);

    ResultReg = createResultReg(&AArch64::GPR64spRegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
            ResultReg)
-      .addReg(ADRPReg)
-      .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
-      .addImm(0);
+        .addReg(ADRPReg)
+        .addGlobalAddress(GV, 0,
+                          AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags)
+        .addImm(0);
  }
  return ResultReg;
 }
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@ -929,6 +929,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
      return false;
    }

+    // FIXME: PR36018: Volatile loads in some cases are incorrectly selected by
+    // folding with an extend. Until we have a G_SEXTLOAD solution bail out if
+    // we hit one.
+    if (Opcode == TargetOpcode::G_LOAD && MemOp.isVolatile())
+      return false;
+
    const unsigned PtrReg = I.getOperand(1).getReg();
 #ifndef NDEBUG
    const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@ -189,15 +189,18 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

+  unsigned Flags = GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
+                                                  : AArch64II::MO_NO_FLAG;
+
  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
-    return AArch64II::MO_GOT;
+    return AArch64II::MO_GOT | Flags;

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  if (useSmallAddressing() && GV->hasExternalWeakLinkage())
-    return AArch64II::MO_GOT;
+    return AArch64II::MO_GOT | Flags;

-  return AArch64II::MO_NO_FLAG;
+  return Flags;
 }

 unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@ -210,65 +210,73 @@ void SIInsertSkips::kill(MachineInstr &MI) {
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
-      Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
-      Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
-      Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
-      Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
-      Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
-      Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
-      Opcode = AMDGPU::V_CMPX_O_F32_e32;
+      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
-      Opcode = AMDGPU::V_CMPX_U_F32_e32;
+      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
-      Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
-      Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
-      Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
-      Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
-      Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
-      Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

-    // TODO: Allow this:
-    if (!MI.getOperand(0).isReg() ||
-        !TRI->isVGPR(MBB.getParent()->getRegInfo(),
-                     MI.getOperand(0).getReg()))
-      llvm_unreachable("SI_KILL operand should be a VGPR");
+    assert(MI.getOperand(0).isReg());

-    BuildMI(MBB, &MI, DL, TII->get(Opcode))
-        .add(MI.getOperand(1))
-        .add(MI.getOperand(0));
+    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
+                    MI.getOperand(0).getReg())) {
+      Opcode = AMDGPU::getVOPe32(Opcode);
+      BuildMI(MBB, &MI, DL, TII->get(Opcode))
+          .add(MI.getOperand(1))
+          .add(MI.getOperand(0));
+    } else {
+      BuildMI(MBB, &MI, DL, TII->get(Opcode))
+          .addReg(AMDGPU::VCC, RegState::Define)
+          .addImm(0)  // src0 modifiers
+          .add(MI.getOperand(1))
+          .addImm(0)  // src1 modifiers
+          .add(MI.getOperand(0))
+          .addImm(0);  // omod
+    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@ -39,11 +39,11 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                    const MCSubtargetInfo &STI) {
  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
  uint64_t TSFlags = Desc.TSFlags;
+  unsigned Flags = MI->getFlags();

-  if (TSFlags & X86II::LOCK)
+  if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
    OS << "\tlock\t";

-  unsigned Flags = MI->getFlags();
  if (Flags & X86::IP_HAS_REPEAT_NE)
    OS << "\trepne\t";
  else if (Flags & X86::IP_HAS_REPEAT)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -31776,9 +31776,10 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
        // Check all uses of the condition operand to check whether it will be
        // consumed by non-BLEND instructions. Those may require that all bits
        // are set properly.
-        for (SDNode *U : Cond->uses()) {
+        for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+             UI != UE; ++UI) {
          // TODO: Add other opcodes eventually lowered into BLEND.
-          if (U->getOpcode() != ISD::VSELECT)
+          if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
            return SDValue();
        }

--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@ -142,10 +142,11 @@ recordConditions(const CallSite &CS, BasicBlock *Pred,
  recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
  BasicBlock *From = Pred;
  BasicBlock *To = Pred;
-  SmallPtrSet<BasicBlock *, 4> Visited = {From};
+  SmallPtrSet<BasicBlock *, 4> Visited;
  while (!Visited.count(From->getSinglePredecessor()) &&
         (From = From->getSinglePredecessor())) {
    recordCondition(CS, From, To, Conditions);
+    Visited.insert(From);
    To = From;
  }
 }
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@ -14,6 +14,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
@ -176,8 +177,9 @@ class StructurizeCFG : public RegionPass {
  Region *ParentRegion;

  DominatorTree *DT;
+  LoopInfo *LI;

-  std::deque<RegionNode *> Order;
+  SmallVector<RegionNode *, 8> Order;
  BBSet Visited;

  BBPhiMap DeletedPhis;
@ -202,7 +204,7 @@ class StructurizeCFG : public RegionPass {

  void gatherPredicates(RegionNode *N);

-  void analyzeNode(RegionNode *N);
+  void collectInfos();

  void insertConditions(bool Loops);

@ -256,6 +258,7 @@ public:
      AU.addRequired<DivergenceAnalysis>();
    AU.addRequiredID(LowerSwitchID);
    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<DominatorTreeWrapperPass>();
    RegionPass::getAnalysisUsage(AU);
@ -289,17 +292,55 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {

 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  assert(Visited.empty());
-  assert(Predicates.empty());
-  assert(Loops.empty());
-  assert(LoopPreds.empty());
+  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+  SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;

-  // This must be RPO order for the back edge detection to work
-  for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
-    // FIXME: Is there a better order to use for structurization?
-    Order.push_back(RN);
-    analyzeNode(RN);
+  // The reverse post-order traversal of the list gives us an ordering close
+  // to what we want.  The only problem with it is that sometimes backedges
+  // for outer loops will be visited before backedges for inner loops.
+  for (RegionNode *RN : RPOT) {
+    BasicBlock *BB = RN->getEntry();
+    Loop *Loop = LI->getLoopFor(BB);
+    ++LoopBlocks[Loop];
  }
+
+  unsigned CurrentLoopDepth = 0;
+  Loop *CurrentLoop = nullptr;
+  for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+    BasicBlock *BB = (*I)->getEntry();
+    unsigned LoopDepth = LI->getLoopDepth(BB);
+
+    if (is_contained(Order, *I))
+      continue;
+
+    if (LoopDepth < CurrentLoopDepth) {
+      // Make sure we have visited all blocks in this loop before moving back to
+      // the outer loop.
+
+      auto LoopI = I;
+      while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
+        LoopI++;
+        BasicBlock *LoopBB = (*LoopI)->getEntry();
+        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+          --BlockCount;
+          Order.push_back(*LoopI);
+        }
+      }
+    }
+
+    CurrentLoop = LI->getLoopFor(BB);
+    if (CurrentLoop)
+      LoopBlocks[CurrentLoop]--;
+
+    CurrentLoopDepth = LoopDepth;
+    Order.push_back(*I);
+  }
+
+  // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal
+  // rather than re-working the whole pass to operate on the list in order,
+  // we just reverse the list and continue to operate on it in reverse.
+  std::reverse(Order.begin(), Order.end());
 }

 /// \brief Determine the end of the loops
@ -425,19 +466,32 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 }

 /// \brief Collect various loop and predicate infos
-void StructurizeCFG::analyzeNode(RegionNode *RN) {
-  DEBUG(dbgs() << "Visiting: "
-        << (RN->isSubRegion() ? "SubRegion with entry: " : "")
-        << RN->getEntry()->getName() << '\n');
+void StructurizeCFG::collectInfos() {
+  // Reset predicate
+  Predicates.clear();

-  // Analyze all the conditions leading to a node
-  gatherPredicates(RN);
+  // and loop infos
+  Loops.clear();
+  LoopPreds.clear();

-  // Remember that we've seen this node
-  Visited.insert(RN->getEntry());
+  // Reset the visited nodes
+  Visited.clear();

-  // Find the last back edges
-  analyzeLoops(RN);
+  for (RegionNode *RN : reverse(Order)) {
+    DEBUG(dbgs() << "Visiting: "
+                 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+                 << RN->getEntry()->getName() << " Loop Depth: "
+                 << LI->getLoopDepth(RN->getEntry()) << "\n");
+
+    // Analyze all the conditions leading to a node
+    gatherPredicates(RN);
+
+    // Remember that we've seen this node
+    Visited.insert(RN->getEntry());
+
+    // Find the last back edges
+    analyzeLoops(RN);
+  }
 }

 /// \brief Insert the missing branch conditions
@ -610,7 +664,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
 BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
  LLVMContext &Context = Func->getContext();
  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
-                       Order.front()->getEntry();
+                       Order.back()->getEntry();
  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
                                        Func, Insert);
  DT->addNewBlock(Flow, Dominator);
@ -690,8 +744,7 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
 /// Take one node from the order vector and wire it up
 void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                              BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.front();
-  Order.pop_front();
+  RegionNode *Node = Order.pop_back_val();
  Visited.insert(Node->getEntry());

  if (isPredictableTrue(Node)) {
@ -715,7 +768,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

    PrevNode = Node;
    while (!Order.empty() && !Visited.count(LoopEnd) &&
-           dominatesPredicates(Entry, Order.front())) {
+           dominatesPredicates(Entry, Order.back())) {
      handleLoops(false, LoopEnd);
    }

@ -726,7 +779,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,

 void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                 BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.front();
+  RegionNode *Node = Order.back();
  BasicBlock *LoopStart = Node->getEntry();

  if (!Loops.count(LoopStart)) {
@ -871,9 +924,10 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
  ParentRegion = R;

  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  orderNodes();
-
+  collectInfos();
  createFlow();
  insertConditions(false);
  insertConditions(true);
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@ -25,6 +25,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
@ -536,13 +537,23 @@ Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
  return None;
 }

+static Metadata *cloneOrBuildODR(const MDNode &N) {
+  auto *CT = dyn_cast<DICompositeType>(&N);
+  // If ODR type uniquing is enabled, we would have uniqued composite types
+  // with identifiers during bitcode reading, so we can just use CT.
+  if (CT && CT->getContext().isODRUniquingDebugTypes() &&
+      CT->getIdentifier() != "")
+    return const_cast<DICompositeType *>(CT);
+  return MDNode::replaceWithDistinct(N.clone());
+}
+
 MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
  assert(N.isDistinct() && "Expected a distinct node");
  assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
-  DistinctWorklist.push_back(cast<MDNode>(
-      (M.Flags & RF_MoveDistinctMDs)
-          ? M.mapToSelf(&N)
-          : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone()))));
+  DistinctWorklist.push_back(
+      cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
+                       ? M.mapToSelf(&N)
+                       : M.mapToMetadata(&N, cloneOrBuildODR(N))));
  return DistinctWorklist.back();
 }

--- a/test/Analysis/ValueTracking/select-pattern.ll
+++ b/test/Analysis/ValueTracking/select-pattern.ll
@ -0,0 +1,46 @@
+; RUN: opt -simplifycfg < %s -S | FileCheck %s
+
+; The dead code would cause a select that had itself
+; as an operand to be analyzed. This would then cause
+; infinite recursion and eventual crash.
+
+define void @PR36045(i1 %t, i32* %b) {
+; CHECK-LABEL: @PR36045(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %t, label %if, label %end
+
+if:
+  br i1 %t, label %unreach, label %pre
+
+unreach:
+  unreachable
+
+pre:
+  %p = phi i32 [ 70, %if ], [ %sel, %for ]
+  br label %for
+
+for:
+  %cmp = icmp sgt i32 %p, 8
+  %add = add i32 %p, 2
+  %sel = select i1 %cmp, i32 %p, i32 %add
+  %cmp21 = icmp ult i32 %sel, 21
+  br i1 %cmp21, label %pre, label %for.end
+
+for.end:
+  br i1 %t, label %unreach2, label %then12
+
+then12:
+  store i32 0, i32* %b
+  br label %unreach2
+
+unreach2:
+  %spec = phi i32 [ %sel, %for.end ], [ 42, %then12 ]
+  unreachable
+
+end:
+  ret void
+}
+
--- a/test/CodeGen/AArch64/GlobalISel/fallback-nofastisel.ll
+++ b/test/CodeGen/AArch64/GlobalISel/fallback-nofastisel.ll
@ -0,0 +1,11 @@
+; RUN: llc -mtriple=aarch64_be-- %s -o /dev/null -debug-only=isel -O0 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; This test uses big endian in order to force an abort since it's not currently supported for GISel.
+; The purpose is to check that we don't fall back to FastISel. Checking the pass structure is insufficient
+; because the FastISel is set up in the SelectionDAGISel, so it doesn't appear on the pass structure.
+
+; CHECK-NOT: Enabling fast-ise
+define void @empty() {
+  ret void
+}
--- a/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-volatile-load-pr36018.ll
@ -0,0 +1,14 @@
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -o - %s | FileCheck %s
+
+@g = global i16 0, align 2
+declare void @bar(i32)
+
+; Check that only one load is generated. We fall back to
+define hidden void @foo() {
+; CHECK-NOT: ldrh
+; CHECK: ldrsh
+  %1 = load volatile i16, i16* @g, align 2
+  %2 = sext i16 %1 to i32
+  call void @bar(i32 %2)
+  ret void
+}
--- a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
@ -75,7 +75,7 @@ body: |
    ; CHECK: %w0 = COPY [[C]](s32)
    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
    ; CHECK: %x0 = COPY [[C1]](s64)
-    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT half 0xH0000
+    ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[C2]](s32)
    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
    ; CHECK: %w0 = COPY [[ANYEXT]](s32)
--- a/test/CodeGen/AArch64/dllimport.ll
+++ b/test/CodeGen/AArch64/dllimport.ll
@ -1,4 +1,6 @@
-; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s
+; RUN: llc -mtriple aarch64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s -check-prefixes=CHECK,DAG-ISEL
+; RUN: llc -mtriple aarch64-unknown-windows-msvc -fast-isel -filetype asm -o - %s | FileCheck %s -check-prefixes=CHECK,FAST-ISEL
+; RUN: llc -mtriple aarch64-unknown-windows-msvc -O0 -filetype asm -o - %s | FileCheck %s -check-prefixes=CHECK,GLOBAL-ISEL,GLOBAL-ISEL-FALLBACK

@var = external dllimport global i32
@ext = external global i32
@ -23,7 +25,11 @@ define i32 @get_ext() {

 ; CHECK-LABEL: get_ext
 ; CHECK: adrp x8, ext
-; CHECK: ldr w0, [x8, ext]
+; DAG-ISEL: ldr w0, [x8, ext]
+; FAST-ISEL: add x8, x8, ext
+; FAST-ISEL: ldr w0, [x8]
+; GLOBAL-ISEL-FALLBACK: add x8, x8, ext
+; GLOBAL-ISEL-FALLBACK: ldr w0, [x8]
 ; CHECK: ret

 define i32* @get_var_pointer() {
@ -31,8 +37,8 @@ define i32* @get_var_pointer() {
 }

 ; CHECK-LABEL: get_var_pointer
-; CHECK: adrp x0, __imp_var
-; CHECK: ldr x0, [x0, __imp_var]
+; CHECK: adrp [[REG1:x[0-9]+]], __imp_var
+; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG1]], __imp_var]
 ; CHECK: ret

 define i32 @call_external() {
@ -51,4 +57,6 @@ define i32 @call_internal() {
 }

 ; CHECK-LABEL: call_internal
-; CHECK: b internal
+; DAG-ISEL: b internal
+; FAST-ISEL: b internal
+; GLOBAL-ISEL: bl internal
--- a/test/CodeGen/AArch64/fast-regalloc-empty-bb-with-liveins.mir
+++ b/test/CodeGen/AArch64/fast-regalloc-empty-bb-with-liveins.mir
@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-apple-ios -run-pass regallocfast -o - %s | FileCheck %s
+# This test used to crash the fast register alloc.
+# Basically, when a basic block has liveins, the fast regalloc
+# was deferencing the begin iterator of this block. However,
+# when this block is empty and it will just crashed!
+---
+name:            crashing
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: crashing
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: %x0, %x1
+  ; CHECK: bb.1:
+  ; CHECK:   renamable %w0 = MOVi32imm -1
+  ; CHECK:   RET_ReallyLR implicit killed %w0
+  bb.1:
+    liveins: %x0, %x1
+
+  bb.2:
+    %0:gpr32 = MOVi32imm -1
+    %w0 = COPY %0
+    RET_ReallyLR implicit %w0
+
+...
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@ -234,6 +234,23 @@ define amdgpu_ps void @wqm(float %a) {
  ret void
 }

+; This checks that we use the 64-bit encoding when the operand is a SGPR.
+; SI-LABEL: {{^}}test_sgpr:
+; SI: v_cmpx_ge_f32_e64
+define amdgpu_ps void @test_sgpr(float inreg %a) #0 {
+  %c = fcmp ole float %a, 1.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+; SI-LABEL: {{^}}test_non_inline_imm_sgpr:
+; SI-NOT: v_cmpx_ge_f32_e64
+define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 {
+  %c = fcmp ole float %a, 1.500000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
 declare void @llvm.amdgcn.kill(i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare i1 @llvm.amdgcn.wqm.vote(i1)
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@ -66,10 +66,9 @@ ENDIF:                                            ; preds = %LOOP

 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
 ; OPT: llvm.amdgcn.break
-; OPT: llvm.amdgcn.break
-; OPT: llvm.amdgcn.if.break
-; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.loop
+; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.end.cf

 ; GCN-LABEL: {{^}}multi_if_break_loop:
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@ -124,100 +124,55 @@ bb23:                                             ; preds = %bb10
 ; Earlier version of above, before a run of the structurizer.
 ; IR-LABEL: @nested_loop_conditions(

-; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
-; IR:   br i1 %tmp1235, label %bb14.lr.ph, label %Flow
-
-; IR: bb14.lr.ph:
-; IR: br label %bb14
-
-; IR: Flow3:
-; IR:   call void @llvm.amdgcn.end.cf(i64 %18)
-; IR:   %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
-; IR:   %1 = extractvalue { i1, i64 } %0, 0
-; IR:   %2 = extractvalue { i1, i64 } %0, 1
-; IR:   br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
-
-; IR: bb4.bb13_crit_edge:
-; IR:   br label %Flow4
-
-; IR: Flow4:
-; IR:   %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
-; IR:   call void @llvm.amdgcn.end.cf(i64 %2)
-; IR:   br label %Flow
-
-; IR: bb13:
-; IR:   br label %bb31
-
-; IR: Flow:
-; IR:   %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
-; IR:   %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
-; IR:   %6 = extractvalue { i1, i64 } %5, 0
-; IR:   %7 = extractvalue { i1, i64 } %5, 1
-; IR:   br i1 %6, label %bb13, label %bb31
-
-; IR: bb14:
-; IR:   %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
-; IR:   %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
-; IR:   %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
-; IR:   %tmp15 = icmp eq i32 %tmp1037, 1
-; IR:   %8 = xor i1 %tmp15, true
-; IR:   %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
-; IR:   %10 = extractvalue { i1, i64 } %9, 0
-; IR:   %11 = extractvalue { i1, i64 } %9, 1
-; IR:   br i1 %10, label %bb31.loopexit, label %Flow1
+; IR: Flow7:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
+; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
+; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
+; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8

 ; IR: Flow1:
-; IR:   %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
-; IR:   %13 = extractvalue { i1, i64 } %12, 0
-; IR:   %14 = extractvalue { i1, i64 } %12, 1
-; IR:   br i1 %13, label %bb16, label %Flow2
-
-; IR: bb16:
-; IR:   %tmp17 = bitcast i64 %tmp3 to <2 x i32>
-; IR:   br label %bb18
+; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
+; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
+; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
+; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
+; IR-NEXT: br i1 %18, label %Flow7, label %bb14

 ; IR: Flow2:
-; IR:   %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
-; IR:   %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
-; IR:   %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
-; IR:   %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
-; IR:   %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
-; IR:   call void @llvm.amdgcn.end.cf(i64 %14)
-; IR:   %19 = call i1 @llvm.amdgcn.loop(i64 %18)
-; IR:   br i1 %19, label %Flow3, label %bb14
-
-; IR: bb18:
-; IR:   %tmp19 = load volatile i32, i32 addrspace(1)* undef
-; IR:   %tmp20 = icmp slt i32 %tmp19, 9
-; IR:   br i1 %tmp20, label %bb21, label %bb18
+; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
+; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
+; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
+; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
+; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
+; IR-NEXT: br i1 %25, label %bb21, label %Flow3

 ; IR: bb21:
-; IR:   %tmp22 = extractelement <2 x i32> %tmp17, i64 1
-; IR:   %tmp23 = lshr i32 %tmp22, 16
-; IR:   %tmp24 = select i1 undef, i32 undef, i32 %tmp23
-; IR:   %tmp25 = uitofp i32 %tmp24 to float
-; IR:   %tmp26 = fmul float %tmp25, 0x3EF0001000000000
-; IR:   %tmp27 = fsub float %tmp26, undef
-; IR:   %tmp28 = fcmp olt float %tmp27, 5.000000e-01
-; IR:   %tmp29 = select i1 %tmp28, i64 1, i64 2
-; IR:   %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
-; IR:   %tmp7 = zext i32 %tmp30 to i64
-; IR:   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
-; IR:   %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
-; IR:   %tmp10 = extractelement <4 x i32> %tmp9, i64 0
-; IR:   %tmp11 = load volatile i32, i32 addrspace(1)* undef
-; IR:   %tmp12 = icmp slt i32 %tmp11, 9
-; IR:   %20 = xor i1 %tmp12, true
-; IR:   %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
-; IR:   br label %Flow2
+; IR: %tmp12 = icmp slt i32 %tmp11, 9
+; IR-NEXT: %27 = xor i1 %tmp12, true
+; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
+; IR-NEXT: br label %Flow3

-; IR: bb31.loopexit:
-; IR:   br label %Flow1
+; IR: Flow3:
+; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
+; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
+; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
+; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
+; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
+; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4

 ; IR: bb31:
-; IR:   call void @llvm.amdgcn.end.cf(i64 %7)
-; IR:   store volatile i32 0, i32 addrspace(1)* undef
-; IR:   ret void
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
+; IR-NEXT: ret void


 ; GCN-LABEL: {{^}}nested_loop_conditions:
--- a/test/CodeGen/X86/pr34592.ll
+++ b/test/CodeGen/X86/pr34592.ll
@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O0 | FileCheck %s
+
+define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <16 x i64> %arg3, <16 x i64> %arg4) {
+; CHECK-LABEL: pluto:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-32, %rsp
+; CHECK-NEXT:    subq $352, %rsp # imm = 0x160
+; CHECK-NEXT:    vmovaps 240(%rbp), %ymm8
+; CHECK-NEXT:    vmovaps 208(%rbp), %ymm9
+; CHECK-NEXT:    vmovaps 176(%rbp), %ymm10
+; CHECK-NEXT:    vmovaps 144(%rbp), %ymm11
+; CHECK-NEXT:    vmovaps 112(%rbp), %ymm12
+; CHECK-NEXT:    vmovaps 80(%rbp), %ymm13
+; CHECK-NEXT:    vmovaps 48(%rbp), %ymm14
+; CHECK-NEXT:    vmovaps 16(%rbp), %ymm15
+; CHECK-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,18446744071562067968,18446744071562067968]
+; CHECK-NEXT:    vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm13[2,3],ymm2[4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3,4,5,6,7]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm13 = [18446744071562067968,18446744071562067968,0,0]
+; CHECK-NEXT:    vblendvpd %ymm13, %ymm9, %ymm6, %ymm6
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[3,2,2,1]
+; CHECK-NEXT:    vmovaps %xmm6, %xmm11
+; CHECK-NEXT:    # implicit-def: %ymm13
+; CHECK-NEXT:    vinserti128 $1, %xmm11, %ymm13, %ymm13
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5],ymm9[6,7]
+; CHECK-NEXT:    vmovaps %xmm0, %xmm11
+; CHECK-NEXT:    # implicit-def: %ymm0
+; CHECK-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm0
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm13 = ymm8[2,0,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5],ymm7[6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,0,2,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,1,2]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,1,1,3]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; CHECK-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm9, %ymm0
+; CHECK-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm5, %ymm1
+; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5 # 32-byte Reload
+; CHECK-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm5, %ymm2
+; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6 # 32-byte Reload
+; CHECK-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm6, %ymm3
+; CHECK-NEXT:    vmovaps %ymm15, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm12, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm10, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm14, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+bb:
+  %tmp = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg, <16 x i64> %arg1
+  %tmp5 = select <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg2, <16 x i64> zeroinitializer
+  %tmp6 = select <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <16 x i64> %arg3, <16 x i64> %tmp5
+  %tmp7 = shufflevector <16 x i64> %tmp, <16 x i64> %tmp6, <16 x i32> <i32 11, i32 18, i32 24, i32 9, i32 14, i32 29, i32 29, i32 6, i32 14, i32 28, i32 8, i32 9, i32 22, i32 12, i32 25, i32 6>
+  ret <16 x i64> %tmp7
+}
--- a/test/MC/X86/eval-fill.s
+++ b/test/MC/X86/eval-fill.s
@ -0,0 +1,17 @@
+// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-readobj -s | FileCheck %s
+
+// CHECK:      Name: .text
+// CHECK-NEXT: Type: SHT_PROGBITS
+// CHECK-NEXT: Flags [
+// CHECK-NEXT:   SHF_ALLOC
+// CHECK-NEXT:   SHF_EXECINSTR
+// CHECK-NEXT: ]
+// CHECK-NEXT: Address:
+// CHECK-NEXT: Offset:
+// CHECK-NEXT: Size: 4092
+
+        .globl  foo
+foo:
+        .space 4
+bar:
+        .space  4092 - (bar - foo)
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@ -10774,3 +10774,9 @@ btcl $4, (%eax)
 // CHECK: 	clzero
 // CHECK:  encoding: [0x0f,0x01,0xfc]
        	clzero
+
+// CHECK: lock addl %esi, (%edi)
+// INTEL: lock add dword ptr [edi], esi
+// CHECK:  encoding: [0xf0,0x01,0x37]
+        	lock add %esi, (%edi)
+
--- a/test/ThinLTO/X86/Inputs/dicompositetype-unique-alias.ll
+++ b/test/ThinLTO/X86/Inputs/dicompositetype-unique-alias.ll
@ -0,0 +1,39 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%struct.CFVS = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.S = type { i8 }
+
+@_ZN4CFVSD1Ev = alias void (%struct.CFVS*), void (%struct.CFVS*)* @_ZN4CFVSD2Ev
+
+define void @_ZN4CFVSD2Ev(%struct.CFVS* %this) unnamed_addr align 2 !dbg !8 {
+entry:
+  %this.addr = alloca %struct.CFVS*, align 8
+  store %struct.CFVS* %this, %struct.CFVS** %this.addr, align 8
+  %this1 = load %struct.CFVS*, %struct.CFVS** %this.addr, align 8
+  %m_val = getelementptr inbounds %struct.CFVS, %struct.CFVS* %this1, i32 0, i32 0
+  ret void
+}
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-b.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~CFVS", linkageName: "_ZN4CFVSD2Ev", scope: !9, file: !1, line: 2, type: !28, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !27, variables: !2)
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !10, line: 7, size: 8, elements: !11, identifier: "_ZTS4CFVS")
+!10 = !DIFile(filename: "./bz188598.h", directory: "")
+!11 = !{!35}
+!27 = !DISubprogram(name: "~CFVS", scope: !9, file: !10, line: 8, type: !28, isLocal: false, isDefinition: false, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false)
+!28 = !DISubroutineType(types: !29)
+!29 = !{null, !30}
+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!35 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
--- a/test/ThinLTO/X86/dicompositetype-unique-alias.ll
+++ b/test/ThinLTO/X86/dicompositetype-unique-alias.ll
@ -0,0 +1,62 @@
+; RUN: opt -module-summary -o %t1.bc %s
+; RUN: opt -module-summary -o %t2.bc %S/Inputs/dicompositetype-unique-alias.ll
+; RUN: llvm-lto --thinlto-action=run %t1.bc %t2.bc -thinlto-save-temps=%t3.
+; RUN: llvm-dis %t3.0.3.imported.bc -o - | FileCheck %s
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t --save-temps \
+; RUN: -r %t1.bc,_ZN1CD2Ev,pl \
+; RUN: -r %t1.bc,_ZN4CFVSD1Ev,l \
+; RUN: -r %t1.bc,_ZN4CFVSD2Ev,l \
+; RUN: -r %t1.bc,_Z3Getv,l \
+; RUN: -r %t2.bc,_ZN4CFVSD1Ev,pl \
+; RUN: -r %t2.bc,_ZN4CFVSD2Ev,pl \
+; RUN: -r %t2.bc,_Z3Getv,l
+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s
+
+; Only llvm-lto2 adds the dso_local keyword, hence the {{.*}}
+; CHECK: define available_externally{{.*}} void @_ZN4CFVSD1Ev
+
+; Confirm that we only have a single DICompositeType after importing
+; both an alias and its aliasee, since ODR Type Uniquing is enabled.
+; CHECK: DICompositeType
+; CHECK-NOT: DICompositeType
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%class.C = type <{ i32 (...)**, %class.A, %struct.CFVS, [6 x i8] }>
+%class.A = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.CFVS = type { %struct.Vec }
+%struct.S = type { i8 }
+
+define void @_ZN1CD2Ev(%class.C* %this) unnamed_addr align 2 {
+entry:
+  %this.addr = alloca %class.C*, align 8
+  %this1 = load %class.C*, %class.C** %this.addr, align 8
+  %m = getelementptr inbounds %class.C, %class.C* %this1, i32 0, i32 2
+  call void @_ZN4CFVSD1Ev(%struct.CFVS* %m), !dbg !50
+  call void @_ZN4CFVSD2Ev(%struct.CFVS* %m), !dbg !50
+  ret void
+}
+
+declare void @_ZN4CFVSD1Ev(%struct.CFVS*) unnamed_addr
+declare void @_ZN4CFVSD2Ev(%struct.CFVS*) unnamed_addr
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-a.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~C", linkageName: "_ZN1CD2Ev", scope: !1, file: !1, line: 9, type: !47, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!55}
+!50 = !DILocation(line: 9, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9)
+!55 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
--- a/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
+++ b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll
@ -16,3 +16,27 @@ Tail:
  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
  ret i32 %r
 }
+
+define void @fn1(i16 %p1) {
+entry:
+  ret void
+}
+
+define void @fn2() {
+  ret void
+
+; Unreachable code below
+
+for.inc:                                          ; preds = %for.inc
+  br i1 undef, label %for.end6, label %for.inc
+
+for.end6:                                         ; preds = %for.inc
+  br i1 undef, label %lor.rhs, label %lor.end
+
+lor.rhs:                                          ; preds = %for.end6
+  br label %lor.end
+
+lor.end:                                          ; preds = %for.end6, %lor.rhs
+  call void @fn1(i16 0)
+  ret void
+}
--- a/test/Transforms/InstCombine/minmax-fold.ll
+++ b/test/Transforms/InstCombine/minmax-fold.ll
@ -899,3 +899,24 @@ define i32 @common_factor_umax_extra_use_both(i32 %a, i32 %b, i32 %c) {
  ret i32 %max_abc
 }

+; This would assert. Don't assume that earlier min/max types match a possible later min/max.
+
+define float @not_min_of_min(i8 %i, float %x) {
+; CHECK-LABEL: @not_min_of_min(
+; CHECK-NEXT:    [[CMP1_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00
+; CHECK-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1_INV]], float 1.000000e+00, float [[X]]
+; CHECK-NEXT:    [[CMP2_INV:%.*]] = fcmp fast oge float [[X]], 2.000000e+00
+; CHECK-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2_INV]], float 2.000000e+00, float [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i8 [[I:%.*]], 16
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[CMP3]], float [[MIN1]], float [[MIN2]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %cmp1 = fcmp fast ult float %x, 1.0
+  %min1 = select i1 %cmp1, float %x, float 1.0
+  %cmp2 = fcmp fast ult float %x, 2.0
+  %min2 = select i1 %cmp2, float %x, float 2.0
+  %cmp3 = icmp ult i8 %i, 16
+  %r = select i1 %cmp3, float %min1, float %min2
+  ret float %r
+}
+
--- a/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
+++ b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
@ -1,3 +1,4 @@
+; XFAIL: *
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s

--- a/test/Transforms/StructurizeCFG/bug36015.ll
+++ b/test/Transforms/StructurizeCFG/bug36015.ll
@ -0,0 +1,53 @@
+; RUN: opt -S -structurizecfg %s | FileCheck %s
+
+; r321751 introduced a bug where control flow branching from if to exit was
+; not handled properly and instead ended up in an infinite loop.
+define void @bug36015(i32 %cmp0, i32 %count) {
+entry:
+  br label %loop.outer
+
+loop.outer:
+  %ctr.loop.outer = phi i32 [ 0, %entry ], [ %ctr.else, %else ]
+  call void @foo(i32 0)
+  br label %loop.inner
+
+loop.inner:
+  %ctr.loop.inner = phi i32 [ %ctr.loop.outer, %loop.outer ], [ %ctr.if, %if ]
+  call void @foo(i32 1)
+  %cond.inner = icmp eq i32 %cmp0, %ctr.loop.inner
+  br i1 %cond.inner, label %if, label %else
+
+; CHECK: if:
+; CHECK:   %0 = xor i1 %cond.if, true
+; CHECK:   br label %Flow
+if:
+  %ctr.if = add i32 %ctr.loop.inner, 1
+  call void @foo(i32 2)
+  %cond.if = icmp slt i32 %ctr.if, %count
+  br i1 %cond.if, label %loop.inner, label %exit
+
+; CHECK: Flow:
+; CHECK:   %2 = phi i1 [ %0, %if ], [ true, %loop.inner ]
+; CHECK:   %3 = phi i1 [ false, %if ], [ true, %loop.inner ]
+; CHECK:   br i1 %2, label %Flow1, label %loop.inner
+
+; CHECK: Flow1:
+; CHECK:   br i1 %3, label %else, label %Flow2
+
+; CHECK: else:
+; CHECK:   br label %Flow2
+else:
+  %ctr.else = add i32 %ctr.loop.inner, 1
+  call void @foo(i32 3)
+  %cond.else = icmp slt i32 %ctr.else, %count
+  br i1 %cond.else, label %loop.outer, label %exit
+
+; CHECK: Flow2:
+; CHECK:   %6 = phi i1 [ %4, %else ], [ true, %Flow1 ]
+; CHECK:   br i1 %6, label %exit, label %loop.outer
+
+exit:
+  ret void
+}
+
+declare void @foo(i32)
--- a/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/test/Transforms/StructurizeCFG/nested-loop-order.ll
@ -1,76 +1,32 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -structurizecfg %s -o - | FileCheck %s

 define void @main(float addrspace(1)* %out) {
-; CHECK-LABEL: @main(
-; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    br label [[LOOP_OUTER:%.*]]
-; CHECK:       LOOP.outer:
-; CHECK-NEXT:    [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
-; CHECK-NEXT:    [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       LOOP:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
-; CHECK-NEXT:    [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[TMP22]], true
-; CHECK-NEXT:    br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
-; CHECK:       Flow2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
-; CHECK-NEXT:    br label [[FLOW]]
-; CHECK:       Flow3:
-; CHECK-NEXT:    br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
-; CHECK:       ENDLOOP:
-; CHECK-NEXT:    [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
-; CHECK-NEXT:    [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
-; CHECK-NEXT:    store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
-; CHECK-NEXT:    ret void
-; CHECK:       ENDIF:
-; CHECK-NEXT:    [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i1 [[TMP31]], true
-; CHECK-NEXT:    br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
-; CHECK:       Flow1:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
-; CHECK-NEXT:    [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
-; CHECK:       IF29:
-; CHECK-NEXT:    [[TMP32]] = icmp sgt i32 [[TMP20]], 2
-; CHECK-NEXT:    br label [[FLOW2]]
-; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
-; CHECK-NEXT:    [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
-; CHECK:       ENDIF28:
-; CHECK-NEXT:    [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
-; CHECK-NEXT:    [[TMP36]] = icmp sgt i32 [[TMP20]], 2
-; CHECK-NEXT:    br label [[FLOW1]]
-;
+
+; CHECK: main_body:
+; CHECK: br label %LOOP.outer
 main_body:
  br label %LOOP.outer

+; CHECK: LOOP.outer:
+; CHECK: br label %LOOP
 LOOP.outer:                                       ; preds = %ENDIF28, %main_body
  %temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
  %temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
  br label %LOOP

+; CHECK: LOOP:
+; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
 LOOP:                                             ; preds = %IF29, %LOOP.outer
  %temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
  %tmp20 = add i32 %temp4.0, 1
  %tmp22 = icmp sgt i32 %tmp20, 3
  br i1 %tmp22, label %ENDLOOP, label %ENDIF

+; CHECK: Flow3
+; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
+
+; CHECK: ENDLOOP:
+; CHECK: ret void
 ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LOOP
  %temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
  %tmp23 = icmp eq i32 %tmp20, 3
@ -78,14 +34,29 @@ ENDLOOP:                                          ; preds = %ENDIF28, %IF29, %LO
  store float %.45, float addrspace(1)* %out
  ret void

+; CHECK: ENDIF:
+; CHECK: br i1 %tmp31, label %IF29, label %Flow1
 ENDIF:                                            ; preds = %LOOP
  %tmp31 = icmp sgt i32 %tmp20, 1
  br i1 %tmp31, label %IF29, label %ENDIF28

+; CHECK: Flow:
+; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
+
+; CHECK: IF29:
+; CHECK: br label %Flow1
 IF29:                                             ; preds = %ENDIF
  %tmp32 = icmp sgt i32 %tmp20, 2
  br i1 %tmp32, label %ENDLOOP, label %LOOP

+; CHECK: Flow1:
+; CHECK: br label %Flow
+
+; CHECK: Flow2:
+; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
+
+; CHECK: ENDIF28:
+; CHECK: br label %Flow3
 ENDIF28:                                          ; preds = %ENDIF
  %tmp35 = fadd float %temp8.0.ph, 1.0
  %tmp36 = icmp sgt i32 %tmp20, 2