Vendor import of llvm release_50 branch r310316:
https://llvm.org/svn/llvm-project/llvm/branches/release_50@310316
parent 3ad6a4b447
commit 4e20bb0468
@@ -20,6 +20,10 @@ type llattribute
type llmemorybuffer
type llmdkind

exception FeatureDisabled of string

let () = Callback.register_exception "Llvm.FeatureDisabled" (FeatureDisabled "")

module TypeKind = struct
  type t =
  | Void
@@ -371,6 +371,8 @@ type ('a, 'b) llrev_pos =

(** {6 Exceptions} *)

exception FeatureDisabled of string

exception IoError of string
@@ -336,7 +336,12 @@ CAMLprim LLVMContextRef llvm_type_context(LLVMTypeRef Ty) {

/* lltype -> unit */
CAMLprim value llvm_dump_type(LLVMTypeRef Val) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  LLVMDumpType(Val);
#else
  caml_raise_with_arg(*caml_named_value("Llvm.FeatureDisabled"),
                      caml_copy_string("dump"));
#endif
  return Val_unit;
}
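For context, a usage sketch (not part of the imported diff): with this change, Llvm.dump_type can raise the new Llvm.FeatureDisabled exception when LLVM was built with NDEBUG and without LLVM_ENABLE_DUMP, so OCaml callers may want to guard calls to it. The helper name safe_dump below is hypothetical:

(* Catch the exception added by this import; assumes only the
   dump_type binding and FeatureDisabled exception shown above. *)
let safe_dump (ty : Llvm.lltype) : unit =
  try Llvm.dump_type ty
  with Llvm.FeatureDisabled feature ->
    prerr_endline ("LLVM was built without support for: " ^ feature)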
@@ -87,6 +87,11 @@ function(add_ocaml_library name)
  foreach( include_dir ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} )
    set(c_flags "${c_flags} -I${include_dir}")
  endforeach()
  # include -D/-UNDEBUG to match dump function visibility
  # regex from HandleLLVMOptions.cmake
  string(REGEX MATCH "(^| )[/-][UD] *NDEBUG($| )" flag_matches
         "${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_C_FLAGS}")
  set(c_flags "${c_flags} ${flag_matches}")

  foreach( ocaml_file ${ARG_OCAML} )
    list(APPEND sources "${ocaml_file}.mli" "${ocaml_file}.ml")
@@ -199,7 +204,7 @@ function(add_ocaml_library name)
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
                      GROUP_READ GROUP_EXECUTE
                      WORLD_READ WORLD_EXECUTE
          DESTINATION "${LLVM_OCAML_INSTALL_PATH}/llvm")
          DESTINATION "${LLVM_OCAML_INSTALL_PATH}/stublibs")

  foreach( install_file ${install_files} ${install_shlibs} )
    get_filename_component(filename "${install_file}" NAME)
@@ -125,7 +125,22 @@ Changes to the AMDGPU Target
Changes to the AVR Target
-----------------------------

During this release ...
This release consists mainly of bugfixes and implementations of features
required for compiling basic Rust programs.

* Enable the branch relaxation pass so that we don't crash on large
  stack load/stores

* Add support for lowering bit-rotations to the native `ror` and `rol`
  instructions

* Fix bug where function pointers were treated as pointers to RAM and not
  pointers to program memory

* Fix broken code generaton for shift-by-variable expressions

* Support zero-sized types in argument lists; this is impossible in C,
  but possible in Rust

Changes to the OCaml bindings
-----------------------------
@@ -312,6 +312,12 @@ template <typename T> class ArrayRef;
                          const DataLayout &DL, LoopInfo *LI = nullptr,
                          unsigned MaxLookup = 6);

/// This is a wrapper around GetUnderlyingObjects and adds support for basic
/// ptrtoint+arithmetic+inttoptr sequences.
void getUnderlyingObjectsForCodeGen(const Value *V,
                                    SmallVectorImpl<Value *> &Objects,
                                    const DataLayout &DL);

/// Return true if the only users of this pointer are lifetime markers.
bool onlyUsedByLifetimeMarkers(const Value *V);
@@ -661,6 +661,12 @@ class MachineFunction {
  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                          int64_t Offset, uint64_t Size);

  /// Allocate a new MachineMemOperand by copying an existing one,
  /// replacing only AliasAnalysis information. MachineMemOperands are owned
  /// by the MachineFunction and need not be explicitly deallocated.
  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                          const AAMDNodes &AAInfo);

  using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity;

  /// Allocate an array of MachineOperands. This is only intended for use by
@@ -379,6 +379,9 @@ class MachineInstr
    return NumMemRefs == 1;
  }

  /// Return the number of memory operands.
  unsigned getNumMemOperands() const { return NumMemRefs; }

  /// API for querying MachineInstr properties. They are the same as MCInstrDesc
  /// queries but they are bundle aware.
@@ -3277,6 +3277,69 @@ void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl<Value *> &Objects,
  } while (!Worklist.empty());
}

/// This is the function that does the work of looking through basic
/// ptrtoint+arithmetic+inttoptr sequences.
static const Value *getUnderlyingObjectFromInt(const Value *V) {
  do {
    if (const Operator *U = dyn_cast<Operator>(V)) {
      // If we find a ptrtoint, we can transfer control back to the
      // regular getUnderlyingObjectFromInt.
      if (U->getOpcode() == Instruction::PtrToInt)
        return U->getOperand(0);
      // If we find an add of a constant, a multiplied value, or a phi, it's
      // likely that the other operand will lead us to the base
      // object. We don't have to worry about the case where the
      // object address is somehow being computed by the multiply,
      // because our callers only care when the result is an
      // identifiable object.
      if (U->getOpcode() != Instruction::Add ||
          (!isa<ConstantInt>(U->getOperand(1)) &&
           Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
           !isa<PHINode>(U->getOperand(1))))
        return V;
      V = U->getOperand(0);
    } else {
      return V;
    }
    assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
  } while (true);
}

/// This is a wrapper around GetUnderlyingObjects and adds support for basic
/// ptrtoint+arithmetic+inttoptr sequences.
void llvm::getUnderlyingObjectsForCodeGen(const Value *V,
                                          SmallVectorImpl<Value *> &Objects,
                                          const DataLayout &DL) {
  SmallPtrSet<const Value *, 16> Visited;
  SmallVector<const Value *, 4> Working(1, V);
  do {
    V = Working.pop_back_val();

    SmallVector<Value *, 4> Objs;
    GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);

    for (Value *V : Objs) {
      if (!Visited.insert(V).second)
        continue;
      if (Operator::getOpcode(V) == Instruction::IntToPtr) {
        const Value *O =
          getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
        if (O->getType()->isPointerTy()) {
          Working.push_back(O);
          continue;
        }
      }
      // If GetUnderlyingObjects fails to find an identifiable object,
      // getUnderlyingObjectsForCodeGen also fails for safety.
      if (!isIdentifiedObject(V)) {
        Objects.clear();
        return;
      }
      Objects.push_back(const_cast<Value *>(V));
    }
  } while (!Working.empty());
}

/// Return true if the only users of this pointer are lifetime markers.
bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
  for (const User *U : V->users()) {
@@ -1475,13 +1475,14 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
      bool PredAnalyzable =
          !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);

      if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) {
      if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
          PredTBB != PredFBB) {
        // The predecessor has a conditional branch to this block which consists
        // of only a tail call. Try to fold the tail call into the conditional
        // branch.
        if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
          // TODO: It would be nice if analyzeBranch() could provide a pointer
          // to the branch insturction so replaceBranchWithTailCall() doesn't
          // to the branch instruction so replaceBranchWithTailCall() doesn't
          // have to search for it.
          TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
          ++NumTailCalls;
@@ -330,6 +330,20 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
                             MMO->getOrdering(), MMO->getFailureOrdering());
}

MachineMemOperand *
MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
                                      const AAMDNodes &AAInfo) {
  MachinePointerInfo MPI = MMO->getValue() ?
             MachinePointerInfo(MMO->getValue(), MMO->getOffset()) :
             MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset());

  return new (Allocator)
             MachineMemOperand(MPI, MMO->getFlags(), MMO->getSize(),
                               MMO->getBaseAlignment(), AAInfo,
                               MMO->getRanges(), MMO->getSyncScopeID(),
                               MMO->getOrdering(), MMO->getFailureOrdering());
}

MachineInstr::mmo_iterator
MachineFunction::allocateMemRefsArray(unsigned long Num) {
  return Allocator.Allocate<MachineMemOperand *>(Num);
@@ -578,10 +578,8 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
  if (BasePtr == nullptr)
    return false;

  return isDereferenceableAndAlignedPointer(BasePtr, 1,
                                            APInt(DL.getPointerSize(),
                                                  Offset + Size),
                                            DL);
  return isDereferenceableAndAlignedPointer(
      BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
}

/// getConstantPool - Return a MachinePointerInfo record that refers to the
@@ -121,63 +121,6 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
  SchedModel.init(ST.getSchedModel(), &ST, TII);
}

/// This is the function that does the work of looking through basic
/// ptrtoint+arithmetic+inttoptr sequences.
static const Value *getUnderlyingObjectFromInt(const Value *V) {
  do {
    if (const Operator *U = dyn_cast<Operator>(V)) {
      // If we find a ptrtoint, we can transfer control back to the
      // regular getUnderlyingObjectFromInt.
      if (U->getOpcode() == Instruction::PtrToInt)
        return U->getOperand(0);
      // If we find an add of a constant, a multiplied value, or a phi, it's
      // likely that the other operand will lead us to the base
      // object. We don't have to worry about the case where the
      // object address is somehow being computed by the multiply,
      // because our callers only care when the result is an
      // identifiable object.
      if (U->getOpcode() != Instruction::Add ||
          (!isa<ConstantInt>(U->getOperand(1)) &&
           Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
           !isa<PHINode>(U->getOperand(1))))
        return V;
      V = U->getOperand(0);
    } else {
      return V;
    }
    assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
  } while (true);
}

/// This is a wrapper around GetUnderlyingObjects and adds support for basic
/// ptrtoint+arithmetic+inttoptr sequences.
static void getUnderlyingObjects(const Value *V,
                                 SmallVectorImpl<Value *> &Objects,
                                 const DataLayout &DL) {
  SmallPtrSet<const Value *, 16> Visited;
  SmallVector<const Value *, 4> Working(1, V);
  do {
    V = Working.pop_back_val();

    SmallVector<Value *, 4> Objs;
    GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);

    for (Value *V : Objs) {
      if (!Visited.insert(V).second)
        continue;
      if (Operator::getOpcode(V) == Instruction::IntToPtr) {
        const Value *O =
          getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
        if (O->getType()->isPointerTy()) {
          Working.push_back(O);
          continue;
        }
      }
      Objects.push_back(const_cast<Value *>(V));
    }
  } while (!Working.empty());
}

/// If this machine instr has memory reference information and it can be tracked
/// to a normal reference to a known object, return the Value for that object.
static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
@@ -208,12 +151,10 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
      Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
    } else if (const Value *V = MMO->getValue()) {
      SmallVector<Value *, 4> Objs;
      getUnderlyingObjects(V, Objs, DL);
      getUnderlyingObjectsForCodeGen(V, Objs, DL);

      for (Value *V : Objs) {
        if (!isIdentifiedObject(V))
          return false;

        assert(isIdentifiedObject(V));
        Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
      }
    } else
@@ -99,6 +99,27 @@ LimitFPPrecision("limit-float-precision",
// store [4096 x i8] %data, [4096 x i8]* %buffer
static const unsigned MaxParallelChains = 64;

// True if the Value passed requires ABI mangling as it is a parameter to a
// function or a return value from a function which is not an intrinsic.
static bool isABIRegCopy(const Value * V) {
  const bool IsRetInst = V && isa<ReturnInst>(V);
  const bool IsCallInst = V && isa<CallInst>(V);
  const bool IsInLineAsm =
      IsCallInst && static_cast<const CallInst *>(V)->isInlineAsm();
  const bool IsIndirectFunctionCall =
      IsCallInst && !IsInLineAsm &&
      !static_cast<const CallInst *>(V)->getCalledFunction();
  // It is possible that the call instruction is an inline asm statement or an
  // indirect function call in which case the return value of
  // getCalledFunction() would be nullptr.
  const bool IsInstrinsicCall =
      IsCallInst && !IsInLineAsm && !IsIndirectFunctionCall &&
      static_cast<const CallInst *>(V)->getCalledFunction()->getIntrinsicID() !=
          Intrinsic::not_intrinsic;

  return IsRetInst || (IsCallInst && (!IsInLineAsm && !IsInstrinsicCall));
}

static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                      const SDValue *Parts, unsigned NumParts,
                                      MVT PartVT, EVT ValueVT, const Value *V,
@@ -1026,13 +1047,9 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {

  if (It != FuncInfo.ValueMap.end()) {
    unsigned InReg = It->second;
    bool IsABIRegCopy =
        V && ((isa<CallInst>(V) &&
               !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
              isa<ReturnInst>(V));

    RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
                     DAG.getDataLayout(), InReg, Ty, IsABIRegCopy);
                     DAG.getDataLayout(), InReg, Ty, isABIRegCopy(V));
    SDValue Chain = DAG.getEntryNode();
    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
                                 V);
@@ -1221,13 +1238,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
  // If this is an instruction which fast-isel has deferred, select it now.
  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
    unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
    bool IsABIRegCopy =
        V && ((isa<CallInst>(V) &&
               !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
              isa<ReturnInst>(V));

    RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
                     Inst->getType(), IsABIRegCopy);
                     Inst->getType(), isABIRegCopy(V));
    SDValue Chain = DAG.getEntryNode();
    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
  }
@@ -8281,13 +8294,9 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If this is an InlineAsm we have to match the registers required, not the
  // notional registers required by the type.
  bool IsABIRegCopy =
      V && ((isa<CallInst>(V) &&
             !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
            isa<ReturnInst>(V));

  RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
                   V->getType(), IsABIRegCopy);
                   V->getType(), isABIRegCopy(V));
  SDValue Chain = DAG.getEntryNode();

  ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
@@ -37,6 +37,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
@@ -889,6 +890,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {

  // Keep a list of *allocas* which need to be remapped.
  DenseMap<const AllocaInst*, const AllocaInst*> Allocas;

  // Keep a list of allocas which has been affected by the remap.
  SmallPtrSet<const AllocaInst*, 32> MergedAllocas;

  for (const std::pair<int, int> &SI : SlotRemap) {
    const AllocaInst *From = MFI->getObjectAllocation(SI.first);
    const AllocaInst *To = MFI->getObjectAllocation(SI.second);
@@ -908,6 +913,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
      Inst = Cast;
    }

    // We keep both slots to maintain AliasAnalysis metadata later.
    MergedAllocas.insert(From);
    MergedAllocas.insert(To);

    // Allow the stack protector to adjust its value map to account for the
    // upcoming replacement.
    SP->adjustForColoring(From, To);
@@ -939,13 +948,6 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {

    // Update the MachineMemOperand to use the new alloca.
    for (MachineMemOperand *MMO : I.memoperands()) {
      // FIXME: In order to enable the use of TBAA when using AA in CodeGen,
      // we'll also need to update the TBAA nodes in MMOs with values
      // derived from the merged allocas. When doing this, we'll need to use
      // the same variant of GetUnderlyingObjects that is used by the
      // instruction scheduler (that can look through ptrtoint/inttoptr
      // pairs).

      // We've replaced IR-level uses of the remapped allocas, so we only
      // need to replace direct uses here.
      const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
@@ -997,6 +999,48 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
      MO.setIndex(ToSlot);
      FixedInstr++;
    }

    // We adjust AliasAnalysis information for merged stack slots.
    MachineSDNode::mmo_iterator NewMemOps =
        MF->allocateMemRefsArray(I.getNumMemOperands());
    unsigned MemOpIdx = 0;
    bool ReplaceMemOps = false;
    for (MachineMemOperand *MMO : I.memoperands()) {
      // If this memory location can be a slot remapped here,
      // we remove AA information.
      bool MayHaveConflictingAAMD = false;
      if (MMO->getAAInfo()) {
        if (const Value *MMOV = MMO->getValue()) {
          SmallVector<Value *, 4> Objs;
          getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout());

          if (Objs.empty())
            MayHaveConflictingAAMD = true;
          else
            for (Value *V : Objs) {
              // If this memory location comes from a known stack slot
              // that is not remapped, we continue checking.
              // Otherwise, we need to invalidate AA infomation.
              const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
              if (AI && MergedAllocas.count(AI)) {
                MayHaveConflictingAAMD = true;
                break;
              }
            }
        }
      }
      if (MayHaveConflictingAAMD) {
        NewMemOps[MemOpIdx++] = MF->getMachineMemOperand(MMO, AAMDNodes());
        ReplaceMemOps = true;
      }
      else
        NewMemOps[MemOpIdx++] = MMO;
    }

    // If any memory operand is updated, set memory references of
    // this instruction.
    if (ReplaceMemOps)
      I.setMemRefs(std::make_pair(NewMemOps, I.getNumMemOperands()));
  }

  // Update the location of C++ catch objects for the MSVC personality routine.
@@ -2097,15 +2097,19 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
      // Subsequent evaluation would get confused and produce erroneous results.
      //
      // The following prohibits such a GEP from being formed by checking to see
      // if the index is in-range with respect to an array or vector.
      // if the index is in-range with respect to an array.
      // TODO: This code may be extended to handle vectors as well.
      bool PerformFold = false;
      if (Idx0->isNullValue())
        PerformFold = true;
      else if (LastI.isSequential())
        if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
          PerformFold =
              !LastI.isBoundedSequential() ||
              isIndexInRangeOfArrayType(LastI.getSequentialNumElements(), CI);
          PerformFold = (!LastI.isBoundedSequential() ||
                         isIndexInRangeOfArrayType(
                             LastI.getSequentialNumElements(), CI)) &&
                        !CE->getOperand(CE->getNumOperands() - 1)
                             ->getType()
                             ->isVectorTy();

      if (PerformFold) {
        SmallVector<Value*, 16> NewIndices;
@@ -542,15 +542,12 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
  SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t);

  //__imp_ String Table
  if (Imp) {
    SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 7;
    writeStringTable(Buffer, {std::string("__imp_").append(Sym),
                              std::string("__imp_").append(Weak)});
  } else {
    SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 1;
    writeStringTable(Buffer, {Sym, Weak});
  }
  StringRef Prefix = Imp ? "__imp_" : "";
  SymbolTable[3].Name.Offset.Offset =
      sizeof(uint32_t) + Sym.size() + Prefix.size() + 1;
  append(Buffer, SymbolTable);
  writeStringTable(Buffer, {(Prefix + Sym).str(),
                            (Prefix + Weak).str()});

  // Copied here so we can still use writeStringTable
  char *Buf = Alloc.Allocate<char>(Buffer.size());
@@ -946,6 +946,18 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
  case AArch64::CMP_SWAP_128:
    return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);

  case AArch64::AESMCrrTied:
  case AArch64::AESIMCrrTied: {
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, MI.getDebugLoc(),
                TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
                                                          AArch64::AESIMCrr))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1));
    transferImpOps(MI, MIB, MIB);
    MI.eraseFromParent();
    return true;
  }
  }
  return false;
}
@@ -506,19 +506,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
    return;
  }

  auto CSStackSize = AFI->getCalleeSavedStackSize();
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - CSStackSize);
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);

  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  if (CombineSPBump) {
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
                    MachineInstr::FrameSetup);
    NumBytes = 0;
  } else if (CSStackSize != 0) {
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
                                                     -CSStackSize);
    NumBytes -= CSStackSize;
                                                     -PrologueSaveSize);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -532,8 +536,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
    ++MBBI;
  }
  if (HasFP) {
    // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
    int FPOffset = CSStackSize - 16;
    // Only set up FP if we actually need to. Frame pointer is fp =
    // sp - fixedobject - 16.
    int FPOffset = AFI->getCalleeSavedStackSize() - 16;
    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();
@@ -672,8 +677,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
  if (HasFP) {
    // Define the current CFA rule to use the provided FP.
    unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
    unsigned CFIIndex = MF.addFrameInst(
        MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
        nullptr, Reg, 2 * StackGrowth - FixedObject));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);
@@ -759,12 +764,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
  // it as the 2nd argument of AArch64ISD::TC_RETURN.

  auto CSStackSize = AFI->getCalleeSavedStackSize();
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);

  if (!CombineSPBump && CSStackSize != 0)
  if (!CombineSPBump && PrologueSaveSize != 0)
    convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
        MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);

  // Move past the restores of the callee-saved registers.
  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
@@ -786,7 +795,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
    return;
  }

  NumBytes -= CSStackSize;
  NumBytes -= PrologueSaveSize;
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  if (!hasFP(MF)) {
@@ -796,7 +805,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
    if (RedZone && ArgumentPopSize == 0)
      return;

    bool NoCalleeSaveRestore = CSStackSize == 0;
    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
    int StackRestoreBytes = RedZone ? 0 : NumBytes;
    if (NoCalleeSaveRestore)
      StackRestoreBytes += ArgumentPopSize;
@@ -815,7 +824,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
  // be able to save any instructions.
  if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
                    -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
                    -AFI->getCalleeSavedStackSize() + 16, TII,
                    MachineInstr::FrameDestroy);
  else if (NumBytes)
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
                    MachineInstr::FrameDestroy);
@@ -845,7 +855,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  int FPOffset = MFI.getObjectOffset(FI) + 16;
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  bool IsWin64 =
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
  int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
  int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
  bool isFixed = MFI.isFixedObjectIndex(FI);
@@ -956,12 +970,6 @@ static void computeCalleeSaveRegisterPairs(
         "Odd number of callee-saved regs to spill!");
  int Offset = AFI->getCalleeSavedStackSize();

  unsigned GPRSaveSize = AFI->getVarArgsGPRSize();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
  if (IsWin64)
    Offset -= alignTo(GPRSaveSize, 16);

  for (unsigned i = 0; i < Count; ++i) {
    RegPairInfo RPI;
    RPI.Reg1 = CSI[i].getReg();
@@ -9586,8 +9586,8 @@ static bool performTBISimplification(SDValue Addr,
                                     SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                        DCI.isBeforeLegalizeOps());
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
@@ -37,6 +37,9 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
                       AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
                       AssemblerPredicate<"FeatureSPE", "spe">;
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
                       AssemblerPredicate<"FeatureFuseAES",
                                          "fuse-aes">;
def HasSVE : Predicate<"Subtarget->hasSVE()">,
                       AssemblerPredicate<"FeatureSVE", "sve">;
@@ -5304,6 +5307,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;

// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
// for AES fusion on some CPUs.
let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
                 Sched<[WriteV]>;
def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
                 Sched<[WriteV]>;
}

// Only use constrained versions of AES(I)MC instructions if they are paired with
// AESE/AESD.
def : Pat<(v16i8 (int_aarch64_crypto_aesmc
            (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
                                            (v16i8 V128:$src2))))),
          (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
                                             (v16i8 V128:$src2)))))>,
          Requires<[HasFuseAES]>;

def : Pat<(v16i8 (int_aarch64_crypto_aesimc
            (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
                                            (v16i8 V128:$src2))))),
          (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
                                              (v16i8 V128:$src2)))))>,
          Requires<[HasFuseAES]>;

def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
@@ -118,11 +118,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
    // Fuse AES crypto operations.
    switch(SecondOpcode) {
    // AES encode.
    case AArch64::AESMCrr :
    case AArch64::AESMCrr:
    case AArch64::AESMCrrTied:
      return FirstOpcode == AArch64::AESErr ||
             FirstOpcode == AArch64::INSTRUCTION_LIST_END;
    // AES decode.
    case AArch64::AESIMCrr:
    case AArch64::AESIMCrrTied:
      return FirstOpcode == AArch64::AESDrr ||
             FirstOpcode == AArch64::INSTRUCTION_LIST_END;
    }
@@ -3984,6 +3984,13 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
    if (Offset != MFI.getObjectOffset(FI))
      return false;

    // If this is not byval, check that the argument stack object is immutable.
    // inalloca and argument copy elision can create mutable argument stack
    // objects. Byval objects can be mutated, but a byval call intends to pass the
    // mutated memory.
    if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
      return false;

    if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
      // If the argument location is wider than the argument type, check that any
      // extension flags match.
@@ -30605,8 +30612,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask(APInt::getSignMask(BitWidth));
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
      // If we changed the computation somewhere in the DAG, this change will
@@ -3697,8 +3697,7 @@ let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
              OBXS, Requires<[HasSSE2]>;
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
}

let SchedRW = [WriteFence] in {
@@ -356,7 +356,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
      // Just add all the struct element types.
      Type *AgTy = cast<PointerType>(I->getType())->getElementType();
      Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
                                        "", InsertPt);
                                        I->getParamAlignment(), "", InsertPt);
      StructType *STy = cast<StructType>(AgTy);
      Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
                        nullptr};
@@ -756,7 +756,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
  bool runOnFunction() {
    if (!ClStack) return false;

    if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas();
    if (ClRedzoneByvalArgs && Mapping.Offset != kDynamicShadowSentinel)
      copyArgsPassedByValToAllocas();

    // Collect alloca, ret, lifetime instructions etc.
    for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
@@ -1790,7 +1790,8 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
  // variables that do not have their 'addresses taken'. If they don't have
  // their addresses taken, we can propagate constants through them.
  for (GlobalVariable &G : M.globals())
    if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
    if (!G.isConstant() && G.hasLocalLinkage() &&
        G.hasDefinitiveInitializer() && !AddressIsTaken(&G))
      Solver.TrackValueOfGlobalVariable(&G);

  // Solve for constants.
@@ -132,7 +132,8 @@ std::unique_ptr<Module> llvm::CloneModule(
    SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
    I->getAllMetadata(MDs);
    for (auto MD : MDs)
      GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
      GV->addMetadata(MD.first,
                      *MapMetadata(MD.second, VMap, RF_MoveDistinctMDs));

    copyComdat(GV, &*I);
  }
@@ -2,14 +2,14 @@

define win64cc void @pass_va(i32 %count, ...) nounwind {
entry:
; CHECK: sub sp, sp, #80
; CHECK: str x30, [sp, #-80]!
; CHECK: add x8, sp, #24
; CHECK: add x0, sp, #24
; CHECK: stp x6, x7, [sp, #64]
; CHECK: stp x4, x5, [sp, #48]
; CHECK: stp x2, x3, [sp, #32]
; CHECK: str x1, [sp, #24]
; CHECK: stp x30, x8, [sp]
; CHECK: str x8, [sp, #8]
; CHECK: bl other_func
; CHECK: ldr x30, [sp], #80
; CHECK: ret
@@ -1,10 +1,10 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-aes,+crypto | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=generic -mattr=+crypto | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSEALLPAIRS
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-aes,+crypto | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=generic -mattr=+crypto | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s

declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %d)
@@ -76,41 +76,23 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
  ret void

; CHECK-LABEL: aesea:
; CHECKFUSEALLPAIRS: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECKFUSEALLPAIRS: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECKFUSEALLPAIRS: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
; CHECKFUSEALLPAIRS: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
; CHECKFUSEALLPAIRS: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
; CHECKFUSEALLPAIRS: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
; CHECKFUSEALLPAIRS: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECKFUSEALLPAIRS: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKFUSEALLPAIRS-NOT: aesmc

; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
; CHECKM1: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VA]], [[VA]]
; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VB]], [[VB]]
; CHECK: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VC]], [[VC]]
; CHECK: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VD]], [[VD]]
; CHECK: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VE]], [[VE]]
; CHECK: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VF]], [[VF]]
; CHECK: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VG]], [[VG]]
; CHECK: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc [[VH]], [[VH]]
; CHECK-NOT: aesmc
}

define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
@@ -178,41 +160,23 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
  ret void

; CHECK-LABEL: aesda:
; CHECKFUSEALLPAIRS: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
; CHECKFUSEALLPAIRS: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
; CHECKFUSEALLPAIRS: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
; CHECKFUSEALLPAIRS: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
; CHECKFUSEALLPAIRS: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
; CHECKFUSEALLPAIRS: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
; CHECKFUSEALLPAIRS: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECKFUSEALLPAIRS: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKFUSEALLPAIRS-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKFUSEALLPAIRS-NOT: aesimc

; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}
; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
; CHECKM1: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECK: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VA]], [[VA]]
; CHECK: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VB]], [[VB]]
; CHECK: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VC]], [[VC]]
; CHECK: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VD]], [[VD]]
; CHECK: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VE]], [[VE]]
; CHECK: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VF]], [[VF]]
; CHECK: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VG]], [[VG]]
; CHECK: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesimc [[VH]], [[VH]]
; CHECK-NOT: aesimc
}

define void @aes_load_store(<16 x i8> *%p1, <16 x i8> *%p2 , <16 x i8> *%p3) {
@@ -225,20 +189,20 @@ entry:
  %in1 = load <16 x i8>, <16 x i8>* %p1, align 16
  store <16 x i8> %in1, <16 x i8>* %x1, align 16
  %aese1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in1) #2
  store <16 x i8> %aese1, <16 x i8>* %x2, align 16
  %in2 = load <16 x i8>, <16 x i8>* %p2, align 16
  %aesmc1= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese1) #2
  store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16
  %aese2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in2) #2
  store <16 x i8> %aese2, <16 x i8>* %x4, align 16
  store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16
  %in3 = load <16 x i8>, <16 x i8>* %p3, align 16
  %aesmc2= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese2) #2
  store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16
  %aese3 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %aesmc2, <16 x i8> %in3) #2
  store <16 x i8> %aese3, <16 x i8>* %x5, align 16
  ret void

; CHECK-LABEL: aes_load_store:
; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
; CHECK-NEXT: aesmc [[VA]], [[VA]]
; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
; CHECK-NEXT: aesmc [[VB]], [[VB]]
; CHECK-NOT: aesmc
}
@@ -100,3 +100,14 @@ define i32 @ld_and32_narrower(i64 %p) {
  %load = load i32, i32* %cast
  ret i32 %load
}

; BOTH-LABEL:ld_and8:
; BOTH: and x
define i32 @ld_and8(i64 %base, i8 %off) {
  %off_masked = and i8 %off, 63
  %off_64 = zext i8 %off_masked to i64
  %p = add i64 %base, %off_64
  %cast = inttoptr i64 %p to i32*
  %load = load i32, i32* %cast
  ret i32 %load
}
@@ -2,14 +2,14 @@

define void @pass_va(i32 %count, ...) nounwind {
entry:
; CHECK: sub sp, sp, #80
; CHECK: str x30, [sp, #-80]!
; CHECK: add x8, sp, #24
; CHECK: add x0, sp, #24
; CHECK: stp x6, x7, [sp, #64]
; CHECK: stp x4, x5, [sp, #48]
; CHECK: stp x2, x3, [sp, #32]
; CHECK: str x1, [sp, #24]
; CHECK: stp x30, x8, [sp]
; CHECK: str x8, [sp, #8]
; CHECK: bl other_func
; CHECK: ldr x30, [sp], #80
; CHECK: ret
@@ -102,6 +102,113 @@ declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
declare i32 @__stdio_common_vsprintf(i64, i8*, i64, i8*, i8*, i8*) local_unnamed_addr #3
declare i64* @__local_stdio_printf_options() local_unnamed_addr #4

; CHECK-LABEL: fp
; CHECK: str x21, [sp, #-96]!
; CHECK: stp x20, x19, [sp, #16]
; CHECK: stp x29, x30, [sp, #32]
; CHECK: add x29, sp, #32
; CHECK: add x8, x29, #24
; CHECK: mov x19, x2
; CHECK: mov x20, x1
; CHECK: mov x21, x0
; CHECK: stp x6, x7, [x29, #48]
; CHECK: stp x4, x5, [x29, #32]
; CHECK: str x3, [x29, #24]
; CHECK: str x8, [sp, #8]
; CHECK: bl __local_stdio_printf_options
; CHECK: ldr x8, [x0]
; CHECK: add x5, x29, #24
; CHECK: mov x1, x21
; CHECK: mov x2, x20
; CHECK: orr x0, x8, #0x2
; CHECK: mov x3, x19
; CHECK: mov x4, xzr
; CHECK: bl __stdio_common_vsprintf
; CHECK: ldp x29, x30, [sp, #32]
; CHECK: ldp x20, x19, [sp, #16]
; CHECK: cmp w0, #0
; CHECK: csinv w0, w0, wzr, ge
; CHECK: ldr x21, [sp], #96
; CHECK: ret
define i32 @fp(i8*, i64, i8*, ...) local_unnamed_addr #6 {
  %4 = alloca i8*, align 8
  %5 = bitcast i8** %4 to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %5) #2
  call void @llvm.va_start(i8* nonnull %5)
  %6 = load i8*, i8** %4, align 8
  %7 = call i64* @__local_stdio_printf_options() #2
  %8 = load i64, i64* %7, align 8
  %9 = or i64 %8, 2
  %10 = call i32 @__stdio_common_vsprintf(i64 %9, i8* %0, i64 %1, i8* %2, i8* null, i8* %6) #2
  %11 = icmp sgt i32 %10, -1
  %12 = select i1 %11, i32 %10, i32 -1
  call void @llvm.va_end(i8* nonnull %5)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %5) #2
  ret i32 %12
}

attributes #6 = { "no-frame-pointer-elim"="true" }

; CHECK-LABEL: vla
; CHECK: str x23, [sp, #-112]!
; CHECK: stp x22, x21, [sp, #16]
; CHECK: stp x20, x19, [sp, #32]
; CHECK: stp x29, x30, [sp, #48]
; CHECK: add x29, sp, #48
; CHECK: add x8, x29, #16
; CHECK: stur x8, [x29, #-40]
; CHECK: mov w8, w0
; CHECK: add x8, x8, #15
; CHECK: mov x9, sp
; CHECK: and x8, x8, #0x1fffffff0
; CHECK: sub x20, x9, x8
; CHECK: mov x19, x1
; CHECK: mov x23, sp
; CHECK: stp x6, x7, [x29, #48]
; CHECK: stp x4, x5, [x29, #32]
; CHECK: stp x2, x3, [x29, #16]
; CHECK: mov sp, x20
; CHECK: ldur x21, [x29, #-40]
; CHECK: sxtw x22, w0
; CHECK: bl __local_stdio_printf_options
; CHECK: ldr x8, [x0]
; CHECK: mov x1, x20
; CHECK: mov x2, x22
; CHECK: mov x3, x19
; CHECK: orr x0, x8, #0x2
; CHECK: mov x4, xzr
; CHECK: mov x5, x21
; CHECK: bl __stdio_common_vsprintf
; CHECK: mov sp, x23
; CHECK: sub sp, x29, #48
; CHECK: ldp x29, x30, [sp, #48]
; CHECK: ldp x20, x19, [sp, #32]
; CHECK: ldp x22, x21, [sp, #16]
; CHECK: ldr x23, [sp], #112
; CHECK: ret
define void @vla(i32, i8*, ...) local_unnamed_addr {
  %3 = alloca i8*, align 8
  %4 = bitcast i8** %3 to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %4) #5
  call void @llvm.va_start(i8* nonnull %4)
  %5 = zext i32 %0 to i64
  %6 = call i8* @llvm.stacksave()
  %7 = alloca i8, i64 %5, align 1
  %8 = load i8*, i8** %3, align 8
  %9 = sext i32 %0 to i64
  %10 = call i64* @__local_stdio_printf_options()
  %11 = load i64, i64* %10, align 8
  %12 = or i64 %11, 2
  %13 = call i32 @__stdio_common_vsprintf(i64 %12, i8* nonnull %7, i64 %9, i8* %1, i8* null, i8* %8)
  call void @llvm.va_end(i8* nonnull %4)
  call void @llvm.stackrestore(i8* %6)
  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %4) #5
  ret void
}

declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)

; CHECK-LABEL: snprintf
; CHECK: sub sp, sp, #96
; CHECK: stp x21, x20, [sp, #16]
test/CodeGen/Mips/cconv/pr33883.ll (new file, 12 lines)
@@ -0,0 +1,12 @@
; RUN: llc -march=mips -mcpu=mips32 < %s -o /dev/null

; Test that calls to vector intrinsics do not crash SelectionDAGBuilder.

define <4 x float> @_ZN4simd3foo17hebb969c5fb39a194E(<4 x float>) {
start:
  %1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)

  ret <4 x float> %1
}

declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
test/CodeGen/Mips/pr33978.ll (new file, 20 lines)
@@ -0,0 +1,20 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s -o /dev/null

; Test that SelectionDAG does not crash during DAGCombine when two pointers
; to the stack match with differing bases and offsets when expanding memcpy.
; This could result in one of the pointers being considered dereferenceable
; and other not.

define void @foo(i8*) {
start:
  %a = alloca [22 x i8]
  %b = alloca [22 x i8]
  %c = bitcast [22 x i8]* %a to i8*
  %d = getelementptr inbounds [22 x i8], [22 x i8]* %b, i32 0, i32 2
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %c, i8* %d, i32 20, i32 1, i1 false)
  %e = getelementptr inbounds [22 x i8], [22 x i8]* %b, i32 0, i32 6
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %e, i32 12, i32 1, i1 false)
  ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
139
test/CodeGen/X86/conditional-tailcall-samedest.mir
Normal file
139
test/CodeGen/X86/conditional-tailcall-samedest.mir
Normal file
@ -0,0 +1,139 @@
|
||||
# RUN: llc -run-pass=branch-folder %s -o - | FileCheck %s
|
||||
|
||||
# PR33980
|
||||
|
||||
# Don't form conditional tail calls when the original conditional branch has
|
||||
# the same true and false destination. Otherwise, when we remove the tail call
|
||||
# successor we will also remove the fallthrough successor from the CFG.
|
||||
|
||||
# CHECK: body: |
|
||||
# CHECK: bb.0.entry:
|
||||
# CHECK: successors: %bb.1.sw.bb(0x40000000)
|
||||
# CHECK: liveins: %edi
|
||||
# CHECK: CMP32ri8 killed %edi, 2, implicit-def %eflags
|
||||
# CHECK: TCRETURNdi64cc @mergeable_conditional_tailcall
|
||||
|
||||
# This was the unconditional branch to a dead MBB that we left behind before
|
||||
# this bug was fixed.
|
||||
# CHECK-NOT: JMP_1 %bb.-1

--- |
  ; ModuleID = 't.ll'
  source_filename = "t.ll"
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64--linux"

  @static_local_guard = external global i64, align 8

  ; Function Attrs: optsize
  define void @f(i32 %arg) #0 {
  entry:
    switch i32 %arg, label %sw.epilog [
      i32 0, label %sw.bb
      i32 1, label %sw.bb
      i32 2, label %sw.bb2
    ]

  sw.bb:                                            ; preds = %entry, %entry
    %tmp = load atomic i8, i8* bitcast (i64* @static_local_guard to i8*) acquire, align 8
    %guard.uninitialized.i = icmp eq i8 %tmp, 0
    br i1 %guard.uninitialized.i, label %init.check.i, label %return, !prof !0

  init.check.i:                                     ; preds = %sw.bb
    tail call void @initialize_static_local(i64* nonnull @static_local_guard)
    ret void

  sw.bb2:                                           ; preds = %entry
    tail call void @mergeable_conditional_tailcall()
    ret void

  sw.epilog:                                        ; preds = %entry
    tail call void @mergeable_conditional_tailcall()
    ret void

  return:                                           ; preds = %sw.bb
    ret void
  }

  declare void @mergeable_conditional_tailcall()

  declare void @initialize_static_local(i64*)

  ; Function Attrs: nounwind
  declare void @llvm.stackprotector(i8*, i8**) #1

  attributes #0 = { optsize }
  attributes #1 = { nounwind }

  !0 = !{!"branch_weights", i32 1, i32 1048575}

...
---
name:            f
alignment:       0
exposesReturnsTwice: false
legalized:       false
regBankSelected: false
selected:        false
tracksRegLiveness: true
registers:
liveins:
  - { reg: '%edi', virtual-reg: '' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap:     false
  hasPatchPoint:   false
  stackSize:       0
  offsetAdjustment: 0
  maxAlignment:    0
  adjustsStack:    false
  hasCalls:        false
  stackProtector:  ''
  maxCallFrameSize: 0
  hasOpaqueSPAdjustment: false
  hasVAStart:      false
  hasMustTailInVarArgFunc: false
  savePoint:       ''
  restorePoint:    ''
fixedStack:
stack:
constants:
body:             |
  bb.0.entry:
    successors: %bb.2.sw.bb(0x40000000), %bb.1.entry(0x40000000)
    liveins: %edi

    CMP32ri8 killed %edi, 2, implicit-def %eflags
    JB_1 %bb.2.sw.bb, implicit %eflags
    JMP_1 %bb.1.entry

  bb.1.entry:
    successors: %bb.4.sw.bb2(0x40000000), %bb.5.sw.epilog(0x40000000)
    liveins: %eflags

    JE_1 %bb.4.sw.bb2, implicit killed %eflags
    JMP_1 %bb.5.sw.epilog

  bb.2.sw.bb:
    successors: %bb.3.init.check.i(0x00000800), %bb.6.return(0x7ffff800)

    %al = ACQUIRE_MOV8rm %rip, 1, _, @static_local_guard, _ :: (volatile load acquire 1 from `i8* bitcast (i64* @static_local_guard to i8*)`, align 8)
    TEST8rr killed %al, %al, implicit-def %eflags
    JNE_1 %bb.6.return, implicit killed %eflags
    JMP_1 %bb.3.init.check.i

  bb.3.init.check.i:
    dead %edi = MOV32ri64 @static_local_guard, implicit-def %rdi
    TCRETURNdi64 @initialize_static_local, 0, csr_64, implicit %rsp, implicit %rdi

  bb.4.sw.bb2:
    TCRETURNdi64 @mergeable_conditional_tailcall, 0, csr_64, implicit %rsp

  bb.5.sw.epilog:
    TCRETURNdi64 @mergeable_conditional_tailcall, 0, csr_64, implicit %rsp

  bb.6.return:
    RET 0

...
15 test/CodeGen/X86/pause.ll Normal file
@ -0,0 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-sse -show-mc-encoding | FileCheck %s
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s

define void @test_x86_sse2_pause() {
; CHECK-LABEL: test_x86_sse2_pause:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pause ## encoding: [0xf3,0x90]
; CHECK-NEXT:    retl ## encoding: [0xc3]
  tail call void @llvm.x86.sse2.pause()
  ret void
}
declare void @llvm.x86.sse2.pause() nounwind
42 test/CodeGen/X86/tail-call-mutable-memarg.ll Normal file
@ -0,0 +1,42 @@
; RUN: llc < %s | FileCheck %s

; Make sure we check that forwarded memory arguments are not modified when tail
; calling. inalloca and copy arg elimination make argument slots mutable.
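
; A hedged sketch of the codegen pattern the checks below pin down (register
; name illustrative): the forwarded value must be moved out of the mutable
; argument slot before the slot is overwritten, i.e.
;   movl 4(%esp), %ecx   ; read the inalloca slot first
;   movl $0, 4(%esp)     ; the store that mutates the slot
;   pushl %ecx           ; pass the saved value to the tail callee
; rather than pushing the stale memory operand directly.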

target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i386-pc-windows-msvc19.0.24215"

declare x86_stdcallcc void @tail_std(i32)
declare void @capture(i32*)

define x86_thiscallcc void @inalloca(i32* %this, i32* inalloca %args) {
entry:
  %val = load i32, i32* %args
  store i32 0, i32* %args
  tail call x86_stdcallcc void @tail_std(i32 %val)
  ret void
}

; CHECK-LABEL: _inalloca: # @inalloca
; CHECK: movl 4(%esp), %[[reg:[^ ]*]]
; CHECK: movl $0, 4(%esp)
; CHECK: pushl %[[reg]]
; CHECK: calll _tail_std@4
; CHECK: retl $4

define x86_stdcallcc void @copy_elide(i32 %arg) {
entry:
  %arg.ptr = alloca i32
  store i32 %arg, i32* %arg.ptr
  call void @capture(i32* %arg.ptr)
  tail call x86_stdcallcc void @tail_std(i32 %arg)
  ret void
}

; CHECK-LABEL: _copy_elide@4: # @copy_elide
; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
; CHECK: pushl %[[reg]]
; CHECK: calll _capture
; ...
; CHECK: calll _tail_std@4
; CHECK: retl $4
@ -1,19 +1,11 @@
; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a
; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s
; RUN: llvm-nm %t.a | FileCheck %s

LIBRARY test.dll
EXPORTS
TestFunction==AltTestFunction
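
; Editorial note (an inference from the llvm-nm checks below, not original
; content): the NAME==ALTNAME form directs llvm-dlltool to emit NAME as a weak
; alias for ALTNAME, hence TestFunction showing up as a weak ('w') symbol that
; resolves to the undefined ('U') AltTestFunction.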

; CHECK: File: test.dll
; CHECK: Format: COFF-x86-64
; CHECK: Arch: x86_64
; CHECK: AddressSize: 64bit
; CHECK: File: test.dll
; CHECK: Format: COFF-x86-64
; CHECK: Arch: x86_64
; CHECK: AddressSize: 64bit
; CHECK: File: test.dll
; CHECK: Format: COFF-x86-64
; CHECK: Arch: x86_64
; CHECK: AddressSize: 64bit
; CHECK: U AltTestFunction
; CHECK-NEXT: w TestFunction
; CHECK: U __imp_AltTestFunction
; CHECK-NEXT: w __imp_TestFunction
@ -0,0 +1,22 @@
; Test -asan-force-dynamic-shadow flag.
;
; RUN: opt -asan -asan-module -S -asan-force-dynamic-shadow=1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FDS
; RUN: opt -asan -asan-module -S -asan-force-dynamic-shadow=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NDS

target triple = "x86_64-unknown-linux-gnu"

define i32 @test_load(i32* %a) sanitize_address {
; First instrumentation in the function must be to load the dynamic shadow
; address into a local variable.
; CHECK-LABEL: @test_load
; CHECK: entry:
; CHECK-FDS-NEXT: %[[SHADOW:[^ ]*]] = load i64, i64* @__asan_shadow_memory_dynamic_address
; CHECK-NDS-NOT: __asan_shadow_memory_dynamic_address

; Shadow address is loaded and added into the whole offset computation.
; CHECK-FDS: add i64 %{{.*}}, %[[SHADOW]]
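
; A minimal sketch of the computation being matched, assuming the standard
; x86_64 ASan mapping shadow = (addr >> 3) + base (illustrative IR, not
; literal test content):
;   %base = load i64, i64* @__asan_shadow_memory_dynamic_address
;   %addr = ptrtoint i32* %a to i64
;   %off  = lshr i64 %addr, 3
;   %shad = add i64 %off, %base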

entry:
  %tmp1 = load i32, i32* %a, align 4
  ret i32 %tmp1
}
@ -1,5 +1,7 @@
; This check verifies that arguments passed by value get redzones.
; RUN: opt < %s -asan -asan-realign-stack=32 -S | FileCheck %s
; RUN: opt < %s -asan -asan-realign-stack=32 -asan-force-dynamic-shadow -S | FileCheck %s --check-prefixes=CHECK-FDS


target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
@ -8,6 +10,8 @@ target triple = "x86_64-unknown-linux-gnu"

declare i32 @bar(%struct.A*)

; CHECK-FDS-NOT: {{\.byval}}

; Test behavior for named argument with explicit alignment. The memcpy and
; alloca alignments should match the explicit alignment of 64.
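
; A hedged sketch of the IR shape those checks imply (names illustrative):
;   %a.copy = alloca %struct.A, align 64
;   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 64, i1 false)
; i.e. both the private copy and the copying memcpy carry the explicit byval
; alignment of 64.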
define void @foo(%struct.A* byval align 64 %a) sanitize_address {
@ -1,4 +1,5 @@
; RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto -show-encoding -output-asm-variant=1 < %s | FileCheck %s
; RUN: llvm-mc -triple arm64-apple-darwin -mattr='+crypto,+fuse-aes' -show-encoding -output-asm-variant=1 < %s | FileCheck %s

foo:
  aese.16b v0, v1
@ -6,24 +6,45 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
%struct.ss = type { i32, i64 }

define internal void @f(%struct.ss* byval %b) nounwind {
; CHECK-LABEL: define internal void @f(i32 %b.0, i64 %b.1)
entry:
  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0 ; <i32*> [#uses=2]
  %tmp1 = load i32, i32* %tmp, align 4 ; <i32> [#uses=1]
  %tmp2 = add i32 %tmp1, 1 ; <i32> [#uses=1]
  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
  %tmp1 = load i32, i32* %tmp, align 4
  %tmp2 = add i32 %tmp1, 1
  store i32 %tmp2, i32* %tmp, align 4
  ret void
}

define i32 @main() nounwind {
; CHECK-LABEL: define i32 @main
; CHECK-LABEL: define internal void @f(i32 %b.0, i64 %b.1)
; CHECK: alloca %struct.ss{{$}}
; CHECK: store i32 %b.0
; CHECK: store i64 %b.1

define internal void @g(%struct.ss* byval align 32 %b) nounwind {
entry:
  %S = alloca %struct.ss ; <%struct.ss*> [#uses=4]
  %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0 ; <i32*> [#uses=1]
  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
  %tmp1 = load i32, i32* %tmp, align 4
  %tmp2 = add i32 %tmp1, 1
  store i32 %tmp2, i32* %tmp, align 4
  ret void
}

; CHECK-LABEL: define internal void @g(i32 %b.0, i64 %b.1)
; CHECK: alloca %struct.ss, align 32
; CHECK: store i32 %b.0
; CHECK: store i64 %b.1

define i32 @main() nounwind {
entry:
  %S = alloca %struct.ss
  %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0
  store i32 1, i32* %tmp1, align 8
  %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1 ; <i64*> [#uses=1]
  %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1
  store i64 2, i64* %tmp4, align 4
  call void @f( %struct.ss* byval %S ) nounwind
; CHECK: call void @f(i32 %{{.*}}, i64 %{{.*}})
  call void @f(%struct.ss* byval %S) nounwind
  call void @g(%struct.ss* byval %S) nounwind
  ret i32 0
}

; CHECK-LABEL: define i32 @main
; CHECK: call void @f(i32 %{{.*}}, i64 %{{.*}})
; CHECK: call void @g(i32 %{{.*}}, i64 %{{.*}})
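
; Editorial summary (restating the checks above, not original content):
; argument promotion rewrites the byval struct parameter into scalar fields,
; so a call like
;   call void @f(%struct.ss* byval %S)
; becomes
;   call void @f(i32 %{{.*}}, i64 %{{.*}})
; while each callee rebuilds the struct in a local alloca that keeps the
; original byval alignment (align 32 for @g).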

29 test/Transforms/InstSimplify/pr33957.ll Normal file
@ -0,0 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-unroll -S %s | FileCheck %s

%struct.bar = type { i32 }

@global = external constant [78 x %struct.bar], align 4

define void @patatino(i32 %x) {
; CHECK-LABEL: @patatino(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    br i1 true, label [[BB1_PREHEADER:%.*]], label [[BB3:%.*]]
; CHECK:       bb1.preheader:
; CHECK-NEXT:    br label [[BB1:%.*]]
; CHECK:       bb1:
; CHECK-NEXT:    br label [[BB3]]
; CHECK:       bb3:
; CHECK-NEXT:    ret void
;
bb:
  br i1 true, label %bb1, label %bb3

bb1:
  %tmp = getelementptr inbounds [78 x %struct.bar], [78 x %struct.bar]* @global, i32 0, <4 x i32> undef
  %tmp2 = getelementptr inbounds %struct.bar, <4 x %struct.bar*> %tmp, i32 1
  br i1 true, label %bb3, label %bb1

bb3:
  ret void
}
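
; Editorial note (a hedged reading of the autogenerated checks, not original
; content): the vector GEPs in %bb1 are dead and the 'br i1 true' folds, so
; the test's payload is that the pipeline processes the <4 x i32> GEP operands
; without issue.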

11 test/Transforms/SCCP/definite-initializer.ll Normal file
@ -0,0 +1,11 @@
; RUN: opt -S -ipsccp < %s | FileCheck %s
@d = internal externally_initialized global i32 0, section ".openbsd.randomdata", align 4
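
; Editorial note (an inference from the checks below, not original content):
; externally_initialized means the visible initializer of @d may be replaced
; before execution, so IPSCCP must keep the load rather than folding it to 0.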

; CHECK-LABEL: @test1(
define i32 @test1() {
entry:
  %load = load i32, i32* @d, align 4
  ret i32 %load
; CHECK: %[[load:.*]] = load i32, i32* @d, align 4
; CHECK: ret i32 %[[load]]
}
@ -507,6 +507,19 @@ class CloneModule : public ::testing::Test {
      DINode::FlagZero, false);
  F->setSubprogram(Subprogram);

  // Create and assign DIGlobalVariableExpression to gv
  auto GVExpression = DBuilder.createGlobalVariableExpression(
      Subprogram, "gv", "gv", File, 1, DBuilder.createNullPtrType(), false);
  GV->addDebugInfo(GVExpression);

  // DIGlobalVariableExpression not attached to any global variable
  auto Expr = DBuilder.createExpression(
      ArrayRef<uint64_t>{dwarf::DW_OP_constu, 42U, dwarf::DW_OP_stack_value});

  DBuilder.createGlobalVariableExpression(
      Subprogram, "unattached", "unattached", File, 1,
      DBuilder.createNullPtrType(), false, Expr);

  auto *Entry = BasicBlock::Create(C, "", F);
  IBuilder.SetInsertPoint(Entry);
  IBuilder.CreateRetVoid();
@ -546,6 +559,52 @@ TEST_F(CloneModule, GlobalMetadata) {
  EXPECT_NE(nullptr, NewGV->getMetadata(LLVMContext::MD_type));
}

TEST_F(CloneModule, GlobalDebugInfo) {
  GlobalVariable *NewGV = NewM->getGlobalVariable("gv");
  EXPECT_TRUE(NewGV != nullptr);

  // Find debug info expression assigned to global
  SmallVector<DIGlobalVariableExpression *, 1> GVs;
  NewGV->getDebugInfo(GVs);
  EXPECT_EQ(GVs.size(), 1U);

  DIGlobalVariableExpression *GVExpr = GVs[0];
  DIGlobalVariable *GV = GVExpr->getVariable();
  EXPECT_TRUE(GV != nullptr);

  EXPECT_EQ(GV->getName(), "gv");
  EXPECT_EQ(GV->getLine(), 1U);

  // Assert that the scope of the debug info attached to
  // global variable matches the cloned function.
  DISubprogram *SP = NewM->getFunction("f")->getSubprogram();
  EXPECT_TRUE(SP != nullptr);
  EXPECT_EQ(GV->getScope(), SP);
}

TEST_F(CloneModule, CompileUnit) {
  // Find DICompileUnit listed in llvm.dbg.cu
  auto *NMD = NewM->getNamedMetadata("llvm.dbg.cu");
  EXPECT_TRUE(NMD != nullptr);
  EXPECT_EQ(NMD->getNumOperands(), 1U);

  DICompileUnit *CU = dyn_cast<llvm::DICompileUnit>(NMD->getOperand(0));
  EXPECT_TRUE(CU != nullptr);

  // Assert this CU is consistent with the cloned function debug info
  DISubprogram *SP = NewM->getFunction("f")->getSubprogram();
  EXPECT_TRUE(SP != nullptr);
  EXPECT_EQ(SP->getUnit(), CU);

  // Check globals listed in CU have the correct scope
  DIGlobalVariableExpressionArray GlobalArray = CU->getGlobalVariables();
  EXPECT_EQ(GlobalArray.size(), 2U);
  for (DIGlobalVariableExpression *GVExpr : GlobalArray) {
    DIGlobalVariable *GV = GVExpr->getVariable();
    EXPECT_EQ(GV->getScope(), SP);
  }
}

TEST_F(CloneModule, Comdat) {
  GlobalVariable *NewGV = NewM->getGlobalVariable("gv");
  auto *CD = NewGV->getComdat();