From c60b95818e4f6c00c872114318d01109f97a7fa3 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 17 Feb 2017 19:35:08 +0000 Subject: [PATCH 1/4] Vendor import of llvm release_40 branch r295380: https://llvm.org/svn/llvm-project/llvm/branches/release_40@295380 --- docs/Extensions.rst | 2 +- docs/LangRef.rst | 2 +- docs/ReleaseNotes.rst | 49 ++++ .../llvm/LTO/legacy/ThinLTOCodeGenerator.h | 19 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +- lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 31 ++- .../SelectionDAG/SelectionDAGBuilder.cpp | 9 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 20 +- lib/LTO/ThinLTOCodeGenerator.cpp | 31 ++- .../AArch64/AArch64CallingConvention.td | 8 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +- lib/Target/ARM/ARMCallingConv.td | 28 +- lib/Target/ARM/ARMISelLowering.cpp | 3 +- .../Instrumentation/AddressSanitizer.cpp | 15 +- .../Instrumentation/ThreadSanitizer.cpp | 7 + lib/Transforms/Utils/LoopUnroll.cpp | 11 +- lib/Transforms/Utils/LoopUnrollRuntime.cpp | 15 +- test/CodeGen/AArch64/ldst-opt.ll | 99 +++++-- test/CodeGen/AArch64/swifterror.ll | 182 +++++++------ test/CodeGen/AArch64/swiftself.ll | 18 ++ test/CodeGen/ARM/swifterror.ll | 169 ++++++------ test/CodeGen/ARM/swiftself.ll | 17 ++ test/CodeGen/X86/dag-update-nodetomatch.ll | 241 ++++++++++++++++++ test/CodeGen/X86/pr31956.ll | 25 ++ test/CodeGen/X86/swifterror.ll | 15 ++ .../Instrumentation/AddressSanitizer/basic.ll | 26 ++ .../ThreadSanitizer/tsan_basic.ll | 24 ++ test/Transforms/LoopUnroll/runtime-li.ll | 36 +++ 28 files changed, 863 insertions(+), 250 deletions(-) create mode 100644 test/CodeGen/X86/dag-update-nodetomatch.ll create mode 100644 test/CodeGen/X86/pr31956.ll create mode 100644 test/Transforms/LoopUnroll/runtime-li.ll diff --git a/docs/Extensions.rst b/docs/Extensions.rst index 850c42750911..2b12123cdf68 100644 --- a/docs/Extensions.rst +++ b/docs/Extensions.rst @@ -61,7 +61,7 @@ types ``IMAGE_REL_I386_SECREL`` (32-bit) or ``IMAGE_REL_AMD64_SECREL`` (64-bit). the target. It corresponds to the COFF relocation types ``IMAGE_REL_I386_SECTION`` (32-bit) or ``IMAGE_REL_AMD64_SECTION`` (64-bit). -.. code-block:: gas +.. code-block:: none .section .debug$S,"rn" .long 4 diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 5ac17015953e..e93a02f6b023 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -3997,7 +3997,7 @@ DIFile ``DIFile`` nodes represent files. The ``filename:`` can include slashes. -.. code-block:: llvm +.. code-block:: none !0 = !DIFile(filename: "path/to/file", directory: "/path/to/dir", checksumkind: CSK_MD5, diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index aef11daf194b..da86be3f96ff 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -61,6 +61,9 @@ Non-comprehensive list of changes in this release with LLVM option -adce-remove-loops when the loop body otherwise has no live operations. + * The llvm-cov tool can now export coverage data as json. Its html output mode + has also improved. + * ... next change ... .. NOTE @@ -81,6 +84,37 @@ Non-comprehensive list of changes in this release * Significant build-time and binary-size improvements when compiling with debug info (-g). +LLVM Coroutines +--------------- + +Experimental support for :doc:`Coroutines` was added, which can be enabled +with ``-enable-coroutines`` in ``opt`` command tool or using +``addCoroutinePassesToExtensionPoints`` API when building the optimization +pipeline. 
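+
+A minimal sketch of the second route (this assumes the legacy pass manager;
+the surrounding setup is illustrative, not part of the coroutine API):
+
+.. code-block:: c++
+
+  #include "llvm/IR/LegacyPassManager.h"
+  #include "llvm/Transforms/Coroutines.h"
+  #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+
+  void buildPipelineWithCoroutines(llvm::legacy::PassManager &PM) {
+    llvm::PassManagerBuilder PMB;
+    // Register the coroutine lowering passes at the builder's extension
+    // points before the rest of the pipeline is populated.
+    llvm::addCoroutinePassesToExtensionPoints(PMB);
+    PMB.populateModulePassManager(PM);
+  }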
+
+For more information on LLVM Coroutines and the LLVM implementation, see
+`2016 LLVM Developers’ Meeting talk on LLVM Coroutines `_.
+
+Regcall and Vectorcall Calling Conventions
+--------------------------------------------------
+
+Support was added for the __regcall calling convention.
+Existing __vectorcall calling convention support was extended to include
+correct handling of HVAs.
+
+The __vectorcall calling convention was introduced by Microsoft to
+enhance register usage when passing parameters.
+For more information please read `__vectorcall documentation `_.
+
+The __regcall calling convention was introduced by Intel to
+optimize parameter transfer on function call.
+This calling convention ensures that as many values as possible are
+passed or returned in registers.
+For more information please read `__regcall documentation `_.
+
 Code Generation Testing
 -----------------------
@@ -258,6 +292,21 @@ External Open Source Projects Using LLVM 4.0.0
 
 * A project...
 
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D `_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC `_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.
+
 Additional Information
 ======================
diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index cb4a16cb5b7b..0cc3b26e9659 100644
--- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -31,6 +31,23 @@ class StringRef;
 class LLVMContext;
 class TargetMachine;
 
+/// Wrapper around MemoryBufferRef, owning the identifier
+class ThinLTOBuffer {
+  std::string OwnedIdentifier;
+  StringRef Buffer;
+
+public:
+  ThinLTOBuffer(StringRef Buffer, StringRef Identifier)
+      : OwnedIdentifier(Identifier), Buffer(Buffer) {}
+
+  MemoryBufferRef getMemBuffer() const {
+    return MemoryBufferRef(Buffer,
+                           {OwnedIdentifier.c_str(), OwnedIdentifier.size()});
+  }
+  StringRef getBuffer() const { return Buffer; }
+  StringRef getBufferIdentifier() const { return OwnedIdentifier; }
+};
+
 /// Helper to gather options relevant to the target machine creation
 struct TargetMachineBuilder {
   Triple TheTriple;
@@ -280,7 +297,7 @@ private:
 
   /// Vector holding the input buffers containing the bitcode modules to
   /// process.
-  std::vector<MemoryBufferRef> Modules;
+  std::vector<ThinLTOBuffer> Modules;
 
   /// Set of symbols that need to be preserved outside of the set of bitcode
   /// files.
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fd156fa7dd07..2c7bffe76503 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13072,9 +13072,15 @@ SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N,
         !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
       return SDValue();
 
-    if (InVT1 != InVT2)
+    // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
+    // lower it back into a BUILD_VECTOR. So if the inserted type is
+    // illegal, don't even try.
+    if (InVT1 != InVT2) {
+      if (!TLI.isTypeLegal(InVT2))
+        return SDValue();
       VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
                            DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+    }
     ShuffleNumElems = NumElems * 2;
   } else {
     // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 3b91e58879b4..4a9042cfb3f4 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -502,8 +502,17 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
   const TargetRegisterClass *TRC =
     TLI->getRegClassFor(Node->getSimpleValueType(0));
 
-  unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
-  MachineInstr *DefMI = MRI->getVRegDef(VReg);
+  unsigned Reg;
+  MachineInstr *DefMI;
+  RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(0));
+  if (R && TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+    Reg = R->getReg();
+    DefMI = nullptr;
+  } else {
+    Reg = getVR(Node->getOperand(0), VRBaseMap);
+    DefMI = MRI->getVRegDef(Reg);
+  }
+
   unsigned SrcReg, DstReg, DefSubIdx;
   if (DefMI &&
       TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
@@ -519,20 +528,26 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
                TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
       MRI->clearKillFlags(SrcReg);
     } else {
-      // VReg may not support a SubIdx sub-register, and we may need to
+      // Reg may not support a SubIdx sub-register, and we may need to
       // constrain its register class or issue a COPY to a compatible register
       // class.
-      VReg = ConstrainForSubReg(VReg, SubIdx,
-                                Node->getOperand(0).getSimpleValueType(),
-                                Node->getDebugLoc());
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        Reg = ConstrainForSubReg(Reg, SubIdx,
+                                 Node->getOperand(0).getSimpleValueType(),
+                                 Node->getDebugLoc());
 
       // Create the destreg if it is missing.
       if (VRBase == 0)
         VRBase = MRI->createVirtualRegister(TRC);
 
       // Create the extract_subreg machine instruction.
-      BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
-              TII->get(TargetOpcode::COPY), VRBase).addReg(VReg, 0, SubIdx);
+      MachineInstrBuilder CopyMI =
+          BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), VRBase);
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        CopyMI.addReg(Reg, 0, SubIdx);
+      else
+        CopyMI.addReg(TRI->getSubReg(Reg, SubIdx));
     }
   } else if (Opc == TargetOpcode::INSERT_SUBREG ||
              Opc == TargetOpcode::SUBREG_TO_REG) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9ca646534e2b..996c95bd5f07 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5832,6 +5832,15 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
   const Value *SwiftErrorVal = nullptr;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // We can't tail call inside a function with a swifterror argument. Lowering
+  // does not support this yet. It would have to move into the swifterror
+  // register before the call.
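+  // For example (a sketch mirroring the tailcall_from_swifterror test added
+  // to test/CodeGen/AArch64/swifterror.ll in this change):
+  //   define swiftcc void @f(%swift_error** swifterror %err) {
+  //     tail call void @g(i8* null)   ; 'tail' is dropped here
+  //     ret void
+  //   }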
+  auto *Caller = CS.getInstruction()->getParent()->getParent();
+  if (TLI.supportSwiftError() &&
+      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    isTailCall = false;
+
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
     const Value *V = *i;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 004fa703c192..64e6c221229b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2782,14 +2782,15 @@ struct MatchScope {
 /// for this.
 class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
 {
-  SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes;
-  SmallVectorImpl<MatchScope> &MatchScopes;
+  SDNode **NodeToMatch;
+  SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
+  SmallVectorImpl<MatchScope> &MatchScopes;
 public:
-  MatchStateUpdater(SelectionDAG &DAG,
-                    SmallVectorImpl<std::pair<SDValue, SDNode*> > &RN,
-                    SmallVectorImpl<MatchScope> &MS) :
-    SelectionDAG::DAGUpdateListener(DAG),
-    RecordedNodes(RN), MatchScopes(MS) { }
+  MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
+                    SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
+                    SmallVectorImpl<MatchScope> &MS)
+      : SelectionDAG::DAGUpdateListener(DAG), NodeToMatch(NodeToMatch),
+        RecordedNodes(RN), MatchScopes(MS) {}
 
   void NodeDeleted(SDNode *N, SDNode *E) override {
     // Some early-returns here to avoid the search if we deleted the node or
@@ -2799,6 +2800,9 @@
     // update listener during matching complex patterns.
     if (!E || E->isMachineOpcode())
       return;
+    // Check if NodeToMatch was updated.
+    if (N == *NodeToMatch)
+      *NodeToMatch = E;
     // Performing linear search here does not matter because we almost never
     // run this code.  You'd have to have a CSE during complex pattern
     // matching.
@@ -3091,7 +3095,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       // consistent.
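      // (NodeToMatch is handed to the updater by pointer so that, if a
      // complex-pattern check CSEs the node currently being matched, matching
      // resumes on its replacement; test/CodeGen/X86/dag-update-nodetomatch.ll,
      // added below, exercises this.)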
      std::unique_ptr<MatchStateUpdater> MSU;
      if (ComplexPatternFuncMutatesDAG())
-        MSU.reset(new MatchStateUpdater(*CurDAG, RecordedNodes,
+        MSU.reset(new MatchStateUpdater(*CurDAG, &NodeToMatch, RecordedNodes,
                                         MatchScopes));
 
       if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 104fb199da08..40537e4fa784 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -150,13 +150,13 @@ static void computePrevailingCopies(
 }
 
 static StringMap<MemoryBufferRef>
-generateModuleMap(const std::vector<MemoryBufferRef> &Modules) {
+generateModuleMap(const std::vector<ThinLTOBuffer> &Modules) {
   StringMap<MemoryBufferRef> ModuleMap;
   for (auto &ModuleBuffer : Modules) {
     assert(ModuleMap.find(ModuleBuffer.getBufferIdentifier()) ==
               ModuleMap.end() &&
            "Expect unique Buffer Identifier");
-    ModuleMap[ModuleBuffer.getBufferIdentifier()] = ModuleBuffer;
+    ModuleMap[ModuleBuffer.getBufferIdentifier()] = ModuleBuffer.getMemBuffer();
   }
   return ModuleMap;
 }
@@ -522,13 +522,13 @@ static void initTMBuilder(TargetMachineBuilder &TMBuilder,
 } // end anonymous namespace
 
 void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
-  MemoryBufferRef Buffer(Data, Identifier);
+  ThinLTOBuffer Buffer(Data, Identifier);
   if (Modules.empty()) {
     // First module added, so initialize the triple and some options
     LLVMContext Context;
     StringRef TripleStr;
-    ErrorOr<std::string> TripleOrErr =
-        expectedToErrorOrAndEmitErrors(Context, getBitcodeTargetTriple(Buffer));
+    ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+        Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
     if (TripleOrErr)
       TripleStr = *TripleOrErr;
     Triple TheTriple(TripleStr);
@@ -538,8 +538,8 @@ void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
   else {
     LLVMContext Context;
     StringRef TripleStr;
-    ErrorOr<std::string> TripleOrErr =
-        expectedToErrorOrAndEmitErrors(Context, getBitcodeTargetTriple(Buffer));
+    ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+        Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
     if (TripleOrErr)
       TripleStr = *TripleOrErr;
     assert(TMBuilder.TheTriple.str() == TripleStr &&
@@ -588,7 +588,8 @@ std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
   uint64_t NextModuleId = 0;
   for (auto &ModuleBuffer : Modules) {
     Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
-        object::ModuleSummaryIndexObjectFile::create(ModuleBuffer);
+        object::ModuleSummaryIndexObjectFile::create(
+            ModuleBuffer.getMemBuffer());
     if (!ObjOrErr) {
       // FIXME diagnose
       logAllUnhandledErrors(
@@ -852,8 +853,9 @@ void ThinLTOCodeGenerator::run() {
       Context.setDiscardValueNames(LTODiscardValueNames);
 
       // Parse module now
-      auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false,
-                                            /*IsImporting*/ false);
+      auto TheModule =
+          loadModuleFromBuffer(ModuleBuffer.getMemBuffer(), Context, false,
+                               /*IsImporting*/ false);
 
       // CodeGen
       auto OutputBuffer = codegen(*TheModule);
@@ -943,8 +945,8 @@ void ThinLTOCodeGenerator::run() {
   std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
   std::sort(ModulesOrdering.begin(), ModulesOrdering.end(),
             [&](int LeftIndex, int RightIndex) {
-              auto LSize = Modules[LeftIndex].getBufferSize();
-              auto RSize = Modules[RightIndex].getBufferSize();
+              auto LSize = Modules[LeftIndex].getBuffer().size();
+              auto RSize = Modules[RightIndex].getBuffer().size();
               return LSize > RSize;
             });
 
@@ -996,8 +998,9 @@ void ThinLTOCodeGenerator::run() {
       }
 
       // Parse module now
-      auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false,
-                                            /*IsImporting*/ false);
+      auto TheModule =
+          loadModuleFromBuffer(ModuleBuffer.getMemBuffer(), Context, false,
+                               /*IsImporting*/ false);
 
       // Save temps: original file.
       saveTempBitcode(*TheModule, SaveTempsDir, count, ".0.original.bc");
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 9058617768dd..938779d23690 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -91,7 +91,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2f64>>,
 
-  CCIfSwiftError<CCAssignToReg<[X19]>>,
+  CCIfSwiftError<CCAssignToReg<[X21]>>,
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
@@ -138,8 +138,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[X20]>>>,
 
-  // A SwiftError is passed in X19.
-  CCIfSwiftError<CCAssignToReg<[X19]>>,
+  // A SwiftError is passed in X21.
+  CCIfSwiftError<CCAssignToReg<[X21]>>,
 
   CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
@@ -289,7 +289,7 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
 def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
 
 def CSR_AArch64_AAPCS_SwiftError
-    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
 
 // The function used by Darwin to obtain the address of a thread-local variable
 // guarantees more than a normal AAPCS function. x16 and x17 are used on the
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2f67a105b4d1..849058bdfbdb 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3155,7 +3155,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
 
     if (VA.isRegLoc()) {
-      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+          Outs[0].VT == MVT::i64) {
         assert(VA.getLocVT() == MVT::i64 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 9c278a52a7ff..7a7b7fede7c8 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -26,8 +26,8 @@ def CC_ARM_APCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
@@ -51,8 +51,8 @@ def RetCC_ARM_APCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
@@ -166,8 +166,8 @@ def CC_ARM_AAPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -182,8 +182,8 @@ def RetCC_ARM_AAPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -206,8 +206,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   // HFAs are passed in a contiguous block of registers, or on the stack
   CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
@@ -227,8 +227,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCAssignToReg<[R6]>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCAssignToReg<[R8]>>,
 
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -267,8 +267,8 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
 // Also save R7-R4 first to match the stack frame fixed spill areas.
 def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
-// R6 is used to pass swifterror, remove it from CSR.
-def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+// R8 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
 
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
                                           (sub CSR_AAPCS_ThisReturn, R9))>;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 1606c1576465..97481d49ea34 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1787,7 +1787,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                          StackPtr, MemOpChains, Flags);
       }
     } else if (VA.isRegLoc()) {
-      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+          Outs[0].VT == MVT::i32) {
         assert(VA.getLocVT() == MVT::i32 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index ffd518e52968..f5e9e7dd5a93 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1013,7 +1013,9 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
       (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
       // inalloca allocas are not treated as static, and we don't want
       // dynamic alloca instrumentation for them as well.
-      !AI.isUsedWithInAlloca());
+      !AI.isUsedWithInAlloca() &&
+      // swifterror allocas are register promoted by ISel
+      !AI.isSwiftError());
 
   ProcessedAllocas[&AI] = IsInteresting;
   return IsInteresting;
@@ -1088,12 +1090,19 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
     }
   }
 
-  // Do not instrument acesses from different address spaces; we cannot deal
-  // with them.
   if (PtrOperand) {
+    // Do not instrument accesses from different address spaces; we cannot deal
+    // with them.
     Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
     if (PtrTy->getPointerAddressSpace() != 0)
       return nullptr;
+
+    // Ignore swifterror addresses.
+    // swifterror memory addresses are mem2reg promoted by instruction
+    // selection. As such they cannot have regular uses like an instrumentation
+    // function and it makes no sense to track them as memory.
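+    // (Sketch: a pointer rooted at an "alloca swifterror" slot never becomes
+    // real, addressable memory, so there is nothing for ASan to shadow.)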
+    if (PtrOperand->isSwiftError())
+      return nullptr;
   }
 
   // Treat memory accesses to promotable allocas as non-interesting since they
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index d9659694da46..52035c79a4a3 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -488,6 +488,13 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
   Value *Addr = IsWrite
       ? cast<StoreInst>(I)->getPointerOperand()
       : cast<LoadInst>(I)->getPointerOperand();
+
+  // swifterror memory addresses are mem2reg promoted by instruction selection.
+  // As such they cannot have regular uses like an instrumentation function and
+  // it makes no sense to track them as memory.
+  if (Addr->isSwiftError())
+    return false;
+
   int Idx = getMemoryAccessFuncIndex(Addr, DL);
   if (Idx < 0)
     return false;
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index f9a602bc268a..e346ebd6a000 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -189,11 +189,14 @@ const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
     assert(OriginalBB == OldLoop->getHeader() &&
            "Header should be first in RPO");
 
+    NewLoop = new Loop();
     Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
-    assert(NewLoopParent &&
-           "Expected parent loop before sub-loop in RPO");
-    NewLoop = new Loop;
-    NewLoopParent->addChildLoop(NewLoop);
+
+    if (NewLoopParent)
+      NewLoopParent->addChildLoop(NewLoop);
+    else
+      LI->addTopLevelLoop(NewLoop);
+
     NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
     return OldLoop;
   } else {
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 85da3ba899a5..d3ea1564115b 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -302,17 +302,22 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
   }
 
   NewLoopsMap NewLoops;
-  NewLoops[L] = NewLoop;
+  if (NewLoop)
+    NewLoops[L] = NewLoop;
+  else if (ParentLoop)
+    NewLoops[L] = ParentLoop;
+
   // For each block in the original loop, create a new copy,
   // and update the value map with the newly created values.
   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
     NewBlocks.push_back(NewBB);
-
-    if (NewLoop) {
+
+    // If we're unrolling the outermost loop, there's no remainder loop,
+    // and this block isn't in a nested loop, then the new block is not
+    // in any loop. Otherwise, add it to loopinfo.
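+    // (E.g. when CreateRemainderLoop is false and an outermost loop L has no
+    // ParentLoop, blocks belonging directly to L are deliberately left out of
+    // LoopInfo.)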
+ if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop) addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops); - } else if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; if (Header == *BB) { diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll index 81e4b19e6eea..b09fab8d8b46 100644 --- a/test/CodeGen/AArch64/ldst-opt.ll +++ b/test/CodeGen/AArch64/ldst-opt.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s ; This file contains tests for the AArch64 load/store optimizer. @@ -119,7 +120,7 @@ define void @load-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind { ; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #32]! entry: %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 - %add = load i64, i64* %a, align 4 + %add = load i64, i64* %a, align 8 br label %bar bar: %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 @@ -132,7 +133,7 @@ define void @store-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) no ; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #32]! entry: %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 - store i64 %val, i64* %a, align 4 + store i64 %val, i64* %a, align 8 br label %bar bar: %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 @@ -147,7 +148,7 @@ define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind { ; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 - %add = load fp128, fp128* %a, align 4 + %add = load fp128, fp128* %a, align 16 br label %bar bar: %c = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1 @@ -160,7 +161,7 @@ define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) noun ; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 - store fp128 %val, fp128* %a, align 4 + store fp128 %val, fp128* %a, align 16 br label %bar bar: %c = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1 @@ -203,7 +204,7 @@ define void @load-pre-indexed-double(%struct.double* %ptr) nounwind { ; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #32]! entry: %a = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1, i32 0 - %add = load double, double* %a, align 4 + %add = load double, double* %a, align 8 br label %bar bar: %c = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1 @@ -216,7 +217,7 @@ define void @store-pre-indexed-double(%struct.double* %ptr, double %val) nounwin ; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #32]! 
entry: %a = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1, i32 0 - store double %val, double* %a, align 4 + store double %val, double* %a, align 8 br label %bar bar: %c = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1 @@ -1340,7 +1341,8 @@ end: define void @merge_zr32(i32* %p) { ; CHECK-LABEL: merge_zr32: ; CHECK: // %entry -; CHECK-NEXT: str xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1349,11 +1351,13 @@ entry: ret void } -; Same sa merge_zr32 but the merged stores should also get paried. +; Same as merge_zr32 but the merged stores should also get paried. define void @merge_zr32_2(i32* %p) { ; CHECK-LABEL: merge_zr32_2: ; CHECK: // %entry -; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1370,7 +1374,11 @@ entry: define void @merge_zr32_2_offset(i32* %p) { ; CHECK-LABEL: merge_zr32_2_offset: ; CHECK: // %entry -; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504] +; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #504] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 126 @@ -1390,8 +1398,12 @@ entry: define void @no_merge_zr32_2_offset(i32* %p) { ; CHECK-LABEL: no_merge_zr32_2_offset: ; CHECK: // %entry -; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096] +; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; NOSTRICTALIGN-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4096] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 1024 @@ -1411,8 +1423,12 @@ entry: define void @merge_zr32_3(i32* %p) { ; CHECK-LABEL: merge_zr32_3: ; CHECK: // %entry -; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1437,7 +1453,8 @@ entry: define void @merge_zr32_2vec(<2 x i32>* %p) { ; CHECK-LABEL: merge_zr32_2vec: ; CHECK: // %entry -; CHECK-NEXT: str xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x i32> zeroinitializer, <2 x i32>* %p @@ -1448,8 +1465,10 @@ entry: define void @merge_zr32_3vec(<3 x i32>* %p) { ; CHECK-LABEL: merge_zr32_3vec: ; CHECK: // %entry -; CHECK-NEXT: str xzr, [x{{[0-9]+}}] -; CHECK-NEXT: str wzr, [x{{[0-9]+}}, #8] +; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] ; 
CHECK-NEXT: ret entry: store <3 x i32> zeroinitializer, <3 x i32>* %p @@ -1460,7 +1479,9 @@ entry: define void @merge_zr32_4vec(<4 x i32>* %p) { ; CHECK-LABEL: merge_zr32_4vec: ; CHECK: // %entry -; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] ; CHECK-NEXT: ret entry: store <4 x i32> zeroinitializer, <4 x i32>* %p @@ -1471,7 +1492,8 @@ entry: define void @merge_zr32_2vecf(<2 x float>* %p) { ; CHECK-LABEL: merge_zr32_2vecf: ; CHECK: // %entry -; CHECK-NEXT: str xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x float> zeroinitializer, <2 x float>* %p @@ -1482,7 +1504,9 @@ entry: define void @merge_zr32_4vecf(<4 x float>* %p) { ; CHECK-LABEL: merge_zr32_4vecf: ; CHECK: // %entry -; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] ; CHECK-NEXT: ret entry: store <4 x float> zeroinitializer, <4 x float>* %p @@ -1502,13 +1526,42 @@ entry: ret void } +; Similar to merge_zr32, but for 64-bit values and with unaligned stores. +define void @merge_zr64_unalign(<2 x i64>* %p) { +; CHECK-LABEL: merge_zr64_unalign: +; CHECK: // %entry +; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; STRICTALIGN: strb wzr, +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; STRICTALIGN: strb +; CHECK-NEXT: ret +entry: + store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1 + ret void +} + ; Similar to merge_zr32_3, replaceZeroVectorStore should not split the ; vector store since the zero constant vector has multiple uses. 
define void @merge_zr64_2(i64* %p) { ; CHECK-LABEL: merge_zr64_2: ; CHECK: // %entry -; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] +; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16] ; CHECK-NEXT: ret entry: store i64 0, i64* %p diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index b15eaa923f08..69bf3510cc5a 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -13,18 +13,18 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) { ; CHECK-APPLE: malloc ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 ; CHECK-APPLE: strb [[ID]], [x0, #8] -; CHECK-APPLE: mov x19, x0 -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE: mov x21, x0 +; CHECK-APPLE-NOT: x21 ; CHECK-O0-LABEL: foo: ; CHECK-O0: orr w{{.*}}, wzr, #0x10 ; CHECK-O0: malloc -; CHECK-O0: mov x19, x0 -; CHECK-O0-NOT: x19 +; CHECK-O0: mov x21, x0 +; CHECK-O0-NOT: x21 ; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1 -; CHECK-O0-NOT: x19 +; CHECK-O0-NOT: x21 ; CHECK-O0: strb [[ID]], [x0, #8] -; CHECK-O0-NOT: x19 +; CHECK-O0-NOT: x21 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -38,20 +38,20 @@ entry: define float @caller(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller: ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0 -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cbnz x19 +; CHECK-APPLE: cbnz x21 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8] +; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov x0, x19 +; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller: -; CHECK-O0: mov x19 +; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo -; CHECK-O0: mov [[ID:x[0-9]+]], x19 -; CHECK-O0: cbnz x19 +; CHECK-O0: mov [[ID:x[0-9]+]], x21 +; CHECK-O0: cbnz x21 entry: %error_ptr_ref = alloca swifterror %swift_error* store %swift_error* null, %swift_error** %error_ptr_ref @@ -75,22 +75,22 @@ define float @caller2(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller2: ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0 ; CHECK-APPLE: fmov [[CMP:s[0-9]+]], #1.0 -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cbnz x19 +; CHECK-APPLE: cbnz x21 ; CHECK-APPLE: fcmp s0, [[CMP]] ; CHECK-APPLE: b.le ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8] +; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov x0, x19 +; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller2: -; CHECK-O0: mov x19 +; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo -; CHECK-O0: mov [[ID:x[0-9]+]], x19 -; CHECK-O0: cbnz x19 +; CHECK-O0: mov [[ID:x[0-9]+]], x21 +; CHECK-O0: cbnz x21 entry: %error_ptr_ref = alloca swifterror %swift_error* br label %bb_loop @@ -123,24 +123,24 @@ define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) { ; CHECK-APPLE: malloc ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 ; CHECK-APPLE: strb [[ID]], [x0, #8] -; CHECK-APPLE: mov x19, x0 -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE: mov x21, x0 +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ret ; CHECK-O0-LABEL: foo_if: -; spill x19 
-; CHECK-O0: str x19, [sp, [[SLOT:#[0-9]+]]] +; spill x21 +; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]] ; CHECK-O0: cbz w0 ; CHECK-O0: orr w{{.*}}, wzr, #0x10 ; CHECK-O0: malloc ; CHECK-O0: mov [[ID:x[0-9]+]], x0 ; CHECK-O0: orr [[ID2:w[0-9]+]], wzr, #0x1 ; CHECK-O0: strb [[ID2]], [x0, #8] -; CHECK-O0: mov x19, [[ID]] +; CHECK-O0: mov x21, [[ID]] ; CHECK-O0: ret ; reload from stack ; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp, [[SLOT]]] -; CHECK-O0: mov x19, [[ID3]] +; CHECK-O0: mov x21, [[ID3]] ; CHECK-O0: ret entry: %cond = icmp ne i32 %cc, 0 @@ -162,19 +162,19 @@ normal: ; under a certain condition inside a loop. define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) { ; CHECK-APPLE-LABEL: foo_loop: -; CHECK-APPLE: mov x0, x19 +; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: cbz ; CHECK-APPLE: orr w0, wzr, #0x10 ; CHECK-APPLE: malloc ; CHECK-APPLE: strb w{{.*}}, [x0, #8] ; CHECK-APPLE: fcmp ; CHECK-APPLE: b.le -; CHECK-APPLE: mov x19, x0 +; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE: ret ; CHECK-O0-LABEL: foo_loop: -; spill x19 -; CHECK-O0: str x19, [sp, [[SLOT:#[0-9]+]]] +; spill x21 +; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]] ; CHECK-O0: b [[BB1:[A-Za-z0-9_]*]] ; CHECK-O0: [[BB1]]: ; CHECK-O0: ldr x0, [sp, [[SLOT]]] @@ -193,7 +193,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-O0: b.le [[BB1]] ; reload from stack ; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp] -; CHECK-O0: mov x19, [[ID3]] +; CHECK-O0: mov x21, [[ID3]] ; CHECK-O0: ret entry: br label %bb_loop @@ -229,23 +229,23 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 ; CHECK-APPLE: strb [[ID]], [x0, #8] ; CHECK-APPLE: str w{{.*}}, [{{.*}}[[SRET]], #4] -; CHECK-APPLE: mov x19, x0 -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE: mov x21, x0 +; CHECK-APPLE-NOT: x21 ; CHECK-O0-LABEL: foo_sret: ; CHECK-O0: orr w{{.*}}, wzr, #0x10 ; spill x8 ; CHECK-O0-DAG: str x8 -; spill x19 -; CHECK-O0-DAG: str x19 +; spill x21 +; CHECK-O0-DAG: str x21 ; CHECK-O0: malloc ; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1 ; CHECK-O0: strb [[ID]], [x0, #8] ; reload from stack ; CHECK-O0: ldr [[SRET:x[0-9]+]] ; CHECK-O0: str w{{.*}}, [{{.*}}[[SRET]], #4] -; CHECK-O0: mov x19 -; CHECK-O0-NOT: x19 +; CHECK-O0: mov x21 +; CHECK-O0-NOT: x21 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -261,22 +261,22 @@ entry: define float @caller3(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller3: ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0 -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo_sret -; CHECK-APPLE: cbnz x19 +; CHECK-APPLE: cbnz x21 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8] +; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov x0, x19 +; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller3: ; spill x0 ; CHECK-O0: str x0 -; CHECK-O0: mov x19 +; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo_sret -; CHECK-O0: mov [[ID2:x[0-9]+]], x19 -; CHECK-O0: cbnz [[ID2]] +; CHECK-O0: mov [[ID2:x[0-9]+]], x21 +; CHECK-O0: cbnz x21 ; Access part of the error object and save it to error_ref ; reload from stack ; CHECK-O0: ldrb [[CODE:w[0-9]+]] @@ -323,8 +323,8 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) 
{ ; Third vararg ; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] -; CHECK-APPLE: mov x19, x0 -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE: mov x21, x0 +; CHECK-APPLE-NOT: x21 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -356,13 +356,13 @@ define float @caller4(i8* %error_ref) { ; CHECK-APPLE: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] ; CHECK-APPLE: str {{x[0-9]+}}, [sp] -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo_vararg -; CHECK-APPLE: cbnz x19 +; CHECK-APPLE: cbnz x21 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8] +; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov x0, x19 +; CHECK-APPLE: mov x0, x21 ; CHECK-APPLE: bl {{.*}}free entry: %error_ptr_ref = alloca swifterror %swift_error* @@ -407,29 +407,29 @@ entry: } ; CHECK-APPLE-LABEL: swifterror_clobber -; CHECK-APPLE: mov [[REG:x[0-9]+]], x19 +; CHECK-APPLE: mov [[REG:x[0-9]+]], x21 ; CHECK-APPLE: nop -; CHECK-APPLE: mov x19, [[REG]] +; CHECK-APPLE: mov x21, [[REG]] define swiftcc void @swifterror_clobber(%swift_error** nocapture swifterror %err) { - call void asm sideeffect "nop", "~{x19}"() + call void asm sideeffect "nop", "~{x21}"() ret void } ; CHECK-APPLE-LABEL: swifterror_reg_clobber -; CHECK-APPLE: stp {{.*}}x19 +; CHECK-APPLE: stp {{.*}}x21 ; CHECK-APPLE: nop -; CHECK-APPLE: ldp {{.*}}x19 +; CHECK-APPLE: ldp {{.*}}x21 define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { - call void asm sideeffect "nop", "~{x19}"() + call void asm sideeffect "nop", "~{x21}"() ret void } ; CHECK-APPLE-LABEL: params_in_reg ; Save callee saved registers and swifterror since it will be clobbered by the first call to params_in_reg2. -; CHECK-APPLE: stp x19, x28, [sp +; CHECK-APPLE: stp x21, x28, [sp ; CHECK-APPLE: stp x27, x26, [sp ; CHECK-APPLE: stp x25, x24, [sp ; CHECK-APPLE: stp x23, x22, [sp -; CHECK-APPLE: stp x21, x20, [sp +; CHECK-APPLE: stp x20, x19, [sp ; CHECK-APPLE: stp x29, x30, [sp ; CHECK-APPLE: str x20, [sp ; Store argument registers. @@ -439,7 +439,7 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { ; CHECK-APPLE: mov x26, x4 ; CHECK-APPLE: mov x27, x3 ; CHECK-APPLE: mov x28, x2 -; CHECK-APPLE: mov x21, x1 +; CHECK-APPLE: mov x19, x1 ; CHECK-APPLE: mov x22, x0 ; Setup call. ; CHECK-APPLE: orr w0, wzr, #0x1 @@ -451,11 +451,11 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { ; CHECK-APPLE: orr w6, wzr, #0x7 ; CHECK-APPLE: orr w7, wzr, #0x8 ; CHECK-APPLE: mov x20, xzr -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl _params_in_reg2 ; Restore original arguments for next call. ; CHECK-APPLE: mov x0, x22 -; CHECK-APPLE: mov x1, x21 +; CHECK-APPLE: mov x1, x19 ; CHECK-APPLE: mov x2, x28 ; CHECK-APPLE: mov x3, x27 ; CHECK-APPLE: mov x4, x26 @@ -463,22 +463,22 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { ; CHECK-APPLE: mov x6, x24 ; CHECK-APPLE: mov x7, x23 ; Restore original swiftself argument and swifterror %err. -; CHECK-APPLE: ldp x20, x19, [sp +; CHECK-APPLE: ldp x20, x21, [sp ; CHECK-APPLE: bl _params_in_reg2 -; Restore calle save registers but don't clober swifterror x19. -; CHECK-APPLE-NOT: x19 +; Restore calle save registers but don't clober swifterror x21. 
+; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ldp x29, x30, [sp -; CHECK-APPLE-NOT: x19 -; CHECK-APPLE: ldp x21, x20, [sp -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE-NOT: x21 +; CHECK-APPLE: ldp x20, x19, [sp +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ldp x23, x22, [sp -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ldp x25, x24, [sp -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ldp x27, x26, [sp -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ldr x28, [sp -; CHECK-APPLE-NOT: x19 +; CHECK-APPLE-NOT: x21 ; CHECK-APPLE: ret define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err) { %error_ptr_ref = alloca swifterror %swift_error*, align 8 @@ -495,17 +495,17 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: stp x27, x26, [sp ; CHECK-APPLE: stp x25, x24, [sp ; CHECK-APPLE: stp x23, x22, [sp -; CHECK-APPLE: stp x21, x20, [sp +; CHECK-APPLE: stp x20, x19, [sp ; CHECK-APPLE: stp x29, x30, [sp ; Save original arguments. -; CHECK-APPLE: mov x23, x19 +; CHECK-APPLE: mov x23, x21 ; CHECK-APPLE: str x7, [sp, #16] ; CHECK-APPLE: mov x24, x6 ; CHECK-APPLE: mov x25, x5 ; CHECK-APPLE: mov x26, x4 ; CHECK-APPLE: mov x27, x3 ; CHECK-APPLE: mov x28, x2 -; CHECK-APPLE: mov x21, x1 +; CHECK-APPLE: mov x19, x1 ; CHECK-APPLE: mov x22, x0 ; Setup call arguments. ; CHECK-APPLE: orr w0, wzr, #0x1 @@ -517,23 +517,23 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: orr w6, wzr, #0x7 ; CHECK-APPLE: orr w7, wzr, #0x8 ; CHECK-APPLE: mov x20, xzr -; CHECK-APPLE: mov x19, xzr +; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl _params_in_reg2 ; Store swifterror %error_ptr_ref. -; CHECK-APPLE: str x19, [sp, #8] +; CHECK-APPLE: str x21, [sp, #8] ; Setup call arguments from original arguments. ; CHECK-APPLE: mov x0, x22 -; CHECK-APPLE: mov x1, x21 +; CHECK-APPLE: mov x1, x19 ; CHECK-APPLE: mov x2, x28 ; CHECK-APPLE: mov x3, x27 ; CHECK-APPLE: mov x4, x26 ; CHECK-APPLE: mov x5, x25 ; CHECK-APPLE: mov x6, x24 ; CHECK-APPLE: ldp x7, x20, [sp, #16] -; CHECK-APPLE: mov x19, x23 +; CHECK-APPLE: mov x21, x23 ; CHECK-APPLE: bl _params_and_return_in_reg2 ; Store return values. -; CHECK-APPLE: mov x21, x0 +; CHECK-APPLE: mov x19, x0 ; CHECK-APPLE: mov x22, x1 ; CHECK-APPLE: mov x24, x2 ; CHECK-APPLE: mov x25, x3 @@ -542,7 +542,7 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: mov x28, x6 ; CHECK-APPLE: mov x23, x7 ; Save swifterror %err. -; CHECK-APPLE: str x19, [sp, #24] +; CHECK-APPLE: str x21, [sp, #24] ; Setup call. ; CHECK-APPLE: orr w0, wzr, #0x1 ; CHECK-APPLE: orr w1, wzr, #0x2 @@ -554,10 +554,10 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: orr w7, wzr, #0x8 ; CHECK-APPLE: mov x20, xzr ; ... setup call with swiferror %error_ptr_ref. -; CHECK-APPLE: ldr x19, [sp, #8] +; CHECK-APPLE: ldr x21, [sp, #8] ; CHECK-APPLE: bl _params_in_reg2 ; Restore return values for return from this function. -; CHECK-APPLE: mov x0, x21 +; CHECK-APPLE: mov x0, x19 ; CHECK-APPLE: mov x1, x22 ; CHECK-APPLE: mov x2, x24 ; CHECK-APPLE: mov x3, x25 @@ -566,9 +566,9 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* ; CHECK-APPLE: mov x6, x28 ; CHECK-APPLE: mov x7, x23 ; Restore swifterror %err and callee save registers. 
-; CHECK-APPLE: ldp x19, x28, [sp, #24 +; CHECK-APPLE: ldp x21, x28, [sp, #24 ; CHECK-APPLE: ldp x29, x30, [sp -; CHECK-APPLE: ldp x21, x20, [sp +; CHECK-APPLE: ldp x20, x19, [sp ; CHECK-APPLE: ldp x23, x22, [sp ; CHECK-APPLE: ldp x25, x24, [sp ; CHECK-APPLE: ldp x27, x26, [sp @@ -583,3 +583,17 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_ } declare swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err) + +declare void @acallee(i8*) + +; Make sure we don't tail call if the caller returns a swifterror value. We +; would have to move into the swifterror register before the tail call. +; CHECK-APPLE: tailcall_from_swifterror: +; CHECK-APPLE-NOT: b _acallee +; CHECK-APPLE: bl _acallee + +define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) { +entry: + tail call void @acallee(i8* null) + ret void +} diff --git a/test/CodeGen/AArch64/swiftself.ll b/test/CodeGen/AArch64/swiftself.ll index a60aed6b0f2b..33a49198430e 100644 --- a/test/CodeGen/AArch64/swiftself.ll +++ b/test/CodeGen/AArch64/swiftself.ll @@ -65,3 +65,21 @@ define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind { %res = tail call i8* @swiftself_param(i8* swiftself %addr1) ret i8* %res } + +; We cannot pretend that 'x0' is alive across the thisreturn_attribute call as +; we normally would. We marked the first parameter with swiftself which means it +; will no longer be passed in x0. +declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) +; OPT-LABEL: swiftself_nothisreturn: +; OPT-DAG: ldr x20, [x20] +; OPT-DAG: mov [[CSREG:x[1-9].*]], x8 +; OPT: bl {{_?}}thisreturn_attribute +; OPT: str x0, {{\[}}[[CSREG]] +; OPT: ret +define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) { +entry: + %2 = load i8*, i8** %1, align 8 + %3 = tail call swiftcc i8* @thisreturn_attribute(i8* swiftself %2) + store i8* %3, i8** %0, align 8 + ret void +} diff --git a/test/CodeGen/ARM/swifterror.ll b/test/CodeGen/ARM/swifterror.ll index 7551291207ed..78764202f627 100644 --- a/test/CodeGen/ARM/swifterror.ll +++ b/test/CodeGen/ARM/swifterror.ll @@ -13,7 +13,7 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) { ; CHECK-APPLE: mov r0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], #1 -; CHECK-APPLE-DAG: mov r6, r{{.*}} +; CHECK-APPLE-DAG: mov r8, r{{.*}} ; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8] ; CHECK-O0-LABEL: foo: @@ -22,7 +22,7 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) { ; CHECK-O0: mov [[ID2:r[0-9]+]], r0 ; CHECK-O0: mov [[ID:r[0-9]+]], #1 ; CHECK-O0: strb [[ID]], [r0, #8] -; CHECK-O0: mov r6, [[ID2]] +; CHECK-O0: mov r8, [[ID2]] entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -36,21 +36,21 @@ entry: define float @caller(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller: ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0 -; CHECK-APPLE-DAG: mov r6, #0 +; CHECK-APPLE-DAG: mov r8, #0 ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cmp r6, #0 +; CHECK-APPLE: cmp r8, #0 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8] +; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8] ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov r0, r6 +; CHECK-APPLE: mov r0, r8 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller: ; spill r0 -; 
CHECK-O0-DAG: mov r6, #0 +; CHECK-O0-DAG: mov r8, #0 ; CHECK-O0-DAG: str r0, [sp, [[SLOT:#[0-9]+]] ; CHECK-O0: bl {{.*}}foo -; CHECK-O0: mov [[TMP:r[0-9]+]], r6 +; CHECK-O0: mov [[TMP:r[0-9]+]], r8 ; CHECK-O0: str [[TMP]], [sp] ; CHECK-O0: bne ; CHECK-O0: ldrb [[CODE:r[0-9]+]], [r0, #8] @@ -81,22 +81,22 @@ handler: define float @caller2(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller2: ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0 -; CHECK-APPLE-DAG: mov r6, #0 +; CHECK-APPLE-DAG: mov r8, #0 ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cmp r6, #0 +; CHECK-APPLE: cmp r8, #0 ; CHECK-APPLE: bne ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:r[0-9]+]], [r6, #8] +; CHECK-APPLE: ldrb [[CODE:r[0-9]+]], [r8, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov r0, r6 +; CHECK-APPLE: mov r0, r8 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller2: ; spill r0 ; CHECK-O0-DAG: str r0, -; CHECK-O0-DAG: mov r6, #0 +; CHECK-O0-DAG: mov r8, #0 ; CHECK-O0: bl {{.*}}foo -; CHECK-O0: mov r{{.*}}, r6 +; CHECK-O0: mov r{{.*}}, r8 ; CHECK-O0: str r0, [sp] ; CHECK-O0: bne ; CHECK-O0: ble @@ -138,22 +138,22 @@ define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) { ; CHECK-APPLE: mov r0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE: mov [[ID:r[0-9]+]], #1 -; CHECK-APPLE-DAG: mov r6, r{{.*}} +; CHECK-APPLE-DAG: mov r8, r{{.*}} ; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8] ; CHECK-O0-LABEL: foo_if: ; CHECK-O0: cmp r0, #0 ; spill to stack -; CHECK-O0: str r6 +; CHECK-O0: str r8 ; CHECK-O0: beq ; CHECK-O0: mov r0, #16 ; CHECK-O0: malloc ; CHECK-O0: mov [[ID:r[0-9]+]], r0 ; CHECK-O0: mov [[ID2:[a-z0-9]+]], #1 ; CHECK-O0: strb [[ID2]], [r0, #8] -; CHECK-O0: mov r6, [[ID]] +; CHECK-O0: mov r8, [[ID]] ; reload from stack -; CHECK-O0: ldr r6 +; CHECK-O0: ldr r8 entry: %cond = icmp ne i32 %cc, 0 br i1 %cond, label %gen_error, label %normal @@ -176,17 +176,17 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-APPLE-LABEL: foo_loop: ; CHECK-APPLE: mov [[CODE:r[0-9]+]], r0 ; swifterror is kept in a register -; CHECK-APPLE: mov [[ID:r[0-9]+]], r6 +; CHECK-APPLE: mov [[ID:r[0-9]+]], r8 ; CHECK-APPLE: cmp [[CODE]], #0 ; CHECK-APPLE: beq ; CHECK-APPLE: mov r0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE: strb r{{.*}}, [{{.*}}[[ID]], #8] ; CHECK-APPLE: ble -; CHECK-APPLE: mov r6, [[ID]] +; CHECK-APPLE: mov r8, [[ID]] ; CHECK-O0-LABEL: foo_loop: -; CHECK-O0: mov r{{.*}}, r6 +; CHECK-O0: mov r{{.*}}, r8 ; CHECK-O0: cmp r{{.*}}, #0 ; CHECK-O0: beq ; CHECK-O0-DAG: movw r{{.*}}, #1 @@ -200,7 +200,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float ; CHECK-O0: vcmpe ; CHECK-O0: ble ; reload from stack -; CHECK-O0: ldr r6 +; CHECK-O0: ldr r8 entry: br label %bb_loop @@ -231,7 +231,7 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi ; CHECK-APPLE: mov r0, #16 ; CHECK-APPLE: malloc ; CHECK-APPLE: mov [[REG:r[0-9]+]], #1 -; CHECK-APPLE-DAG: mov r6, r0 +; CHECK-APPLE-DAG: mov r8, r0 ; CHECK-APPLE-DAG: strb [[REG]], [r0, #8] ; CHECK-APPLE-DAG: str r{{.*}}, [{{.*}}[[SRET]], #4] @@ -247,7 +247,7 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi ; CHECK-O0: ldr ; CHECK-O0: ldr ; CHECK-O0: str r{{.*}}, [{{.*}}, #4] -; CHECK-O0: mov r6 +; CHECK-O0: mov r8 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -263,22 +263,22 @@ entry: define float @caller3(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller3: ; CHECK-APPLE: 
mov [[ID:r[0-9]+]], r0 -; CHECK-APPLE: mov r6, #0 +; CHECK-APPLE: mov r8, #0 ; CHECK-APPLE: bl {{.*}}foo_sret -; CHECK-APPLE: cmp r6, #0 +; CHECK-APPLE: cmp r8, #0 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8] +; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8] ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov r0, r6 +; CHECK-APPLE: mov r0, r8 ; CHECK-APPLE: bl {{.*}}free ; CHECK-O0-LABEL: caller3: -; CHECK-O0-DAG: mov r6, #0 +; CHECK-O0-DAG: mov r8, #0 ; CHECK-O0-DAG: mov r0 ; CHECK-O0-DAG: mov r1 ; CHECK-O0: bl {{.*}}foo_sret -; CHECK-O0: mov [[ID2:r[0-9]+]], r6 -; CHECK-O0: cmp r6 +; CHECK-O0: mov [[ID2:r[0-9]+]], r8 +; CHECK-O0: cmp r8 ; CHECK-O0: str [[ID2]], [sp[[SLOT:.*]]] ; CHECK-O0: bne ; Access part of the error object and save it to error_ref @@ -316,7 +316,7 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) { ; CHECK-APPLE: mov [[REG:r[0-9]+]], r0 ; CHECK-APPLE: mov [[ID:r[0-9]+]], #1 ; CHECK-APPLE-DAG: strb [[ID]], [{{.*}}[[REG]], #8] -; CHECK-APPLE-DAG: mov r6, [[REG]] +; CHECK-APPLE-DAG: mov r8, [[REG]] entry: %call = call i8* @malloc(i64 16) @@ -345,13 +345,13 @@ entry: define float @caller4(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller4: ; CHECK-APPLE: mov [[ID:r[0-9]+]], r0 -; CHECK-APPLE: mov r6, #0 +; CHECK-APPLE: mov r8, #0 ; CHECK-APPLE: bl {{.*}}foo_vararg -; CHECK-APPLE: cmp r6, #0 +; CHECK-APPLE: cmp r8, #0 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8] +; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8] ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]] -; CHECK-APPLE: mov r0, r6 +; CHECK-APPLE: mov r0, r8 ; CHECK-APPLE: bl {{.*}}free entry: %error_ptr_ref = alloca swifterror %swift_error* @@ -396,51 +396,51 @@ entry: } ; CHECK-APPLE-LABEL: swifterror_clobber -; CHECK-APPLE: mov [[REG:r[0-9]+]], r6 +; CHECK-APPLE: mov [[REG:r[0-9]+]], r8 ; CHECK-APPLE: nop -; CHECK-APPLE: mov r6, [[REG]] +; CHECK-APPLE: mov r8, [[REG]] define swiftcc void @swifterror_clobber(%swift_error** nocapture swifterror %err) { - call void asm sideeffect "nop", "~{r6}"() + call void asm sideeffect "nop", "~{r8}"() ret void } ; CHECK-APPLE-LABEL: swifterror_reg_clobber -; CHECK-APPLE: push {{.*}}r6 +; CHECK-APPLE: push {{.*}}r8 ; CHECK-APPLE: nop -; CHECK-APPLE: pop {{.*}}r6 +; CHECK-APPLE: pop {{.*}}r8 define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) { - call void asm sideeffect "nop", "~{r6}"() + call void asm sideeffect "nop", "~{r8}"() ret void } ; CHECK-ARMV7-LABEL: _params_in_reg ; Store callee saved registers excluding swifterror. -; CHECK-ARMV7: push {r4, r5, r7, r8, r10, r11, lr} -; Store swiftself (r10) and swifterror (r6). -; CHECK-ARMV7-DAG: str r6, [s[[STK1:.*]]] +; CHECK-ARMV7: push {r4, r5, r6, r7, r10, r11, lr} +; Store swiftself (r10) and swifterror (r8). +; CHECK-ARMV7-DAG: str r8, [s[[STK1:.*]]] ; CHECK-ARMV7-DAG: str r10, [s[[STK2:.*]]] ; Store arguments. -; CHECK-ARMV7: mov r4, r3 -; CHECK-ARMV7: mov r5, r2 -; CHECK-ARMV7: mov r8, r1 -; CHECK-ARMV7: mov r11, r0 +; CHECK-ARMV7: mov r6, r3 +; CHECK-ARMV7: mov r4, r2 +; CHECK-ARMV7: mov r11, r1 +; CHECK-ARMV7: mov r5, r0 ; Setup call. ; CHECK-ARMV7: mov r0, #1 ; CHECK-ARMV7: mov r1, #2 ; CHECK-ARMV7: mov r2, #3 ; CHECK-ARMV7: mov r3, #4 ; CHECK-ARMV7: mov r10, #0 -; CHECK-ARMV7: mov r6, #0 +; CHECK-ARMV7: mov r8, #0 ; CHECK-ARMV7: bl _params_in_reg2 ; Restore original arguments. 
; CHECK-ARMV7-DAG: ldr r10, [s[[STK2]]] -; CHECK-ARMV7-DAG: ldr r6, [s[[STK1]]] -; CHECK-ARMV7: mov r0, r11 -; CHECK-ARMV7: mov r1, r8 -; CHECK-ARMV7: mov r2, r5 -; CHECK-ARMV7: mov r3, r4 +; CHECK-ARMV7-DAG: ldr r8, [s[[STK1]]] +; CHECK-ARMV7: mov r0, r5 +; CHECK-ARMV7: mov r1, r11 +; CHECK-ARMV7: mov r2, r4 +; CHECK-ARMV7: mov r3, r6 ; CHECK-ARMV7: bl _params_in_reg2 -; CHECK-ARMV7: pop {r4, r5, r7, r8, r10, r11, pc} +; CHECK-ARMV7: pop {r4, r5, r6, r7, r10, r11, pc} define swiftcc void @params_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) { %error_ptr_ref = alloca swifterror %swift_error*, align 8 store %swift_error* null, %swift_error** %error_ptr_ref @@ -451,42 +451,42 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_err declare swiftcc void @params_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) ; CHECK-ARMV7-LABEL: params_and_return_in_reg -; CHECK-ARMV7: push {r4, r5, r7, r8, r10, r11, lr} +; CHECK-ARMV7: push {r4, r5, r6, r7, r10, r11, lr} ; Store swifterror and swiftself -; CHECK-ARMV7: mov r4, r6 +; CHECK-ARMV7: mov r6, r8 ; CHECK-ARMV7: str r10, [s[[STK1:.*]]] ; Store arguments. ; CHECK-ARMV7: str r3, [s[[STK2:.*]]] -; CHECK-ARMV7: mov r5, r2 -; CHECK-ARMV7: mov r8, r1 -; CHECK-ARMV7: mov r11, r0 +; CHECK-ARMV7: mov r4, r2 +; CHECK-ARMV7: mov r11, r1 +; CHECK-ARMV7: mov r5, r0 ; Setup call. ; CHECK-ARMV7: mov r0, #1 ; CHECK-ARMV7: mov r1, #2 ; CHECK-ARMV7: mov r2, #3 ; CHECK-ARMV7: mov r3, #4 ; CHECK-ARMV7: mov r10, #0 -; CHECK-ARMV7: mov r6, #0 +; CHECK-ARMV7: mov r8, #0 ; CHECK-ARMV7: bl _params_in_reg2 ; Restore original arguments. ; CHECK-ARMV7: ldr r3, [s[[STK2]]] ; CHECK-ARMV7: ldr r10, [s[[STK1]]] ; Store %error_ptr_ref; -; CHECK-ARMV7: str r6, [s[[STK3:.*]]] +; CHECK-ARMV7: str r8, [s[[STK3:.*]]] ; Restore original arguments. -; CHECK-ARMV7: mov r0, r11 -; CHECK-ARMV7: mov r1, r8 -; CHECK-ARMV7: mov r2, r5 -; CHECK-ARMV7: mov r6, r4 +; CHECK-ARMV7: mov r0, r5 +; CHECK-ARMV7: mov r1, r11 +; CHECK-ARMV7: mov r2, r4 +; CHECK-ARMV7: mov r8, r6 ; CHECK-ARMV7: bl _params_and_return_in_reg2 ; Store swifterror return %err; -; CHECK-ARMV7: str r6, [s[[STK1]]] +; CHECK-ARMV7: str r8, [s[[STK1]]] ; Load swifterror value %error_ptr_ref. -; CHECK-ARMV7: ldr r6, [s[[STK3]]] +; CHECK-ARMV7: ldr r8, [s[[STK3]]] ; Save return values. -; CHECK-ARMV7: mov r5, r0 -; CHECK-ARMV7: mov r4, r1 -; CHECK-ARMV7: mov r8, r2 +; CHECK-ARMV7: mov r4, r0 +; CHECK-ARMV7: mov r5, r1 +; CHECK-ARMV7: mov r6, r2 ; CHECK-ARMV7: mov r11, r3 ; Setup call. ; CHECK-ARMV7: mov r0, #1 @@ -496,13 +496,13 @@ declare swiftcc void @params_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_e ; CHECK-ARMV7: mov r10, #0 ; CHECK-ARMV7: bl _params_in_reg2 ; Load swifterror %err; -; CHECK-ARMV7: ldr r6, [s[[STK1]]] +; CHECK-ARMV7: ldr r8, [s[[STK1]]] ; Restore return values for returning. 
-; CHECK-ARMV7: mov r0, r5 -; CHECK-ARMV7: mov r1, r4 -; CHECK-ARMV7: mov r2, r8 +; CHECK-ARMV7: mov r0, r4 +; CHECK-ARMV7: mov r1, r5 +; CHECK-ARMV7: mov r2, r6 ; CHECK-ARMV7: mov r3, r11 -; CHECK-ARMV7: pop {r4, r5, r7, r8, r10, r11, pc} +; CHECK-ARMV7: pop {r4, r5, r6, r7, r10, r11, pc} define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) { %error_ptr_ref = alloca swifterror %swift_error*, align 8 store %swift_error* null, %swift_error** %error_ptr_ref @@ -513,3 +513,18 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 } declare swiftcc { i32, i32, i32, i32 } @params_and_return_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) + + +declare void @acallee(i8*) + +; Make sure we don't tail call if the caller returns a swifterror value. We +; would have to move into the swifterror register before the tail call. +; CHECK-APPLE: tailcall_from_swifterror: +; CHECK-APPLE-NOT: b _acallee +; CHECK-APPLE: bl _acallee + +define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) { +entry: + tail call void @acallee(i8* null) + ret void +} diff --git a/test/CodeGen/ARM/swiftself.ll b/test/CodeGen/ARM/swiftself.ll index b7a04ca4060e..1e06b34c7052 100644 --- a/test/CodeGen/ARM/swiftself.ll +++ b/test/CodeGen/ARM/swiftself.ll @@ -63,3 +63,20 @@ define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind "no-fram %res = tail call i8* @swiftself_param(i8* swiftself %addr1) ret i8* %res } + +; We cannot pretend that 'r0' is alive across the thisreturn_attribute call as +; we normally would. We marked the first parameter with swiftself which means it +; will no longer be passed in r0. +declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) +; OPT-LABEL: swiftself_nothisreturn: +; OPT-DAG: mov [[CSREG:r[1-9].*]], r0 +; OPT-DAG: ldr r10, [r10] +; OPT: bl {{_?}}thisreturn_attribute +; OPT: str r0, {{\[}}[[CSREG]] +define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) { +entry: + %2 = load i8*, i8** %1, align 8 + %3 = tail call swiftcc i8* @thisreturn_attribute(i8* swiftself %2) + store i8* %3, i8** %0, align 8 + ret void +} diff --git a/test/CodeGen/X86/dag-update-nodetomatch.ll b/test/CodeGen/X86/dag-update-nodetomatch.ll new file mode 100644 index 000000000000..45b6d020ce45 --- /dev/null +++ b/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -0,0 +1,241 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +%struct.i = type { i32, i24 } +%struct.m = type { %struct.i } + +@a = local_unnamed_addr global i32 0, align 4 +@b = local_unnamed_addr global i16 0, align 2 +@c = local_unnamed_addr global i16 0, align 2 +@e = local_unnamed_addr global i16 0, align 2 +@l = local_unnamed_addr global %struct.i zeroinitializer, align 4 +@k = local_unnamed_addr global %struct.m zeroinitializer, align 4 + +@x0 = local_unnamed_addr global double 0.000000e+00, align 8 +@x1 = local_unnamed_addr global i32 0, align 4 +@x2 = local_unnamed_addr global i32 0, align 4 +@x3 = local_unnamed_addr global i32 0, align 4 +@x4 = local_unnamed_addr global i32 0, align 4 +@x5 = local_unnamed_addr global double* null, align 8 + +; Check that compiler does not crash. 
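+; (Both reproducers below are checked only by their CHECK-LABEL lines;
+; reaching the label means llc completed codegen without crashing.)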
+; Test for PR30775 +define void @_Z1nv() local_unnamed_addr { +; CHECK-LABEL: _Z1nv: +entry: + %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.m, %struct.m* @k, i64 0, i32 0, i32 1) to i32*), align 4 + %0 = load i16, i16* @c, align 2 + %conv = sext i16 %0 to i32 + %1 = load i16, i16* @b, align 2 + %conv1 = sext i16 %1 to i32 + %2 = load i32, i32* @a, align 4 + %tobool = icmp ne i32 %2, 0 + %bf.load3 = load i32, i32* getelementptr inbounds (%struct.i, %struct.i* @l, i64 0, i32 0), align 4 + %bf.shl = shl i32 %bf.load3, 7 + %bf.ashr = ashr exact i32 %bf.shl, 7 + %bf.clear = shl i32 %bf.load, 1 + %factor = and i32 %bf.clear, 131070 + %add13 = add nsw i32 %factor, %conv + %add15 = add nsw i32 %add13, %conv1 + %bf.ashr.op = sub nsw i32 0, %bf.ashr + %add28 = select i1 %tobool, i32 %bf.ashr.op, i32 0 + %tobool29 = icmp eq i32 %add15, %add28 + %phitmp = icmp eq i32 %bf.ashr, 0 + %.phitmp = or i1 %phitmp, %tobool29 + %conv37 = zext i1 %.phitmp to i16 + store i16 %conv37, i16* @e, align 2 + %bf.clear39 = and i32 %bf.load, 65535 + %factor53 = shl nuw nsw i32 %bf.clear39, 1 + %add46 = add nsw i32 %factor53, %conv + %add48 = add nsw i32 %add46, %conv1 + %add48.lobit = lshr i32 %add48, 31 + %add48.lobit.not = xor i32 %add48.lobit, 1 + %add51 = add nuw nsw i32 %add48.lobit.not, %bf.clear39 + %shr = ashr i32 %2, %add51 + %conv52 = trunc i32 %shr to i16 + store i16 %conv52, i16* @b, align 2 + ret void +} + +; Test for PR31536 +define void @_Z2x6v() local_unnamed_addr { +; CHECK-LABEL: _Z2x6v: +entry: + %0 = load i32, i32* @x1, align 4 + %and = and i32 %0, 511 + %add = add nuw nsw i32 %and, 1 + store i32 %add, i32* @x4, align 4 + %.pr = load i32, i32* @x3, align 4 + %tobool8 = icmp eq i32 %.pr, 0 + br i1 %tobool8, label %for.end5, label %for.cond1thread-pre-split.lr.ph + +for.cond1thread-pre-split.lr.ph: ; preds = %entry + %idx.ext13 = zext i32 %add to i64 + %x5.promoted = load double*, double** @x5, align 8 + %x5.promoted9 = bitcast double* %x5.promoted to i8* + %1 = xor i32 %.pr, -1 + %2 = zext i32 %1 to i64 + %3 = shl nuw nsw i64 %2, 3 + %4 = add nuw nsw i64 %3, 8 + %5 = mul nuw nsw i64 %4, %idx.ext13 + %uglygep = getelementptr i8, i8* %x5.promoted9, i64 %5 + %.pr6.pre = load i32, i32* @x2, align 4 + %6 = shl nuw nsw i32 %and, 3 + %addconv = add nuw nsw i32 %6, 8 + %7 = zext i32 %addconv to i64 + %scevgep15 = getelementptr double, double* %x5.promoted, i64 1 + %scevgep1516 = bitcast double* %scevgep15 to i8* + br label %for.cond1thread-pre-split + +for.cond1thread-pre-split: ; preds = %for.cond1thread-pre-split.lr.ph, %for.inc3 + %indvar = phi i64 [ 0, %for.cond1thread-pre-split.lr.ph ], [ %indvar.next, %for.inc3 ] + %.pr6 = phi i32 [ %.pr6.pre, %for.cond1thread-pre-split.lr.ph ], [ %.pr611, %for.inc3 ] + %8 = phi double* [ %x5.promoted, %for.cond1thread-pre-split.lr.ph ], [ %add.ptr, %for.inc3 ] + %9 = phi i32 [ %.pr, %for.cond1thread-pre-split.lr.ph ], [ %inc4, %for.inc3 ] + %10 = mul i64 %7, %indvar + %uglygep14 = getelementptr i8, i8* %x5.promoted9, i64 %10 + %uglygep17 = getelementptr i8, i8* %scevgep1516, i64 %10 + %cmp7 = icmp slt i32 %.pr6, 0 + br i1 %cmp7, label %for.body2.preheader, label %for.inc3 + +for.body2.preheader: ; preds = %for.cond1thread-pre-split + %11 = sext i32 %.pr6 to i64 + %12 = sext i32 %.pr6 to i64 + %13 = icmp sgt i64 %12, -1 + %smax = select i1 %13, i64 %12, i64 -1 + %14 = add nsw i64 %smax, 1 + %15 = sub nsw i64 %14, %12 + %min.iters.check = icmp ult i64 %15, 4 + br i1 %min.iters.check, label %for.body2.preheader21, label %min.iters.checked + 
+min.iters.checked: ; preds = %for.body2.preheader + %n.vec = and i64 %15, -4 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.body2.preheader21, label %vector.memcheck + +vector.memcheck: ; preds = %min.iters.checked + %16 = shl nsw i64 %11, 3 + %scevgep = getelementptr i8, i8* %uglygep14, i64 %16 + %17 = icmp sgt i64 %11, -1 + %smax18 = select i1 %17, i64 %11, i64 -1 + %18 = shl nsw i64 %smax18, 3 + %scevgep19 = getelementptr i8, i8* %uglygep17, i64 %18 + %bound0 = icmp ult i8* %scevgep, bitcast (double* @x0 to i8*) + %bound1 = icmp ugt i8* %scevgep19, bitcast (double* @x0 to i8*) + %memcheck.conflict = and i1 %bound0, %bound1 + %ind.end = add nsw i64 %11, %n.vec + br i1 %memcheck.conflict, label %for.body2.preheader21, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + %19 = add nsw i64 %n.vec, -4 + %20 = lshr exact i64 %19, 2 + %21 = and i64 %20, 1 + %lcmp.mod = icmp eq i64 %21, 0 + br i1 %lcmp.mod, label %vector.body.prol.preheader, label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %22 = load i64, i64* bitcast (double* @x0 to i64*), align 8 + %23 = insertelement <2 x i64> undef, i64 %22, i32 0 + %24 = shufflevector <2 x i64> %23, <2 x i64> undef, <2 x i32> zeroinitializer + %25 = insertelement <2 x i64> undef, i64 %22, i32 0 + %26 = shufflevector <2 x i64> %25, <2 x i64> undef, <2 x i32> zeroinitializer + %27 = getelementptr inbounds double, double* %8, i64 %11 + %28 = bitcast double* %27 to <2 x i64>* + store <2 x i64> %24, <2 x i64>* %28, align 8 + %29 = getelementptr double, double* %27, i64 2 + %30 = bitcast double* %29 to <2 x i64>* + store <2 x i64> %26, <2 x i64>* %30, align 8 + br label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.preheader, %vector.body.prol + %index.unr.ph = phi i64 [ 4, %vector.body.prol ], [ 0, %vector.body.preheader ] + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol.loopexit.unr-lcssa + %31 = icmp eq i64 %20, 0 + br i1 %31, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + %32 = load i64, i64* bitcast (double* @x0 to i64*), align 8 + %33 = insertelement <2 x i64> undef, i64 %32, i32 0 + %34 = shufflevector <2 x i64> %33, <2 x i64> undef, <2 x i32> zeroinitializer + %35 = insertelement <2 x i64> undef, i64 %32, i32 0 + %36 = shufflevector <2 x i64> %35, <2 x i64> undef, <2 x i32> zeroinitializer + %37 = load i64, i64* bitcast (double* @x0 to i64*), align 8 + %38 = insertelement <2 x i64> undef, i64 %37, i32 0 + %39 = shufflevector <2 x i64> %38, <2 x i64> undef, <2 x i32> zeroinitializer + %40 = insertelement <2 x i64> undef, i64 %37, i32 0 + %41 = shufflevector <2 x i64> %40, <2 x i64> undef, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr.ph, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %42 = add i64 %11, %index + %43 = getelementptr inbounds double, double* %8, i64 %42 + %44 = bitcast double* %43 to <2 x i64>* + store <2 x i64> %34, <2 x i64>* %44, align 8 + %45 = getelementptr double, double* %43, i64 2 + %46 = bitcast double* %45 to <2 x i64>* + store <2 x i64> %36, <2 x i64>* %46, align 8 + %index.next = add i64 %index, 4 + %47 = add i64 %11, %index.next + %48 = 
getelementptr inbounds double, double* %8, i64 %47 + %49 = bitcast double* %48 to <2 x i64>* + store <2 x i64> %39, <2 x i64>* %49, align 8 + %50 = getelementptr double, double* %48, i64 2 + %51 = bitcast double* %50 to <2 x i64>* + store <2 x i64> %41, <2 x i64>* %51, align 8 + %index.next.1 = add i64 %index, 8 + %52 = icmp eq i64 %index.next.1, %n.vec + br i1 %52, label %middle.block.unr-lcssa, label %vector.body + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa + %cmp.n = icmp eq i64 %15, %n.vec + br i1 %cmp.n, label %for.cond1.for.inc3_crit_edge, label %for.body2.preheader21 + +for.body2.preheader21: ; preds = %middle.block, %vector.memcheck, %min.iters.checked, %for.body2.preheader + %indvars.iv.ph = phi i64 [ %11, %vector.memcheck ], [ %11, %min.iters.checked ], [ %11, %for.body2.preheader ], [ %ind.end, %middle.block ] + br label %for.body2 + +for.body2: ; preds = %for.body2.preheader21, %for.body2 + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body2 ], [ %indvars.iv.ph, %for.body2.preheader21 ] + %53 = load i64, i64* bitcast (double* @x0 to i64*), align 8 + %arrayidx = getelementptr inbounds double, double* %8, i64 %indvars.iv + %54 = bitcast double* %arrayidx to i64* + store i64 %53, i64* %54, align 8 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp = icmp slt i64 %indvars.iv, -1 + br i1 %cmp, label %for.body2, label %for.cond1.for.inc3_crit_edge.loopexit + +for.cond1.for.inc3_crit_edge.loopexit: ; preds = %for.body2 + br label %for.cond1.for.inc3_crit_edge + +for.cond1.for.inc3_crit_edge: ; preds = %for.cond1.for.inc3_crit_edge.loopexit, %middle.block + %indvars.iv.next.lcssa = phi i64 [ %ind.end, %middle.block ], [ %indvars.iv.next, %for.cond1.for.inc3_crit_edge.loopexit ] + %55 = trunc i64 %indvars.iv.next.lcssa to i32 + store i32 %55, i32* @x2, align 4 + br label %for.inc3 + +for.inc3: ; preds = %for.cond1.for.inc3_crit_edge, %for.cond1thread-pre-split + %.pr611 = phi i32 [ %55, %for.cond1.for.inc3_crit_edge ], [ %.pr6, %for.cond1thread-pre-split ] + %inc4 = add nsw i32 %9, 1 + %add.ptr = getelementptr inbounds double, double* %8, i64 %idx.ext13 + %tobool = icmp eq i32 %inc4, 0 + %indvar.next = add i64 %indvar, 1 + br i1 %tobool, label %for.cond.for.end5_crit_edge, label %for.cond1thread-pre-split + +for.cond.for.end5_crit_edge: ; preds = %for.inc3 + store i8* %uglygep, i8** bitcast (double** @x5 to i8**), align 8 + store i32 0, i32* @x3, align 4 + br label %for.end5 + +for.end5: ; preds = %for.cond.for.end5_crit_edge, %entry + ret void +} + diff --git a/test/CodeGen/X86/pr31956.ll b/test/CodeGen/X86/pr31956.ll new file mode 100644 index 000000000000..e9293048f4e5 --- /dev/null +++ b/test/CodeGen/X86/pr31956.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mattr=+avx < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-scei-ps4" + +@G1 = common global <2 x float> zeroinitializer, align 8 +@G2 = common global <8 x float> zeroinitializer, align 32 + +define <4 x float> @foo() { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3] +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,0] +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %V = load <2 
x float>, <2 x float>* @G1, align 8 + %shuffle = shufflevector <2 x float> %V, <2 x float> undef, <8 x i32> + %L = load <8 x float>, <8 x float>* @G2, align 32 + %shuffle1 = shufflevector <8 x float> %shuffle, <8 x float> %L, <4 x i32> + ret <4 x float> %shuffle1 +} diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll index cd4150597225..86e0221c2015 100644 --- a/test/CodeGen/X86/swifterror.ll +++ b/test/CodeGen/X86/swifterror.ll @@ -670,3 +670,18 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6 } declare swiftcc { i64, i64, i64, i64 } @params_and_return_in_reg2(i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err) + + +declare void @acallee(i8*) + +; Make sure we don't tail call if the caller returns a swifterror value. We +; would have to move into the swifterror register before the tail call. +; CHECK-APPLE: tailcall_from_swifterror: +; CHECK-APPLE-NOT: jmp _acallee +; CHECK-APPLE: callq _acallee + +define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) { +entry: + tail call void @acallee(i8* null) + ret void +} diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 9c4d416a1eff..9827e7a6792b 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -170,6 +170,32 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { ; CHECK: __asan_memcpy ; CHECK: ret void +; CHECK-LABEL: @test_swifterror +; CHECK-NOT: __asan_report_load +; CHECK: ret void +define void @test_swifterror(i8** swifterror) sanitize_address { + %swifterror_ptr_value = load i8*, i8** %0 + ret void +} + +; CHECK-LABEL: @test_swifterror_2 +; CHECK-NOT: __asan_report_store +; CHECK: ret void +define void @test_swifterror_2(i8** swifterror) sanitize_address { + store i8* null, i8** %0 + ret void +} + +; CHECK-LABEL: @test_swifterror_3 +; CHECK-NOT: __asan_report_store +; CHECK: ret void +define void @test_swifterror_3() sanitize_address { + %swifterror_addr = alloca swifterror i8* + store i8* null, i8** %swifterror_addr + call void @test_swifterror_2(i8** swifterror %swifterror_addr) + ret void +} + ; CHECK: define internal void @asan.module_ctor() ; CHECK: call void @__asan_init() diff --git a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll index 7e049c548f22..61ab98dc9997 100644 --- a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll +++ b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll @@ -54,5 +54,29 @@ entry: ; CHECK: ret void } +; CHECK-LABEL: @SwiftError +; CHECK-NOT: __tsan_read +; CHECK-NOT: __tsan_write +; CHECK: ret +define void @SwiftError(i8** swifterror) sanitize_thread { + %swifterror_ptr_value = load i8*, i8** %0 + store i8* null, i8** %0 + %swifterror_addr = alloca swifterror i8* + %swifterror_ptr_value_2 = load i8*, i8** %swifterror_addr + store i8* null, i8** %swifterror_addr + ret void +} + +; CHECK-LABEL: @SwiftErrorCall +; CHECK-NOT: __tsan_read +; CHECK-NOT: __tsan_write +; CHECK: ret +define void @SwiftErrorCall(i8** swifterror) sanitize_thread { + %swifterror_addr = alloca swifterror i8* + store i8* null, i8** %0 + call void @SwiftError(i8** %0) + ret void +} + ; CHECK: define internal void @tsan.module_ctor() ; CHECK: call void @__tsan_init() diff --git a/test/Transforms/LoopUnroll/runtime-li.ll b/test/Transforms/LoopUnroll/runtime-li.ll new file mode 100644 index 
000000000000..5494c8e9da7d --- /dev/null +++ b/test/Transforms/LoopUnroll/runtime-li.ll @@ -0,0 +1,36 @@ +; RUN: opt -S -loop-unroll -unroll-runtime -unroll-count=2 -verify-loop-info -pass-remarks=loop-unroll < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Verify that runtime-unrolling a top-level loop that has nested loops does not +; make the unroller produce invalid loop-info. +; CHECK: remark: {{.*}}: unrolled loop by a factor of 2 with run-time trip count +; CHECK: @widget +; CHECK: ret void +define void @widget(double* %arg, double* %arg1, double* %p, i64* %q1, i64* %q2) local_unnamed_addr { +entry: + br label %header.outer + +header.outer: ; preds = %latch.outer, %entry + %tmp = phi double* [ %tmp8, %latch.outer ], [ %arg, %entry ] + br label %header.inner + +header.inner: ; preds = %latch.inner, %header.outer + br i1 undef, label %latch.inner, label %latch.outer + +latch.inner: ; preds = %header.inner + %tmp5 = load i64, i64* %q1, align 8 + store i64 %tmp5, i64* %q2, align 8 + %tmp6 = icmp eq double* %p, %arg + br label %header.inner + +latch.outer: ; preds = %header.inner + store double 0.0, double* %p, align 8 + %tmp8 = getelementptr inbounds double, double* %tmp, i64 1 + %tmp9 = icmp eq double* %tmp8, %arg1 + br i1 %tmp9, label %exit, label %header.outer + +exit: ; preds = %latch.outer + ret void +} From eb2854521a26d3f186018f1b119761ca7bb90dc2 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 17 Feb 2017 19:36:19 +0000 Subject: [PATCH 2/4] Vendor import of clang release_40 branch r295380: https://llvm.org/svn/llvm-project/cfe/branches/release_40@295380 --- docs/AutomaticReferenceCounting.rst | 7 +- docs/ReleaseNotes.rst | 73 ++++++++++++++++++- docs/UsersManual.rst | 39 +++++++++- lib/AST/ExprConstant.cpp | 94 +++++++++++++++---------- lib/CodeGen/CodeGenModule.h | 2 +- lib/Parse/ParseExpr.cpp | 2 +- lib/Sema/SemaLookup.cpp | 11 +-- lib/Sema/SemaStmt.cpp | 8 ++- lib/Sema/SemaTemplateVariadic.cpp | 7 ++ test/CodeGen/object-size.c | 19 +++++ test/Parser/cxx1z-fold-expressions.cpp | 9 +++ test/Sema/builtin-object-size.c | 15 ++++ test/SemaCXX/cxx11-inheriting-ctors.cpp | 28 ++++++++ test/SemaObjCXX/blocks.mm | 16 ++++- 14 files changed, 275 insertions(+), 55 deletions(-) diff --git a/docs/AutomaticReferenceCounting.rst b/docs/AutomaticReferenceCounting.rst index aa2a28399d14..fbd1ba4c4d47 100644 --- a/docs/AutomaticReferenceCounting.rst +++ b/docs/AutomaticReferenceCounting.rst @@ -2258,16 +2258,13 @@ non-block type [*]_. Equivalent to the following code: .. code-block:: objc - id objc_storeStrong(id *object, id value) { - value = [value retain]; + void objc_storeStrong(id *object, id value) { id oldValue = *object; + value = [value retain]; *object = value; [oldValue release]; - return value; } -Always returns ``value``. - .. [*] This does not imply that a ``__strong`` object of block type is an invalid argument to this function. 
Rather it implies that an ``objc_retain`` and not an ``objc_retainBlock``
operation will be emitted if the argument is
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 25ea577ba519..08d7a7583f5b 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -17,7 +17,7 @@ Written by the `LLVM Team `_
 Introduction
 ============
 
-This document contains the release notes for the Clang C/C++/Objective-C
+This document contains the release notes for the Clang C/C++/Objective-C/OpenCL
 frontend, part of the LLVM Compiler Infrastructure, release 4.0.0. Here we
 describe the status of Clang in some detail, including major
 improvements from the previous release and new feature work. For the
@@ -139,7 +139,76 @@ Objective-C Language Changes in Clang
 OpenCL C Language Changes in Clang
 ----------------------------------
 
-...
+**The following bugs in the OpenCL header have been fixed:**
+
+* Added the missing ``overloadable`` and ``convergent`` attributes.
+* Removed some erroneous extra ``native_*`` functions.
+
+**The following bugs in the generation of metadata have been fixed:**
+
+* Corrected the SPIR version depending on the OpenCL version.
+* Source level address spaces are taken from the SPIR specification.
+* Image types now contain no access qualifier.
+
+**The following bugs in the AMD target have been fixed:**
+
+* Corrected the bitwidth of ``size_t`` and the NULL pointer value with respect
+  to address spaces.
+* Added the ``cl_khr_subgroups``, ``cl_amd_media_ops`` and
+  ``cl_amd_media_ops2`` extensions.
+* Added ``cl-denorms-are-zero`` support.
+* Changed address spaces for image objects to be ``constant``.
+* Added little-endian support.
+
+**The following bugs in OpenCL 2.0 have been fixed:**
+
+* Fixed the pipe builtin function return type, and added an extra argument to
+  the generated IR intrinsics to propagate size and alignment information of
+  the pipe packed type.
+* Improved the pipe type to accommodate access qualifiers.
+* Added the correct address space to ObjC block generation and to the
+  ``enqueue_kernel`` prototype.
+* Improved handling of integer parameters of the ``enqueue_kernel`` prototype.
+  We now allow ``size_t`` instead of ``int`` for specifying block parameter
+  sizes.
+* Allow using NULL (aka ``CLK_NULL_QUEUE``) with ``queue_t``.
+
+**Improved the following diagnostics:**
+
+* Disallow address spaces other than ``global`` for kernel pointer parameters.
+* Corrected the handling of ``half`` type arguments and of pointer assignment
+  with dereferencing.
+* Disallow variadic arguments in functions and blocks.
+* Allow partial initializers for arrays and structs.
+
+**Some changes to OpenCL extensions have been made:**
+
+* Added ``cl_khr_mipmap_image``.
+* Added the ``-cl-ext`` flag to allow overriding the extensions otherwise
+  enabled by the compilation target (Example: ``-cl-ext=-all,+cl_khr_fp16``).
+* New types and functions can now be flexibly added to extensions using the
+  following pragmas instead of modifying the Clang source code:
+
+  .. code-block:: c
+
+    #pragma OPENCL EXTENSION the_new_extension_name : begin
+    // declare types and functions associated with the extension here
+    #pragma OPENCL EXTENSION the_new_extension_name : end
+
+**Miscellaneous changes:**
+
+* Fixed ``__builtin_astype`` to cast between objects in different address
+  spaces.
+* Allow using ``opencl_unroll_hint`` with OpenCL versions earlier than 2.0.
+* Improved the handling of floating point literals to default to single
+  precision if the fp64 extension is not enabled.
+* Refactored the ``sampler_t`` implementation to simplify the initializer
+  representation, which is now handled as a compiler builtin function with an
+  integer value passed into it.
+* Changed the fake address space map to use the SPIR convention.
+* Added `the OpenCL manual
+  `_ to the Clang
+  documentation.
 
 OpenMP Support in Clang
 ----------------------------------
diff --git a/docs/UsersManual.rst b/docs/UsersManual.rst
index 04023dd20602..6c8355776b7b 100644
--- a/docs/UsersManual.rst
+++ b/docs/UsersManual.rst
@@ -2056,6 +2056,8 @@ can be given manually.
 In this case the kernel code should contain ``#include ``
 just as a regular C include.
 
+.. _opencl_cl_ext:
+
 .. option:: -cl-ext
 
 Disables support of OpenCL extensions. All OpenCL targets provide a list
@@ -2177,6 +2179,41 @@ To enable modules for OpenCL:
 
   $ clang -target spir-unknown-unknown -c -emit-llvm -Xclang -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path= test.cl
 
+OpenCL Extensions
+-----------------
+
+All of the ``cl_khr_*`` extensions from `the official OpenCL specification
+`_
+up to and including version 2.0 are available, and are set per target depending
+on the support available in the specific architecture.
+
+The default extension set for a target can be altered using the ``-cl-ext``
+flag (see the :ref:`flags description ` for more details).
+
+Vendor extensions can be added flexibly by declaring the list of types and
+functions associated with each extension, enclosed within the following
+compiler pragma directives:
+
+  .. code-block:: c
+
+    #pragma OPENCL EXTENSION the_new_extension_name : begin
+    // declare types and functions associated with the extension here
+    #pragma OPENCL EXTENSION the_new_extension_name : end
+
+For example, parsing the following code adds the ``my_t`` type and the
+``my_func`` function to the custom ``my_ext`` extension.
+
+  .. code-block:: c
+
+    #pragma OPENCL EXTENSION my_ext : begin
+    typedef struct {
+      int a;
+    } my_t;
+    void my_func(my_t);
+    #pragma OPENCL EXTENSION my_ext : end
+
+Declaring the same types in different vendor extensions is disallowed.
+
 OpenCL Metadata
 ---------------
 
@@ -2215,7 +2252,7 @@ does not have any effect on the IR.
 For more details refer to the specification
 `_
 
-opencl_hint_unroll
+opencl_unroll_hint
 ^^^^^^^^^^^^^^^^^^
 
 The implementation of this feature mirrors the unroll hint for C.
diff --git a/lib/AST/ExprConstant.cpp b/lib/AST/ExprConstant.cpp
index 6a6baf96ad37..2c0fce91844c 100644
--- a/lib/AST/ExprConstant.cpp
+++ b/lib/AST/ExprConstant.cpp
@@ -604,10 +604,12 @@ namespace {
     /// gets a chance to look at it.
     EM_PotentialConstantExpressionUnevaluated,
 
-    /// Evaluate as a constant expression. Continue evaluating if either:
-    /// - We find a MemberExpr with a base that can't be evaluated.
-    /// - We find a variable initialized with a call to a function that has
-    ///   the alloc_size attribute on it.
+    /// Evaluate as a constant expression. In certain scenarios, if:
+    /// - we find a MemberExpr with a base that can't be evaluated, or
+    /// - we find a variable initialized with a call to a function that has
+    ///   the alloc_size attribute on it,
+    /// then we may consider evaluation to have succeeded.
+ /// /// In either case, the LValue returned shall have an invalid base; in the /// former, the base will be the invalid MemberExpr, in the latter, the /// base will be either the alloc_size CallExpr or a CastExpr wrapping @@ -890,10 +892,6 @@ namespace { return KeepGoing; } - bool allowInvalidBaseExpr() const { - return EvalMode == EM_OffsetFold; - } - class ArrayInitLoopIndex { EvalInfo &Info; uint64_t OuterIndex; @@ -1394,8 +1392,10 @@ static bool Evaluate(APValue &Result, EvalInfo &Info, const Expr *E); static bool EvaluateInPlace(APValue &Result, EvalInfo &Info, const LValue &This, const Expr *E, bool AllowNonLiteralTypes = false); -static bool EvaluateLValue(const Expr *E, LValue &Result, EvalInfo &Info); -static bool EvaluatePointer(const Expr *E, LValue &Result, EvalInfo &Info); +static bool EvaluateLValue(const Expr *E, LValue &Result, EvalInfo &Info, + bool InvalidBaseOK = false); +static bool EvaluatePointer(const Expr *E, LValue &Result, EvalInfo &Info, + bool InvalidBaseOK = false); static bool EvaluateMemberPointer(const Expr *E, MemberPtr &Result, EvalInfo &Info); static bool EvaluateTemporary(const Expr *E, LValue &Result, EvalInfo &Info); @@ -4803,6 +4803,7 @@ class LValueExprEvaluatorBase : public ExprEvaluatorBase { protected: LValue &Result; + bool InvalidBaseOK; typedef LValueExprEvaluatorBase LValueExprEvaluatorBaseTy; typedef ExprEvaluatorBase ExprEvaluatorBaseTy; @@ -4811,9 +4812,14 @@ protected: return true; } + bool evaluatePointer(const Expr *E, LValue &Result) { + return EvaluatePointer(E, Result, this->Info, InvalidBaseOK); + } + public: - LValueExprEvaluatorBase(EvalInfo &Info, LValue &Result) : - ExprEvaluatorBaseTy(Info), Result(Result) {} + LValueExprEvaluatorBase(EvalInfo &Info, LValue &Result, bool InvalidBaseOK) + : ExprEvaluatorBaseTy(Info), Result(Result), + InvalidBaseOK(InvalidBaseOK) {} bool Success(const APValue &V, const Expr *E) { Result.setFrom(this->Info.Ctx, V); @@ -4825,7 +4831,7 @@ public: QualType BaseTy; bool EvalOK; if (E->isArrow()) { - EvalOK = EvaluatePointer(E->getBase(), Result, this->Info); + EvalOK = evaluatePointer(E->getBase(), Result); BaseTy = E->getBase()->getType()->castAs()->getPointeeType(); } else if (E->getBase()->isRValue()) { assert(E->getBase()->getType()->isRecordType()); @@ -4836,7 +4842,7 @@ public: BaseTy = E->getBase()->getType(); } if (!EvalOK) { - if (!this->Info.allowInvalidBaseExpr()) + if (!InvalidBaseOK) return false; Result.setInvalid(E); return true; @@ -4930,8 +4936,8 @@ namespace { class LValueExprEvaluator : public LValueExprEvaluatorBase { public: - LValueExprEvaluator(EvalInfo &Info, LValue &Result) : - LValueExprEvaluatorBaseTy(Info, Result) {} + LValueExprEvaluator(EvalInfo &Info, LValue &Result, bool InvalidBaseOK) : + LValueExprEvaluatorBaseTy(Info, Result, InvalidBaseOK) {} bool VisitVarDecl(const Expr *E, const VarDecl *VD); bool VisitUnaryPreIncDec(const UnaryOperator *UO); @@ -4984,10 +4990,11 @@ public: /// * function designators in C, and /// * "extern void" objects /// * @selector() expressions in Objective-C -static bool EvaluateLValue(const Expr *E, LValue &Result, EvalInfo &Info) { +static bool EvaluateLValue(const Expr *E, LValue &Result, EvalInfo &Info, + bool InvalidBaseOK) { assert(E->isGLValue() || E->getType()->isFunctionType() || E->getType()->isVoidType() || isa(E)); - return LValueExprEvaluator(Info, Result).Visit(E); + return LValueExprEvaluator(Info, Result, InvalidBaseOK).Visit(E); } bool LValueExprEvaluator::VisitDeclRefExpr(const DeclRefExpr *E) { @@ -5148,7 +5155,7 @@ 
bool LValueExprEvaluator::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { if (E->getBase()->getType()->isVectorType()) return Error(E); - if (!EvaluatePointer(E->getBase(), Result, Info)) + if (!evaluatePointer(E->getBase(), Result)) return false; APSInt Index; @@ -5160,7 +5167,7 @@ bool LValueExprEvaluator::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { } bool LValueExprEvaluator::VisitUnaryDeref(const UnaryOperator *E) { - return EvaluatePointer(E->getSubExpr(), Result, Info); + return evaluatePointer(E->getSubExpr(), Result); } bool LValueExprEvaluator::VisitUnaryReal(const UnaryOperator *E) { @@ -5308,7 +5315,7 @@ static bool getBytesReturnedByAllocSizeCall(const ASTContext &Ctx, /// and mark Result's Base as invalid. static bool evaluateLValueAsAllocSize(EvalInfo &Info, APValue::LValueBase Base, LValue &Result) { - if (!Info.allowInvalidBaseExpr() || Base.isNull()) + if (Base.isNull()) return false; // Because we do no form of static analysis, we only support const variables. @@ -5342,17 +5349,27 @@ namespace { class PointerExprEvaluator : public ExprEvaluatorBase { LValue &Result; + bool InvalidBaseOK; bool Success(const Expr *E) { Result.set(E); return true; } + bool evaluateLValue(const Expr *E, LValue &Result) { + return EvaluateLValue(E, Result, Info, InvalidBaseOK); + } + + bool evaluatePointer(const Expr *E, LValue &Result) { + return EvaluatePointer(E, Result, Info, InvalidBaseOK); + } + bool visitNonBuiltinCallExpr(const CallExpr *E); public: - PointerExprEvaluator(EvalInfo &info, LValue &Result) - : ExprEvaluatorBaseTy(info), Result(Result) {} + PointerExprEvaluator(EvalInfo &info, LValue &Result, bool InvalidBaseOK) + : ExprEvaluatorBaseTy(info), Result(Result), + InvalidBaseOK(InvalidBaseOK) {} bool Success(const APValue &V, const Expr *E) { Result.setFrom(Info.Ctx, V); @@ -5399,9 +5416,10 @@ public: }; } // end anonymous namespace -static bool EvaluatePointer(const Expr* E, LValue& Result, EvalInfo &Info) { +static bool EvaluatePointer(const Expr* E, LValue& Result, EvalInfo &Info, + bool InvalidBaseOK) { assert(E->isRValue() && E->getType()->hasPointerRepresentation()); - return PointerExprEvaluator(Info, Result).Visit(E); + return PointerExprEvaluator(Info, Result, InvalidBaseOK).Visit(E); } bool PointerExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { @@ -5414,7 +5432,7 @@ bool PointerExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { if (IExp->getType()->isPointerType()) std::swap(PExp, IExp); - bool EvalPtrOK = EvaluatePointer(PExp, Result, Info); + bool EvalPtrOK = evaluatePointer(PExp, Result); if (!EvalPtrOK && !Info.noteFailure()) return false; @@ -5432,7 +5450,7 @@ bool PointerExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { } bool PointerExprEvaluator::VisitUnaryAddrOf(const UnaryOperator *E) { - return EvaluateLValue(E->getSubExpr(), Result, Info); + return evaluateLValue(E->getSubExpr(), Result); } bool PointerExprEvaluator::VisitCastExpr(const CastExpr* E) { @@ -5466,7 +5484,7 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr* E) { case CK_DerivedToBase: case CK_UncheckedDerivedToBase: - if (!EvaluatePointer(E->getSubExpr(), Result, Info)) + if (!evaluatePointer(E->getSubExpr(), Result)) return false; if (!Result.Base && Result.Offset.isZero()) return true; @@ -5513,7 +5531,7 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr* E) { } case CK_ArrayToPointerDecay: if (SubExpr->isGLValue()) { - if (!EvaluateLValue(SubExpr, Result, Info)) + if (!evaluateLValue(SubExpr, Result)) return false; } 
else { Result.set(SubExpr, Info.CurrentCall->Index); @@ -5530,18 +5548,19 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr* E) { return true; case CK_FunctionToPointerDecay: - return EvaluateLValue(SubExpr, Result, Info); + return evaluateLValue(SubExpr, Result); case CK_LValueToRValue: { LValue LVal; - if (!EvaluateLValue(E->getSubExpr(), LVal, Info)) + if (!evaluateLValue(E->getSubExpr(), LVal)) return false; APValue RVal; // Note, we use the subexpression's type in order to retain cv-qualifiers. if (!handleLValueToRValueConversion(Info, E, E->getSubExpr()->getType(), LVal, RVal)) - return evaluateLValueAsAllocSize(Info, LVal.Base, Result); + return InvalidBaseOK && + evaluateLValueAsAllocSize(Info, LVal.Base, Result); return Success(RVal, E); } } @@ -5586,7 +5605,7 @@ bool PointerExprEvaluator::visitNonBuiltinCallExpr(const CallExpr *E) { if (ExprEvaluatorBaseTy::VisitCallExpr(E)) return true; - if (!(Info.allowInvalidBaseExpr() && getAllocSizeAttr(E))) + if (!(InvalidBaseOK && getAllocSizeAttr(E))) return false; Result.setInvalid(E); @@ -5609,12 +5628,12 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinOp) { switch (BuiltinOp) { case Builtin::BI__builtin_addressof: - return EvaluateLValue(E->getArg(0), Result, Info); + return evaluateLValue(E->getArg(0), Result); case Builtin::BI__builtin_assume_aligned: { // We need to be very careful here because: if the pointer does not have the // asserted alignment, then the behavior is undefined, and undefined // behavior is non-constant. - if (!EvaluatePointer(E->getArg(0), Result, Info)) + if (!evaluatePointer(E->getArg(0), Result)) return false; LValue OffsetResult(Result); @@ -6255,7 +6274,7 @@ class TemporaryExprEvaluator : public LValueExprEvaluatorBase { public: TemporaryExprEvaluator(EvalInfo &Info, LValue &Result) : - LValueExprEvaluatorBaseTy(Info, Result) {} + LValueExprEvaluatorBaseTy(Info, Result, false) {} /// Visit an expression which constructs the value of this temporary. bool VisitConstructExpr(const Expr *E) { @@ -7358,7 +7377,8 @@ static bool tryEvaluateBuiltinObjectSize(const Expr *E, unsigned Type, if (!EvaluateAsRValue(Info, E, RVal)) return false; LVal.setFrom(Info.Ctx, RVal); - } else if (!EvaluatePointer(ignorePointerCastsAndParens(E), LVal, Info)) + } else if (!EvaluatePointer(ignorePointerCastsAndParens(E), LVal, Info, + /*InvalidBaseOK=*/true)) return false; } diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h index 1d72b4edeb13..36f6785fd1b9 100644 --- a/lib/CodeGen/CodeGenModule.h +++ b/lib/CodeGen/CodeGenModule.h @@ -166,7 +166,7 @@ struct ObjCEntrypoints { /// void objc_release(id); llvm::Constant *objc_release; - /// id objc_storeStrong(id*, id); + /// void objc_storeStrong(id*, id); llvm::Constant *objc_storeStrong; /// id objc_storeWeak(id*, id); diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index ee06c76f6024..852e2269393a 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -2408,7 +2408,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, // fold-expressions, we'll need to allow multiple ArgExprs here. 
if (ArgExprs.size() == 1 && isFoldOperator(Tok.getKind()) && NextToken().is(tok::ellipsis)) - return ParseFoldExpression(Result, T); + return ParseFoldExpression(ArgExprs[0], T); ExprType = SimpleExpr; Result = Actions.ActOnParenListExpr(OpenLoc, Tok.getLocation(), diff --git a/lib/Sema/SemaLookup.cpp b/lib/Sema/SemaLookup.cpp index 38a7b8c127cc..e2cb2c8169ce 100644 --- a/lib/Sema/SemaLookup.cpp +++ b/lib/Sema/SemaLookup.cpp @@ -2831,6 +2831,9 @@ Sema::SpecialMemberOverloadResult *Sema::LookupSpecialMember(CXXRecordDecl *RD, assert((SM != CXXDefaultConstructor && SM != CXXDestructor) && "parameter-less special members can't have qualified arguments"); + // FIXME: Get the caller to pass in a location for the lookup. + SourceLocation LookupLoc = RD->getLocation(); + llvm::FoldingSetNodeID ID; ID.AddPointer(RD); ID.AddInteger(SM); @@ -2912,7 +2915,7 @@ Sema::SpecialMemberOverloadResult *Sema::LookupSpecialMember(CXXRecordDecl *RD, VK = VK_RValue; } - OpaqueValueExpr FakeArg(SourceLocation(), ArgType, VK); + OpaqueValueExpr FakeArg(LookupLoc, ArgType, VK); if (SM != CXXDefaultConstructor) { NumArgs = 1; @@ -2926,13 +2929,13 @@ Sema::SpecialMemberOverloadResult *Sema::LookupSpecialMember(CXXRecordDecl *RD, if (VolatileThis) ThisTy.addVolatile(); Expr::Classification Classification = - OpaqueValueExpr(SourceLocation(), ThisTy, + OpaqueValueExpr(LookupLoc, ThisTy, RValueThis ? VK_RValue : VK_LValue).Classify(Context); // Now we perform lookup on the name we computed earlier and do overload // resolution. Lookup is only performed directly into the class since there // will always be a (possibly implicit) declaration to shadow any others. - OverloadCandidateSet OCS(RD->getLocation(), OverloadCandidateSet::CSK_Normal); + OverloadCandidateSet OCS(LookupLoc, OverloadCandidateSet::CSK_Normal); DeclContext::lookup_result R = RD->lookup(Name); if (R.empty()) { @@ -2987,7 +2990,7 @@ Sema::SpecialMemberOverloadResult *Sema::LookupSpecialMember(CXXRecordDecl *RD, } OverloadCandidateSet::iterator Best; - switch (OCS.BestViableFunction(*this, SourceLocation(), Best)) { + switch (OCS.BestViableFunction(*this, LookupLoc, Best)) { case OR_Success: Result->setMethod(cast(Best->Function)); Result->setKind(SpecialMemberOverloadResult::Success); diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp index a8832e9a1c54..390e1b52c8ed 100644 --- a/lib/Sema/SemaStmt.cpp +++ b/lib/Sema/SemaStmt.cpp @@ -2743,15 +2743,17 @@ bool Sema::isCopyElisionCandidate(QualType ReturnType, const VarDecl *VD, // ...automatic... if (!VD->hasLocalStorage()) return false; + // Return false if VD is a __block variable. We don't want to implicitly move + // out of a __block variable during a return because we cannot assume the + // variable will no longer be used. + if (VD->hasAttr()) return false; + if (AllowParamOrMoveConstructible) return true; // ...non-volatile... if (VD->getType().isVolatileQualified()) return false; - // __block variables can't be allocated in a way that permits NRVO. - if (VD->hasAttr()) return false; - // Variables with higher required alignment than their type's ABI // alignment cannot use NRVO. 
if (!VD->getType()->isDependentType() && VD->hasAttr() && diff --git a/lib/Sema/SemaTemplateVariadic.cpp b/lib/Sema/SemaTemplateVariadic.cpp index 54556b505ee0..725a3e425201 100644 --- a/lib/Sema/SemaTemplateVariadic.cpp +++ b/lib/Sema/SemaTemplateVariadic.cpp @@ -1014,6 +1014,11 @@ ExprResult Sema::ActOnCXXFoldExpr(SourceLocation LParenLoc, Expr *LHS, CheckFoldOperand(*this, LHS); CheckFoldOperand(*this, RHS); + auto DiscardOperands = [&] { + CorrectDelayedTyposInExpr(LHS); + CorrectDelayedTyposInExpr(RHS); + }; + // [expr.prim.fold]p3: // In a binary fold, op1 and op2 shall be the same fold-operator, and // either e1 shall contain an unexpanded parameter pack or e2 shall contain @@ -1021,6 +1026,7 @@ ExprResult Sema::ActOnCXXFoldExpr(SourceLocation LParenLoc, Expr *LHS, if (LHS && RHS && LHS->containsUnexpandedParameterPack() == RHS->containsUnexpandedParameterPack()) { + DiscardOperands(); return Diag(EllipsisLoc, LHS->containsUnexpandedParameterPack() ? diag::err_fold_expression_packs_both_sides @@ -1034,6 +1040,7 @@ ExprResult Sema::ActOnCXXFoldExpr(SourceLocation LParenLoc, Expr *LHS, if (!LHS || !RHS) { Expr *Pack = LHS ? LHS : RHS; assert(Pack && "fold expression with neither LHS nor RHS"); + DiscardOperands(); if (!Pack->containsUnexpandedParameterPack()) return Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs) << Pack->getSourceRange(); diff --git a/test/CodeGen/object-size.c b/test/CodeGen/object-size.c index fe4c1859a272..a824f554b5f4 100644 --- a/test/CodeGen/object-size.c +++ b/test/CodeGen/object-size.c @@ -549,3 +549,22 @@ int incomplete_and_function_types() { // CHECK: store i32 0 gi = __builtin_object_size(incomplete_char_array, 3); } + +// Flips between the pointer and lvalue evaluator a lot. +void deeply_nested() { + struct { + struct { + struct { + struct { + int e[2]; + char f; // Inhibit our writing-off-the-end check + } d[2]; + } c[2]; + } b[2]; + } *a; + + // CHECK: store i32 4 + gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 1); + // CHECK: store i32 4 + gi = __builtin_object_size(&a->b[1].c[1].d[1].e[1], 3); +} diff --git a/test/Parser/cxx1z-fold-expressions.cpp b/test/Parser/cxx1z-fold-expressions.cpp index 030638583239..b1f7318e410d 100644 --- a/test/Parser/cxx1z-fold-expressions.cpp +++ b/test/Parser/cxx1z-fold-expressions.cpp @@ -34,3 +34,12 @@ template int bad9() { return (3 + ... * N); } // expected-error {{oper template int bad10() { return (3 ? ... : N); } // expected-error +{{}} expected-note {{to match}} template int bad11() { return (N + ... 0); } // expected-error {{expected a foldable binary operator}} expected-error {{expected expression}} template int bad12() { return (... N); } // expected-error {{expected expression}} + +template void as_operand_of_cast(int a, T ...t) { + return + (int)(a + ... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}} + (int)(t + ... + undeclared_junk) + // expected-error {{undeclared}} + (int)(... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}} + (int)(undeclared_junk + ...) 
+ // expected-error {{undeclared}} + (int)(a + ...); // expected-error {{does not contain any unexpanded}} +} diff --git a/test/Sema/builtin-object-size.c b/test/Sema/builtin-object-size.c index 14674c66f3a6..300c739bbd14 100644 --- a/test/Sema/builtin-object-size.c +++ b/test/Sema/builtin-object-size.c @@ -76,3 +76,18 @@ int pr28314(void) { a += __builtin_object_size(p3->b, 0); return a; } + +int pr31843() { + int n = 0; + + struct { int f; } a; + int b; + n += __builtin_object_size(({&(b ? &a : &a)->f; pr31843;}), 0); // expected-warning{{expression result unused}} + + struct statfs { char f_mntonname[1024];}; + struct statfs *outStatFSBuf; + n += __builtin_object_size(outStatFSBuf->f_mntonname ? "" : "", 1); // expected-warning{{address of array}} + n += __builtin_object_size(outStatFSBuf->f_mntonname ?: "", 1); + + return n; +} diff --git a/test/SemaCXX/cxx11-inheriting-ctors.cpp b/test/SemaCXX/cxx11-inheriting-ctors.cpp index c9e01188fd2e..7d6f4f09f09c 100644 --- a/test/SemaCXX/cxx11-inheriting-ctors.cpp +++ b/test/SemaCXX/cxx11-inheriting-ctors.cpp @@ -105,3 +105,31 @@ namespace PR31606 { // Note, we do *not* allow operator=='s argument to use the inherited A::A(Base&&) constructor to construct from B{}. bool b = A{} == B{}; // expected-error {{invalid operands}} } + +namespace implicit_member_srcloc { + template + struct S3 { + }; + + template + struct S2 { + S2(S3 &&); + }; + + template + struct S1 : S2 { + using S2::S2; + S1(); + }; + + template + struct S0 { + S0(); + S0(S0&&) = default; + S1 m1; + }; + + void foo1() { + S0 s0; + } +} diff --git a/test/SemaObjCXX/blocks.mm b/test/SemaObjCXX/blocks.mm index 09d614d37287..3f901cc0a840 100644 --- a/test/SemaObjCXX/blocks.mm +++ b/test/SemaObjCXX/blocks.mm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -Wno-objc-root-class %s +// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -Wno-objc-root-class -std=c++11 %s @protocol NSObject; void bar(id(^)(void)); @@ -144,3 +144,17 @@ namespace DependentReturn { template void f(X); } + +namespace MoveBlockVariable { +struct B0 { +}; + +struct B1 { // expected-note 2 {{candidate constructor (the implicit}} + B1(B0&&); // expected-note {{candidate constructor not viable}} +}; + +B1 test_move() { + __block B0 b; + return b; // expected-error {{no viable conversion from returned value of type 'MoveBlockVariable::B0' to function return type 'MoveBlockVariable::B1'}} +} +} From abacad30a54c59ad437ccf54ec5236a8dd7f3ba9 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 17 Feb 2017 19:37:28 +0000 Subject: [PATCH 3/4] Vendor import of compiler-rt release_40 branch r295380: https://llvm.org/svn/llvm-project/compiler-rt/branches/release_40@295380 --- lib/builtins/arm/subsf3vfp.S | 2 +- .../sanitizer_platform_limits_posix.cc | 7 +------ test/builtins/Unit/clear_cache_test.c | 16 ++++++++++++++-- test/builtins/Unit/fixunsdfdi_test.c | 3 --- test/builtins/Unit/fixunssfdi_test.c | 2 -- test/lsan/TestCases/strace_test.cc | 1 + test/lsan/lit.common.cfg | 5 +++++ 7 files changed, 22 insertions(+), 14 deletions(-) diff --git a/lib/builtins/arm/subsf3vfp.S b/lib/builtins/arm/subsf3vfp.S index a9f3ba9422cf..3e83ea26507d 100644 --- a/lib/builtins/arm/subsf3vfp.S +++ b/lib/builtins/arm/subsf3vfp.S @@ -21,7 +21,7 @@ DEFINE_COMPILERRT_FUNCTION(__subsf3vfp) #if defined(COMPILER_RT_ARMHF_TARGET) vsub.f32 s0, s0, s1 -#elsee +#else vmov s14, r0 // move first param from r0 into float register vmov s15, r1 // move second param from r1 into float register vsub.f32 s14, s14, s15 diff --git 
a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc index fbde5e17dc63..683f019d70c3 100644 --- a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc +++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc @@ -23,11 +23,6 @@ #ifdef _FILE_OFFSET_BITS #undef _FILE_OFFSET_BITS #endif -#if SANITIZER_FREEBSD -#define _WANT_RTENTRY -#include -#include -#endif #include #include #include @@ -433,6 +428,7 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned struct_input_absinfo_sz = sizeof(struct input_absinfo); unsigned struct_input_id_sz = sizeof(struct input_id); unsigned struct_mtpos_sz = sizeof(struct mtpos); + unsigned struct_rtentry_sz = sizeof(struct rtentry); unsigned struct_termio_sz = sizeof(struct termio); unsigned struct_vt_consize_sz = sizeof(struct vt_consize); unsigned struct_vt_sizes_sz = sizeof(struct vt_sizes); @@ -452,7 +448,6 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned struct_midi_info_sz = sizeof(struct midi_info); unsigned struct_mtget_sz = sizeof(struct mtget); unsigned struct_mtop_sz = sizeof(struct mtop); - unsigned struct_rtentry_sz = sizeof(struct rtentry); unsigned struct_sbi_instrument_sz = sizeof(struct sbi_instrument); unsigned struct_seq_event_rec_sz = sizeof(struct seq_event_rec); unsigned struct_synth_info_sz = sizeof(struct synth_info); diff --git a/test/builtins/Unit/clear_cache_test.c b/test/builtins/Unit/clear_cache_test.c index 3c893018545f..0ef704fcde88 100644 --- a/test/builtins/Unit/clear_cache_test.c +++ b/test/builtins/Unit/clear_cache_test.c @@ -18,9 +18,20 @@ void __clear_cache(void* start, void* end) if (!FlushInstructionCache(GetCurrentProcess(), start, end-start)) exit(1); } + +static uintptr_t get_page_size() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +} #else +#include #include extern void __clear_cache(void* start, void* end); + +static uintptr_t get_page_size() { + return sysconf(_SC_PAGE_SIZE); +} #endif @@ -56,8 +67,9 @@ unsigned char execution_buffer[128]; int main() { // make executable the page containing execution_buffer - char* start = (char*)((uintptr_t)execution_buffer & (-4095)); - char* end = (char*)((uintptr_t)(&execution_buffer[128+4096]) & (-4095)); + uintptr_t page_size = get_page_size(); + char* start = (char*)((uintptr_t)execution_buffer & (-page_size)); + char* end = (char*)((uintptr_t)(&execution_buffer[128+page_size]) & (-page_size)); #if defined(_WIN32) DWORD dummy_oldProt; MEMORY_BASIC_INFORMATION b; diff --git a/test/builtins/Unit/fixunsdfdi_test.c b/test/builtins/Unit/fixunsdfdi_test.c index 1ddc5340b03d..3998482876f3 100644 --- a/test/builtins/Unit/fixunsdfdi_test.c +++ b/test/builtins/Unit/fixunsdfdi_test.c @@ -95,9 +95,6 @@ int main() if (test__fixunsdfdi(0x1.FFFFFFFFFFFFEp+62, 0x7FFFFFFFFFFFF800LL)) return 1; - if (test__fixunsdfdi(0x1.p+64, 0xFFFFFFFFFFFFFFFFLL)) - return 1; - #if !TARGET_LIBGCC if (test__fixunsdfdi(-0x1.FFFFFFFFFFFFFp+62, 0)) return 1; diff --git a/test/builtins/Unit/fixunssfdi_test.c b/test/builtins/Unit/fixunssfdi_test.c index 166153cb5b51..812457a002de 100644 --- a/test/builtins/Unit/fixunssfdi_test.c +++ b/test/builtins/Unit/fixunssfdi_test.c @@ -79,8 +79,6 @@ int main() return 1; if (test__fixunssfdi(0x1.000000p+63F, 0x8000000000000000LL)) return 1; - if (test__fixunssfdi(0x1.000000p+64F, 0xFFFFFFFFFFFFFFFFLL)) - return 1; if (test__fixunssfdi(0x1.FFFFFEp+62F, 0x7FFFFF8000000000LL)) return 1; if (test__fixunssfdi(0x1.FFFFFCp+62F, 0x7FFFFF0000000000LL)) diff --git 
a/test/lsan/TestCases/strace_test.cc b/test/lsan/TestCases/strace_test.cc index b3568d0b44e8..b25e05753848 100644 --- a/test/lsan/TestCases/strace_test.cc +++ b/test/lsan/TestCases/strace_test.cc @@ -1,4 +1,5 @@ // Test that lsan reports a proper error when running under strace. +// REQUIRES: strace // RUN: %clangxx_lsan %s -o %t // RUN: not strace -o /dev/null %run %t 2>&1 | FileCheck %s diff --git a/test/lsan/lit.common.cfg b/test/lsan/lit.common.cfg index 6002e2d69444..8580eec33d28 100644 --- a/test/lsan/lit.common.cfg +++ b/test/lsan/lit.common.cfg @@ -4,6 +4,8 @@ import os +import lit.util + def get_required_attr(config, attr_name): attr_value = getattr(config, attr_name, None) if attr_value == None: @@ -29,6 +31,9 @@ else: lit_config.fatal("Unknown LSan test mode: %r" % lsan_lit_test_mode) config.name += config.name_suffix +if lit.util.which('strace'): + config.available_features.add('strace') + clang_cflags = ["-O0", config.target_cflags] + config.debug_info_flags clang_cxxflags = config.cxx_mode_flags + clang_cflags lsan_incdir = config.test_source_root + "/../" From ef26f2ece87c282e68a15ed703f0a56869e419b2 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 17 Feb 2017 19:38:36 +0000 Subject: [PATCH 4/4] Vendor import of libc++ release_40 branch r295380: https://llvm.org/svn/llvm-project/libcxx/branches/release_40@295380 --- .../fs.op.hard_lk_ct/hard_link_count.pass.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/std/experimental/filesystem/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp b/test/std/experimental/filesystem/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp index 7537ac20c757..6b542a5b67a5 100644 --- a/test/std/experimental/filesystem/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp +++ b/test/std/experimental/filesystem/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp @@ -57,15 +57,19 @@ TEST_CASE(hard_link_count_for_directory) Dir3Expect = 3; // . .. file5 #endif TEST_CHECK(hard_link_count(StaticEnv::Dir) == DirExpect || - hard_link_count(StaticEnv::Dir) == DirExpectAlt); + hard_link_count(StaticEnv::Dir) == DirExpectAlt || + hard_link_count(StaticEnv::Dir) == 1); TEST_CHECK(hard_link_count(StaticEnv::Dir3) == Dir3Expect || - hard_link_count(StaticEnv::Dir3) == Dir3ExpectAlt); + hard_link_count(StaticEnv::Dir3) == Dir3ExpectAlt || + hard_link_count(StaticEnv::Dir3) == 1); std::error_code ec; TEST_CHECK(hard_link_count(StaticEnv::Dir, ec) == DirExpect || - hard_link_count(StaticEnv::Dir, ec) == DirExpectAlt); + hard_link_count(StaticEnv::Dir, ec) == DirExpectAlt || + hard_link_count(StaticEnv::Dir) == 1); TEST_CHECK(hard_link_count(StaticEnv::Dir3, ec) == Dir3Expect || - hard_link_count(StaticEnv::Dir3, ec) == Dir3ExpectAlt); + hard_link_count(StaticEnv::Dir3, ec) == Dir3ExpectAlt || + hard_link_count(StaticEnv::Dir3) == 1); } TEST_CASE(hard_link_count_increments_test) {