From d9c9bd8485071afb22adcd2bb08f6a8e5e587ed6 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 24 Feb 2016 21:32:58 +0000 Subject: [PATCH 1/2] Vendor import of llvm release_38 branch r261684: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261684 --- lib/CodeGen/RegAllocFast.cpp | 10 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 12 + lib/Target/AArch64/AArch64FrameLowering.h | 2 + lib/Target/PowerPC/PPCFrameLowering.cpp | 205 ++++++++++++++---- lib/Target/PowerPC/PPCFrameLowering.h | 35 ++- lib/Target/X86/X86ISelLowering.cpp | 34 +++ lib/Target/X86/X86ISelLowering.h | 3 + lib/Target/X86/X86InstrCompiler.td | 4 +- .../AArch64/aarch64-dynamic-stack-layout.ll | 2 +- test/CodeGen/AArch64/arm64-shrink-wrapping.ll | 85 ++++++++ test/CodeGen/ARM/Windows/alloca.ll | 4 +- test/CodeGen/PowerPC/pr26690.ll | 118 ++++++++++ test/CodeGen/X86/i386-tlscall-fastregalloc.ll | 26 +++ test/CodeGen/X86/tls-shrink-wrapping.ll | 60 +++++ 14 files changed, 540 insertions(+), 60 deletions(-) create mode 100644 test/CodeGen/PowerPC/pr26690.ll create mode 100644 test/CodeGen/X86/i386-tlscall-fastregalloc.ll create mode 100644 test/CodeGen/X86/tls-shrink-wrapping.ll diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index f4c076fea0e7..8d7a7213ba07 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -1002,11 +1002,13 @@ void RAFast::AllocateBasicBlock() { unsigned DefOpEnd = MI->getNumOperands(); if (MI->isCall()) { - // Spill all virtregs before a call. This serves two purposes: 1. If an + // Spill all virtregs before a call. This serves one purpose: if an // exception is thrown, the landing pad is going to expect to find - // registers in their spill slots, and 2. we don't have to wade through - // all the operands on the call instruction. - DefOpEnd = VirtOpEnd; + // registers in their spill slots. + // Note: although it is appealing to just consider all definitions + // as call-clobbered, this is not correct because some of those + // definitions may be used later on and we do not want to reuse + // those for virtual registers in between. DEBUG(dbgs() << " Spilling remaining registers before call.\n"); spillAll(MI); diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 11ae8005370d..3f63d049c34e 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -275,6 +275,18 @@ static bool isCSSave(MachineInstr *MBBI) { MBBI->getOpcode() == AArch64::STPDpre; } +bool AArch64FrameLowering::canUseAsPrologue( + const MachineBasicBlock &MBB) const { + const MachineFunction *MF = MBB.getParent(); + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + // Don't need a scratch register if we're not going to re-align the stack. + // Otherwise, we may need a scratch register to be available and we do not + // support that for now.
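+ // (Realignment computes the aligned SP in a scratch GPR before moving it + // into SP, as in the "sub x9, sp" / "and sp, x9" sequences in the tests + // below, and a shrink-wrapped prologue block is not guaranteed to have + // such a register free.)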
+ return !RegInfo->needsStackRealignment(*MF); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 427afdf4acbf..7d8354c38787 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -37,6 +37,8 @@ class AArch64FrameLowering : public TargetFrameLowering { void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index beab844c6025..3fd509ae27f4 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -556,16 +556,42 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { } } -bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, - bool UseAtEnd, - unsigned *ScratchRegister) const { +/* This function will do the following: + - If MBB is an entry or exit block, set SR1 and SR2 to R0 and R12 + respectively (defaults recommended by the ABI) and return true + - If MBB is not an entry block, initialize the register scavenger and look + for available registers. + - If the defaults (R0/R12) are available, return true + - If TwoUniqueRegsRequired is set to true, it looks for two unique + registers. Otherwise, look for a single available register. + - If the required registers are found, set SR1 and SR2 and return true. + - If the required registers are not found, set SR2 or both SR1 and SR2 to + PPC::NoRegister and return false. + + Note that if both SR1 and SR2 are valid parameters and TwoUniqueRegsRequired + is not set, this function will attempt to find two different registers, but + still return true if only one register is available (and set SR1 == SR2). +*/ +bool +PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + bool TwoUniqueRegsRequired, + unsigned *SR1, + unsigned *SR2) const { RegScavenger RS; - unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + unsigned R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12; - if (ScratchRegister) - *ScratchRegister = R0; + // Set the defaults for the two scratch registers. + if (SR1) + *SR1 = R0; - // If MBB is an entry or exit block, use R0 as the scratch register + if (SR2) { + assert (SR1 && "Asking for the second scratch register but not the first?"); + *SR2 = R12; + } + + // If MBB is an entry or exit block, use R0 and R12 as the scratch registers. 
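+ // (These are the ABI-recommended defaults noted above, so no scavenging is + // needed in the entry and return blocks.)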
if ((UseAtEnd && MBB->isReturnBlock()) || (!UseAtEnd && (&MBB->getParent()->front() == MBB))) return true; @@ -573,8 +599,8 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, RS.enterBasicBlock(MBB); if (UseAtEnd && !MBB->empty()) { - // The scratch register will be used at the end of the block, so must consider - // all registers used within the block + // The scratch register will be used at the end of the block, so we must + // consider all registers used within the block. MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); // If no terminator, back iterator up to previous instruction. @@ -584,35 +610,86 @@ bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, if (MBBI != MBB->begin()) RS.forward(MBBI); } - - if (!RS.isRegUsed(R0)) + + // If the two registers are available, we're all good. + // Note that we only return here if both R0 and R12 are available because + // although the function may not require two unique registers, it may benefit + // from having two, so we should try to provide them. + if (!RS.isRegUsed(R0) && !RS.isRegUsed(R12)) return true; - unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? &PPC::G8RCRegClass - : &PPC::GPRCRegClass); - - // Make sure the register scavenger was able to find an available register - // If not, use R0 but return false to indicate no register was available and - // R0 must be used (as recommended by the ABI) - if (Reg == 0) + // Get the list of callee-saved registers for the target. + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent()); + + // Get all the available registers in the block. + BitVector BV = RS.getRegsAvailable(Subtarget.isPPC64() ? &PPC::G8RCRegClass : + &PPC::GPRCRegClass); + + // We shouldn't use callee-saved registers as scratch registers because they + // may be available when looking for a candidate block for shrink wrapping + // but not available when the actual prologue/epilogue is being emitted, + // since they were added as live-in to the prologue block by + // PrologueEpilogueInserter. + for (int i = 0; CSRegs[i]; ++i) + BV.reset(CSRegs[i]); + + // Set the first scratch register to the first available one. + if (SR1) { + int FirstScratchReg = BV.find_first(); + *SR1 = FirstScratchReg == -1 ? (unsigned)PPC::NoRegister : FirstScratchReg; + } + + // If there is another one available, set the second scratch register to that. + // Otherwise, set it to PPC::NoRegister if this function requires two unique + // registers, or to whatever SR1 is set to if it doesn't. + if (SR2) { + int SecondScratchReg = BV.find_next(*SR1); + if (SecondScratchReg != -1) + *SR2 = SecondScratchReg; + else + *SR2 = TwoUniqueRegsRequired ? (unsigned)PPC::NoRegister : *SR1; + } + + // Now that we've done our best to provide both registers, double-check + // whether we were able to provide enough. + if (BV.count() < (TwoUniqueRegsRequired ? 2 : 1)) return false; - if (ScratchRegister) - *ScratchRegister = Reg; - return true; } +// We need a scratch register for spilling LR and for spilling CR. By default, +// we use two scratch registers to hide latency. However, if only one scratch +// register is available, we can adjust for that by not overlapping the spill +// code. If, however, we need to realign the stack (i.e. have a base pointer) +// and the stack frame is large, we need two scratch registers.
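+ // (In that case one scratch register holds the realigned-SP computation + // while the other materializes the large frame size with a lis/ori pair; + // see the large-frame path in emitPrologue below.)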
+bool +PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const { + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); + MachineFunction &MF = *(MBB->getParent()); + bool HasBP = RegInfo->hasBasePointer(MF); + unsigned FrameSize = determineFrameLayout(MF, false); + int NegFrameSize = -FrameSize; + bool IsLargeFrame = !isInt<16>(NegFrameSize); + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + + return IsLargeFrame && HasBP && MaxAlign > 1; +} + bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); - return findScratchRegister(TmpMBB, false, nullptr); + return findScratchRegister(TmpMBB, false, + twoUniqueScratchRegsRequired(TmpMBB)); } bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); - return findScratchRegister(TmpMBB, true, nullptr); + return findScratchRegister(TmpMBB, true); } void PPCFrameLowering::emitPrologue(MachineFunction &MF, @@ -664,6 +741,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); bool MustSaveLR = FI->mustSaveLR(); const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + bool MustSaveCR = !MustSaveCRs.empty(); // Do we have a frame pointer and/or base pointer for this function? bool HasFP = hasFP(MF); bool HasBP = RegInfo->hasBasePointer(MF); @@ -701,9 +779,15 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); - findScratchRegister(&MBB, false, &ScratchReg); - assert(ScratchReg && "No scratch register!"); - + // Using the same bool variable as below to suppress compiler warnings. + bool SingleScratchReg = + findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB), + &ScratchReg, &TempReg); + assert(SingleScratchReg && + "Required number of registers not available in this block"); + + SingleScratchReg = ScratchReg == TempReg; + int LROffset = getReturnSaveOffset(); int FPOffset = 0; @@ -748,13 +832,30 @@ // indexed into with a simple STDU/STWU/STD/STW immediate offset operand. bool isLargeFrame = !isInt<16>(NegFrameSize); + assert((isPPC64 || !MustSaveCR) && + "Prologue CR saving supported only in 64-bit mode"); + + // If we need to spill the CR and the LR but we don't have two separate + // registers available, we must spill them one at a time. + if (MustSaveCR && SingleScratchReg && MustSaveLR) { + // FIXME: In the ELFv2 ABI, we are not required to save all CR fields. + // If only one or two CR fields are clobbered, it could be more + // efficient to use mfocrf to selectively save just those fields.
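+ // The whole CR save (mfcr plus store) is emitted first so that the single + // scratch register is dead again before the LR save below reuses it.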
+ MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg); + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill); + BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) + .addReg(TempReg, getKillRegState(true)) + .addImm(8) + .addReg(SPReg); + } + if (MustSaveLR) BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg); - assert((isPPC64 || MustSaveCRs.empty()) && - "Prologue CR saving supported only in 64-bit mode"); - - if (!MustSaveCRs.empty()) { // will only occur for PPC64 + if (MustSaveCR && + !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64 // FIXME: In the ELFv2 ABI, we are not required to save all CR fields. // If only one or two CR fields are clobbered, it could be more // efficient to use mfocrf to selectively save just those fields. @@ -792,7 +893,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addImm(LROffset) .addReg(SPReg); - if (!MustSaveCRs.empty()) // will only occur for PPC64 + if (MustSaveCR && + !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) .addReg(TempReg, getKillRegState(true)) .addImm(8) @@ -811,6 +913,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addReg(SPReg); } + // This condition must be kept in sync with canUseAsPrologue. if (HasBP && MaxAlign > 1) { if (isPPC64) BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) @@ -828,6 +931,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, .addReg(ScratchReg, RegState::Kill) .addImm(NegFrameSize); } else { + assert(!SingleScratchReg && "Only a single scratch reg available"); BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) .addImm(NegFrameSize >> 16); BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) @@ -951,7 +1055,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, // For SVR4, don't emit a move for the CR spill slot if we haven't // spilled CRs. if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) - && MustSaveCRs.empty()) + && !MustSaveCR) continue; // For 64-bit SVR4 when we have spilled CRs, the spill location @@ -1005,6 +1109,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); bool MustSaveLR = FI->mustSaveLR(); const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + bool MustSaveCR = !MustSaveCRs.empty(); // Do we have a frame pointer and/or base pointer for this function? bool HasFP = hasFP(MF); bool HasBP = RegInfo->hasBasePointer(MF); @@ -1026,14 +1131,19 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, : PPC::ADDI ); const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8 : PPC::ADD4 ); - + int LROffset = getReturnSaveOffset(); int FPOffset = 0; - findScratchRegister(&MBB, true, &ScratchReg); - assert(ScratchReg && "No scratch register!"); - + // Using the same bool variable as below to suppress compiler warnings. + bool SingleScratchReg = findScratchRegister(&MBB, true, false, &ScratchReg, + &TempReg); + assert(SingleScratchReg && + "Could not find an available scratch register"); + + SingleScratchReg = ScratchReg == TempReg; + if (HasFP) { if (isSVR4ABI) { MachineFrameInfo *FFI = MF.getFrameInfo(); @@ -1130,15 +1240,27 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, } } + assert((isPPC64 || !MustSaveCR) && + "Epilogue CR restoring supported only in 64-bit mode"); + + // If we need to restore both the LR and the CR and we only have one + // available scratch register, we must do them one at a time.
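+ // As in the prologue, the CR reload completes (killing the scratch + // register) before the LR reload below reuses it.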
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) { + BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) + .addImm(8) + .addReg(SPReg); + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) + .addReg(TempReg, getKillRegState(i == e-1)); + } + if (MustSaveLR) BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) .addImm(LROffset) .addReg(SPReg); - assert((isPPC64 || MustSaveCRs.empty()) && - "Epilogue CR restoring supported only in 64-bit mode"); - - if (!MustSaveCRs.empty()) // will only occur for PPC64 + if (MustSaveCR && + !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) .addImm(8) .addReg(SPReg); @@ -1160,7 +1282,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addImm(BPOffset) .addReg(SPReg); - if (!MustSaveCRs.empty()) // will only occur for PPC64 + if (MustSaveCR && + !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64 for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) .addReg(TempReg, getKillRegState(i == e-1)); diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index bbe1329a5352..f1f3f0b831a7 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -30,28 +30,41 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned BasePointerSaveOffset; /** - * \brief Find a register that can be used in function prologue and epilogue + * \brief Find register[s] that can be used in function prologue and epilogue * - * Find a register that can be use as the scratch register in function + * Find register[s] that can be used as scratch register[s] in function * prologue and epilogue to save various registers (Link Register, Base - * Pointer, etc.). Prefer R0, if it is available. If it is not available, - * then choose a different register. + * Pointer, etc.). Prefer R0/R12, if available. Otherwise choose whatever + * register[s] are available. * - * This method will return true if an available register was found (including - * R0). If no available registers are found, the method returns false and sets - * ScratchRegister to R0, as per the recommendation in the ABI. + * This method will return true if it is able to find enough unique scratch + * registers (1 or 2 depending on the requirement). If it is unable to find + * enough available registers in the block, it will return false and set + * any passed output parameter that corresponds to a required unique register + * to PPC::NoRegister. * * \param[in] MBB The machine basic block to find an available register for * \param[in] UseAtEnd Specify whether the scratch register will be used at * the end of the basic block (i.e., will the scratch * register kill a register defined in the basic block) - * \param[out] ScratchRegister The scratch register to use - * \return true if a scratch register was found. false of a scratch register - * was not found and R0 is being used as the default. + * \param[in] TwoUniqueRegsRequired Specify whether this basic block will + * require two unique scratch registers. + * \param[out] SR1 The scratch register to use + * \param[out] SR2 The second scratch register. If this pointer is not null + * the function will attempt to set it to an available + * register regardless of whether there is a hard requirement + * for two unique scratch registers.
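+ * (Providing the second register even when only one is required lets the + * prologue overlap the CR and LR spill code to hide latency.)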
+ * \return true if the required number of registers was found. + * false if the required number of scratch registers wasn't available. + * If either output parameter refers to a required scratch register + * that isn't available, it will be set to an invalid value. */ bool findScratchRegister(MachineBasicBlock *MBB, bool UseAtEnd, - unsigned *ScratchRegister) const; + bool TwoUniqueRegsRequired = false, + unsigned *SR1 = nullptr, + unsigned *SR2 = nullptr) const; + bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const; public: PPCFrameLowering(const PPCSubtarget &STI); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c12a3ed43d29..dd9966f9e179 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22227,6 +22227,35 @@ X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, return BB; } +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSAddr(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Here we replace TLSADDR with the sequence: + // adjust_stackdown -> TLSADDR -> adjust_stackup. + // We need this because TLSADDR is lowered into calls + // inside MC, therefore without the two markers shrink-wrapping + // may push the prologue/epilogue past them. + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction &MF = *BB->getParent(); + + // Emit CALLSEQ_START right before the instruction. + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + MachineInstrBuilder CallseqStart = + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0); + BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); + + // Emit CALLSEQ_END right after the instruction. + // We don't call eraseFromParent() because we want to keep the + // original instruction around.
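+ // (The TLSADDR pseudo itself is still expanded into the real call during + // MC lowering, as noted above; only the bracketing markers are added here.)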
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + MachineInstrBuilder CallseqEnd = + BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); + BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); + + return BB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -22607,6 +22636,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TCRETURNri64: case X86::TCRETURNmi64: return BB; + case X86::TLS_addr32: + case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + return EmitLoweredTLSAddr(MI, BB); case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::CATCHRET: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0ab786e08e02..b67958a9c498 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1129,6 +1129,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 96a29ca8c370..c709c8aca9fa 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -436,7 +436,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [ESP] in { + usesCustomInserter = 1, Uses = [ESP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, @@ -456,7 +456,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [RSP] in { + usesCustomInserter = 1, Uses = [RSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll index 1820b8163a90..90093f94d0ad 100644 --- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -522,10 +522,10 @@ bb1: ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). -; CHECK: tbz {{.*}} .[[LABEL:.*]] ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp +; CHECK: tbz {{.*}} .[[LABEL:.*]] ; Stack is realigned in a non-entry BB. ; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index 2ecd66ddf5d4..4d751f501d4a 100644 --- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -630,3 +630,88 @@ loop2b: ; preds = %loop1 end: ret void } + +; Don't do shrink-wrapping when we need to re-align the stack pointer. +; See bug 26642. +; CHECK-LABEL: stack_realign: +; CHECK-NOT: lsl w[[LSL1:[0-9]+]], w0, w1 +; CHECK-NOT: lsl w[[LSL2:[0-9]+]], w1, w0 +; CHECK: stp x29, x30, [sp, #-16]! 
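+; (The CHECK-NOT lines above ensure the shifts do not appear before the +; frame setup, i.e. that shrink wrapping did not sink the prologue.)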
+; CHECK: mov x29, sp +; CHECK: sub x{{[0-9]+}}, sp, #16 +; CHECK-DAG: lsl w[[LSL1:[0-9]+]], w0, w1 +; CHECK-DAG: lsl w[[LSL2:[0-9]+]], w1, w0 +; CHECK-DAG: str w[[LSL1]], +; CHECK-DAG: str w[[LSL2]], + +define i32 @stack_realign(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) { + %tmp = alloca i32, align 32 + %shl1 = shl i32 %a, %b + %shl2 = shl i32 %b, %a + %tmp2 = icmp slt i32 %a, %b + br i1 %tmp2, label %true, label %false + +true: + store i32 %a, i32* %tmp, align 4 + %tmp4 = load i32, i32* %tmp + br label %false + +false: + %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ] + store i32 %shl1, i32* %ptr1 + store i32 %shl2, i32* %ptr2 + ret i32 %tmp.0 +} + +; Re-aligned stack pointer with all caller-save regs live. See bug +; 26642. In this case we currently avoid shrink wrapping because +; ensuring we have a scratch register to re-align the stack pointer is +; too complicated. Output should be the same for both enabled and +; disabled shrink wrapping. +; CHECK-LABEL: stack_realign2: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #-{{[0-9]+}}]! +; CHECK: add x29, sp, #{{[0-9]+}} +; CHECK: lsl {{w[0-9]+}}, w0, w1 + +define void @stack_realign2(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2, i32* %ptr3, i32* %ptr4, i32* %ptr5, i32* %ptr6) { + %tmp = alloca i32, align 32 + %tmp1 = shl i32 %a, %b + %tmp2 = shl i32 %b, %a + %tmp3 = lshr i32 %a, %b + %tmp4 = lshr i32 %b, %a + %tmp5 = add i32 %b, %a + %tmp6 = sub i32 %b, %a + %tmp7 = add i32 %tmp1, %tmp2 + %tmp8 = sub i32 %tmp2, %tmp3 + %tmp9 = add i32 %tmp3, %tmp4 + %tmp10 = add i32 %tmp4, %tmp5 + %cmp = icmp slt i32 %a, %b + br i1 %cmp, label %true, label %false + +true: + store i32 %a, i32* %tmp, align 4 + call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28}"() nounwind + br label %false + +false: + store i32 %tmp1, i32* %ptr1, align 4 + store i32 %tmp2, i32* %ptr2, align 4 + store i32 %tmp3, i32* %ptr3, align 4 + store i32 %tmp4, i32* %ptr4, align 4 + store i32 %tmp5, i32* %ptr5, align 4 + store i32 %tmp6, i32* %ptr6, align 4 + %idx1 = getelementptr inbounds i32, i32* %ptr1, i64 1 + store i32 %a, i32* %idx1, align 4 + %idx2 = getelementptr inbounds i32, i32* %ptr1, i64 2 + store i32 %b, i32* %idx2, align 4 + %idx3 = getelementptr inbounds i32, i32* %ptr1, i64 3 + store i32 %tmp7, i32* %idx3, align 4 + %idx4 = getelementptr inbounds i32, i32* %ptr1, i64 4 + store i32 %tmp8, i32* %idx4, align 4 + %idx5 = getelementptr inbounds i32, i32* %ptr1, i64 5 + store i32 %tmp9, i32* %idx5, align 4 + %idx6 = getelementptr inbounds i32, i32* %ptr1, i64 6 + store i32 %tmp10, i32* %idx6, align 4 + + ret void +} diff --git a/test/CodeGen/ARM/Windows/alloca.ll b/test/CodeGen/ARM/Windows/alloca.ll index 6a3d002ab3b3..0f20ffbd36db 100644 --- a/test/CodeGen/ARM/Windows/alloca.ll +++ b/test/CodeGen/ARM/Windows/alloca.ll @@ -13,7 +13,9 @@ entry: } ; CHECK: bl num_entries -; CHECK: movs [[R1:r[0-9]+]], #7 +; Any register is actually valid here, but turns out we use lr, +; because we do not have the kill flag on R0. 
+; CHECK: mov.w [[R1:lr]], #7 ; CHECK: add.w [[R0:r[0-9]+]], [[R1]], [[R0]], lsl #2 ; CHECK: bic [[R0]], [[R0]], #7 ; CHECK: lsrs r4, [[R0]], #2 diff --git a/test/CodeGen/PowerPC/pr26690.ll b/test/CodeGen/PowerPC/pr26690.ll new file mode 100644 index 000000000000..3e7662409d51 --- /dev/null +++ b/test/CodeGen/PowerPC/pr26690.ll @@ -0,0 +1,118 @@ +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +%struct.anon = type { %struct.anon.0, %struct.anon.1 } +%struct.anon.0 = type { i32 } +%struct.anon.1 = type { i32 } + +@i = common global i32 0, align 4 +@b = common global i32* null, align 8 +@c = common global i32 0, align 4 +@a = common global i32 0, align 4 +@h = common global i32 0, align 4 +@g = common global i32 0, align 4 +@j = common global i32 0, align 4 +@f = common global %struct.anon zeroinitializer, align 4 +@d = common global i32 0, align 4 +@e = common global i32 0, align 4 + +; Function Attrs: norecurse nounwind +define signext i32 @fn1(i32* nocapture %p1, i32 signext %p2, i32* nocapture %p3) { +entry: + %0 = load i32, i32* @i, align 4, !tbaa !1 + %cond = icmp eq i32 %0, 8 + br i1 %cond, label %if.end16, label %while.cond.preheader + +while.cond.preheader: ; preds = %entry + %1 = load i32*, i32** @b, align 8, !tbaa !5 + %2 = load i32, i32* %1, align 4, !tbaa !1 + %tobool18 = icmp eq i32 %2, 0 + br i1 %tobool18, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %while.cond.preheader + %.pre = load i32, i32* @c, align 4, !tbaa !1 + br label %while.body + +while.body: ; preds = %while.body.backedge, %while.body.lr.ph + switch i32 %.pre, label %while.body.backedge [ + i32 0, label %sw.bb1 + i32 8, label %sw.bb1 + i32 6, label %sw.bb1 + i32 24, label %while.cond.backedge + ] + +while.body.backedge: ; preds = %while.body, %while.cond.backedge + br label %while.body + +sw.bb1: ; preds = %while.body, %while.body, %while.body + store i32 2, i32* @a, align 4, !tbaa !1 + br label %while.cond.backedge + +while.cond.backedge: ; preds = %while.body, %sw.bb1 + store i32 4, i32* @a, align 4, !tbaa !1 + %.pre19 = load i32, i32* %1, align 4, !tbaa !1 + %tobool = icmp eq i32 %.pre19, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body.backedge + +while.end.loopexit: ; preds = %while.cond.backedge + br label %while.end + +while.end: ; preds = %while.end.loopexit, %while.cond.preheader + %3 = load i32, i32* @h, align 4, !tbaa !1 + %mul = mul nsw i32 %0, %3 + %4 = load i32, i32* @g, align 4, !tbaa !1 + %mul4 = mul nsw i32 %mul, %4 + store i32 %mul4, i32* @j, align 4, !tbaa !1 + %5 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @f, i64 0, i32 0, i32 0), align 4, !tbaa !7 + %tobool5 = icmp eq i32 %5, 0 + br i1 %tobool5, label %if.end, label %if.then + +if.then: ; preds = %while.end + %div = sdiv i32 %5, %mul + store i32 %div, i32* @g, align 4, !tbaa !1 + br label %if.end + +if.end: ; preds = %while.end, %if.then + %6 = phi i32 [ %4, %while.end ], [ %div, %if.then ] + %7 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @f, i64 0, i32 1, i32 0), align 4, !tbaa !10 + %tobool7 = icmp ne i32 %7, 0 + %tobool8 = icmp ne i32 %mul4, 0 + %or.cond = and i1 %tobool7, %tobool8 + %tobool10 = icmp ne i32 %0, 0 + %or.cond17 = and i1 %or.cond, %tobool10 + br i1 %or.cond17, label %if.then11, label %if.end13 + +if.then11: ; preds = %if.end + store i32 %3, i32* @d, align 4, !tbaa !1 + %8 = load i32, i32* @e, align 4, !tbaa !1 + store i32 %8, i32* %p3, align 4, !tbaa !1 + %.pre20 = load i32, i32* @g, align 4, !tbaa 
!1 + br label %if.end13 + +if.end13: ; preds = %if.then11, %if.end + %9 = phi i32 [ %.pre20, %if.then11 ], [ %6, %if.end ] + %tobool14 = icmp eq i32 %9, 0 + br i1 %tobool14, label %if.end16, label %if.then15 + +if.then15: ; preds = %if.end13 + store i32 %p2, i32* %p1, align 4, !tbaa !1 + br label %if.end16 + +if.end16: ; preds = %entry, %if.end13, %if.then15 + ret i32 2 +} + +; CHECK: mfcr {{[0-9]+}} + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.9.0 (trunk 261520)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !3, i64 0} +!7 = !{!8, !2, i64 0} +!8 = !{!"", !9, i64 0, !9, i64 4} +!9 = !{!"", !2, i64 0} +!10 = !{!8, !2, i64 4} diff --git a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll new file mode 100644 index 000000000000..775c0c1b3784 --- /dev/null +++ b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll @@ -0,0 +1,26 @@ +; RUN: llc %s -o - -O0 -regalloc=fast | FileCheck %s +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.10" + +@c = external global i8, align 1 +@p = thread_local global i8* null, align 4 + +; Check that regalloc fast correctly preserves EAX, which is set by the TLS +; call, until the actual use. +; PR26485. +; +; CHECK-LABEL: f: +; Get p. +; CHECK: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]] +; CHECK-NEXT: calll *([[P_ADDR]]) +; At this point eax contains the address of p. +; Load c address. +; Make sure we do not clobber eax. +; CHECK-NEXT: movl L_c{{[^,]*}}, [[C_ADDR:%e[b-z]x+]] +; Store c address into p. +; CHECK-NEXT: movl [[C_ADDR]], (%eax) +define void @f() #0 { +entry: + store i8* @c, i8** @p, align 4 + ret void +} diff --git a/test/CodeGen/X86/tls-shrink-wrapping.ll b/test/CodeGen/X86/tls-shrink-wrapping.ll new file mode 100644 index 000000000000..37c1754c0be8 --- /dev/null +++ b/test/CodeGen/X86/tls-shrink-wrapping.ll @@ -0,0 +1,60 @@ +; Testcase generated from the following code: +; extern __thread int i; +; void f(); +; int g(void) { +; if (i) { +; i = 0; +; f(); +; } +; return i; +; } +; We want to make sure that TLS variables are not accessed before +; the stack frame is set up. + +; RUN: llc < %s -relocation-model=pic | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-freebsd11.0" + +@i = external thread_local global i32, align 4 + +define i32 @g() #0 { +entry: + %tmp = load i32, i32* @i, align 4 + %tobool = icmp eq i32 %tmp, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 0, i32* @i, align 4 + tail call void (...) @f() #2 + %.pre = load i32, i32* @i, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %tmp1 = phi i32 [ 0, %entry ], [ %.pre, %if.then ] + ret i32 %tmp1 +} + +; CHECK: g: # @g +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: # BB#0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: data16 +; CHECK-NEXT: leaq i@TLSGD(%rip), %rdi + +declare void @f(...) 
#1 + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } From 283b7f175721fa9e0e13f59243932e7b26dffc26 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 24 Feb 2016 21:33:38 +0000 Subject: [PATCH 2/2] Vendor import of clang release_38 branch r261684: https://llvm.org/svn/llvm-project/cfe/branches/release_38@261684 --- docs/ReleaseNotes.rst | 27 +++++++++++++++++++++++++++ lib/CodeGen/TargetInfo.cpp | 5 ++++- lib/Sema/SemaExpr.cpp | 11 +++++++---- test/CodeGen/ppc-varargs-struct.c | 2 ++ test/Sema/generic-selection.c | 4 ++++ 5 files changed, 44 insertions(+), 5 deletions(-) diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 18015f8d7c6e..a9b4b0de2906 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -179,6 +179,33 @@ Several additional features/bugfixes have been added to the previous standards: - Improved diagnostics for function pointers. +OpenMP Support in Clang +----------------------- + +OpenMP 3.1 is fully supported and is enabled by default with -fopenmp, +which now uses the clang OpenMP library instead of the GCC OpenMP library. +The runtime can be built in-tree. + +In addition to OpenMP 3.1, several important elements of OpenMP 4.0/4.5 +are supported as well. We continue to aim for complete OpenMP 4.5 support: + +- ``map`` clause +- task dependencies +- ``num_teams`` clause +- ``thread_limit`` clause +- ``target`` and ``target data`` directives +- ``target`` directive with implicit data mapping +- ``target enter data`` and ``target exit data`` directives +- Array sections [2.4, Array Sections]. +- Directive name modifiers for ``if`` clause [2.12, if Clause]. +- ``linear`` clause can be used in loop-based directives [2.7.2, loop Construct]. +- ``simdlen`` clause [2.8, SIMD Construct]. +- ``hint`` clause [2.13.2, critical Construct]. +- Parsing/semantic analysis of all non-device directives introduced in OpenMP 4.5. + +The codegen for OpenMP constructs was significantly improved, allowing us to produce much more stable and faster code. +Full IR test cases are also implemented. + CUDA Support in Clang --------------------- Clang has experimental support for end-to-end CUDA compilation now: diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp index cdb325f256f2..3d1ddef94657 100644 --- a/lib/CodeGen/TargetInfo.cpp +++ b/lib/CodeGen/TargetInfo.cpp @@ -3475,6 +3475,7 @@ class PPC32TargetCodeGenInfo : public TargetCodeGenInfo { Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, QualType Ty) const { + const unsigned OverflowLimit = 8; if (const ComplexType *CTy = Ty->getAs<ComplexType>()) { // TODO: Implement this. For now ignore.
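// ((void)CTy below just silences the unused-variable warning until then.)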
(void)CTy; @@ -3517,7 +3518,7 @@ Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, } llvm::Value *CC = - Builder.CreateICmpULT(NumRegs, Builder.getInt8(8), "cond"); + Builder.CreateICmpULT(NumRegs, Builder.getInt8(OverflowLimit), "cond"); llvm::BasicBlock *UsingRegs = CGF.createBasicBlock("using_regs"); llvm::BasicBlock *UsingOverflow = CGF.createBasicBlock("using_overflow"); @@ -3569,6 +3570,8 @@ Address PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, { CGF.EmitBlock(UsingOverflow); + Builder.CreateStore(Builder.getInt8(OverflowLimit), NumRegsAddr); + // Everything in the overflow area is rounded up to a size of at least 4. CharUnits OverflowAreaAlign = CharUnits::fromQuantity(4); diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp index ebf79812d8dc..5a2eb6060ee9 100644 --- a/lib/Sema/SemaExpr.cpp +++ b/lib/Sema/SemaExpr.cpp @@ -1365,10 +1365,13 @@ Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc, // Decay and strip qualifiers for the controlling expression type, and handle // placeholder type replacement. See committee discussion from WG14 DR423. - ExprResult R = DefaultFunctionArrayLvalueConversion(ControllingExpr); - if (R.isInvalid()) - return ExprError(); - ControllingExpr = R.get(); + { + EnterExpressionEvaluationContext Unevaluated(*this, Sema::Unevaluated); + ExprResult R = DefaultFunctionArrayLvalueConversion(ControllingExpr); + if (R.isInvalid()) + return ExprError(); + ControllingExpr = R.get(); + } // The controlling expression is an unevaluated operand, so side effects are // likely unintended. diff --git a/test/CodeGen/ppc-varargs-struct.c b/test/CodeGen/ppc-varargs-struct.c index 1ad57c26b485..d7936a126960 100644 --- a/test/CodeGen/ppc-varargs-struct.c +++ b/test/CodeGen/ppc-varargs-struct.c @@ -37,6 +37,7 @@ void testva (int n, ...) // CHECK-PPC-NEXT: br label %[[CONT:[a-z0-9]+]] // // CHECK-PPC:[[USING_OVERFLOW]] +// CHECK-PPC-NEXT: store i8 8, i8* [[GPRPTR]], align 4 // CHECK-PPC-NEXT: [[OVERFLOW_AREA_P:%[0-9]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* [[ARRAYDECAY]], i32 0, i32 3 // CHECK-PPC-NEXT: [[OVERFLOW_AREA:%.+]] = load i8*, i8** [[OVERFLOW_AREA_P]], align 4 // CHECK-PPC-NEXT: %{{[0-9]+}} = ptrtoint i8* %argp.cur to i32 @@ -76,6 +77,7 @@ void testva (int n, ...) // CHECK-PPC-NEXT: br label %[[CONT:[a-z0-9]+]] // // CHECK-PPC:[[USING_OVERFLOW]] +// CHECK-PPC-NEXT: store i8 8, i8* [[GPRPTR]], align 4 // CHECK-PPC-NEXT: [[OVERFLOW_AREA_P:%[0-9]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* [[ARRAYDECAY]], i32 0, i32 3 // CHECK-PPC-NEXT: [[OVERFLOW_AREA:%.+]] = load i8*, i8** [[OVERFLOW_AREA_P]], align 4 // CHECK-PPC-NEXT: [[MEMADDR:%.+]] = bitcast i8* [[OVERFLOW_AREA]] to i32* diff --git a/test/Sema/generic-selection.c b/test/Sema/generic-selection.c index 0563ec0f4fc0..5c02005d0fa8 100644 --- a/test/Sema/generic-selection.c +++ b/test/Sema/generic-selection.c @@ -31,4 +31,8 @@ void foo(int n) { const int i = 12; int a9[_Generic(i, int: 1, default: 2) == 1 ? 1 : -1]; + + // This is expected to not trigger any diagnostics because the controlling + // expression is not evaluated. + (void)_Generic(*(int *)0, int: 1); }