Update llvm and clang to release_38 branch r258549.

Dimitry Andric 2016-01-22 21:50:08 +00:00
commit 8c24ff90c4
22 changed files with 723 additions and 275 deletions

View File

@ -295,7 +295,7 @@ class MachineFunction {
}
/// Should we be emitting segmented stack stuff for the function
bool shouldSplitStack();
bool shouldSplitStack() const;
/// getNumBlockIDs - Return the number of MBB ID's allocated.
///
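Note: the const-qualification matters later in this commit: the new X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) logic below can only call MF.shouldSplitStack() once the method is const.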

View File

@ -369,6 +369,18 @@ struct SDNodeFlags {
(UnsafeAlgebra << 3) | (NoNaNs << 4) | (NoInfs << 5) |
(NoSignedZeros << 6) | (AllowReciprocal << 7);
}
/// Clear any flags in this flag set that aren't also set in Flags.
void intersectWith(const SDNodeFlags *Flags) {
NoUnsignedWrap &= Flags->NoUnsignedWrap;
NoSignedWrap &= Flags->NoSignedWrap;
Exact &= Flags->Exact;
UnsafeAlgebra &= Flags->UnsafeAlgebra;
NoNaNs &= Flags->NoNaNs;
NoInfs &= Flags->NoInfs;
NoSignedZeros &= Flags->NoSignedZeros;
AllowReciprocal &= Flags->AllowReciprocal;
}
};
/// Represents one node in the SelectionDAG.
@ -682,6 +694,9 @@ class SDNode : public FoldingSetNode, public ilist_node<SDNode> {
/// and directly, but it is not to avoid creating a vtable for this class.
const SDNodeFlags *getFlags() const;
/// Clear any flags in this node that aren't also set in Flags.
void intersectFlagsWith(const SDNodeFlags *Flags);
/// Return the number of values defined/returned by this operator.
unsigned getNumValues() const { return NumValues; }
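Note: the intent of intersectWith (and the intersectFlagsWith wrapper above) is that when CSE folds two otherwise-identical nodes, the surviving node may only advertise guarantees that both originals made. A minimal sketch of the semantics using the struct's setter/getter API (illustrative values, not from the commit):

SDNodeFlags A, B;
A.setNoNaNs(true);
A.setNoInfs(true);
B.setNoNaNs(true); // B makes no promise about infinities
A.intersectWith(&B);
// Now A.hasNoNaNs() is true but A.hasNoInfs() is false: only the
// guarantee common to both operations survives the merge.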

View File

@ -331,6 +331,25 @@ unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT,
/// during lowering by the GC infrastructure.
bool callsGCLeafFunction(ImmutableCallSite CS);
//===----------------------------------------------------------------------===//
// Intrinsic pattern matching
//
/// Try and match a bitreverse or bswap idiom.
///
/// If an idiom is matched, an intrinsic call is inserted before \c I. Any added
/// instructions are returned in \c InsertedInsts. They will all have been added
/// to a basic block.
///
/// A bitreverse idiom normally requires around 2*BW nodes to be searched (where
/// BW is the bitwidth of the integer type). A bswap idiom requires anywhere up
/// to BW / 4 nodes to be searched, so is significantly faster.
///
/// This function returns true on a successful match or false otherwise.
bool recognizeBitReverseOrBSwapIdiom(
Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
SmallVectorImpl<Instruction *> &InsertedInsts);
} // End llvm namespace
#endif
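As an illustration (not part of the commit), the kind of source-level idiom this entry point is meant to catch; a 32-bit byte swap open-coded with shifts, masks and ors:

#include <cstdint>
// Calling recognizeBitReverseOrBSwapIdiom on the final 'or' with
// MatchBSwaps=true should collapse the whole expression tree into a
// single call to llvm.bswap.i32.
uint32_t bswap32(uint32_t X) {
  return ((X & 0x000000FFu) << 24) | ((X & 0x0000FF00u) << 8) |
         ((X & 0x00FF0000u) >> 8)  | ((X & 0xFF000000u) >> 24);
}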

View File

@ -125,8 +125,6 @@ class LibCallSimplifier {
Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B);
// Math Library Optimizations
Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType);
Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B);
Value *optimizeCos(CallInst *CI, IRBuilder<> &B);
Value *optimizePow(CallInst *CI, IRBuilder<> &B);
Value *optimizeExp2(CallInst *CI, IRBuilder<> &B);

View File

@ -5211,6 +5211,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
return false;
}
/// Given an OR instruction, check to see if this is a bitreverse
/// idiom. If so, insert the new intrinsic and return true.
static bool makeBitReverse(Instruction &I, const DataLayout &DL,
const TargetLowering &TLI) {
if (!I.getType()->isIntegerTy() ||
!TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
TLI.getValueType(DL, I.getType(), true)))
return false;
SmallVector<Instruction*, 4> Insts;
if (!recognizeBitReverseOrBSwapIdiom(&I, false, true, Insts))
return false;
Instruction *LastInst = Insts.back();
I.replaceAllUsesWith(LastInst);
RecursivelyDeleteTriviallyDeadInstructions(&I);
return true;
}
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
@ -5224,8 +5242,19 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
if (ModifiedDT)
return true;
}
MadeChange |= dupRetToEnableTailCallOpts(&BB);
bool MadeBitReverse = true;
while (TLI && MadeBitReverse) {
MadeBitReverse = false;
for (auto &I : reverse(BB)) {
if (makeBitReverse(I, *DL, *TLI)) {
MadeBitReverse = MadeChange = true;
break;
}
}
}
MadeChange |= dupRetToEnableTailCallOpts(&BB);
return MadeChange;
}
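For context (illustrative, not from the commit): on targets where ISD::BITREVERSE is legal or custom, the loop above lets CodeGenPrepare collapse an open-coded reversal such as the classic mask-and-shift ladder:

#include <cstdint>
// Each layer swaps runs of 1, 2, 4, 8 and finally 16 bits; makeBitReverse
// fires on the outermost 'or' and rewrites the ladder to llvm.bitreverse.i32.
uint32_t reverse32(uint32_t X) {
  X = ((X & 0x55555555u) << 1) | ((X >> 1) & 0x55555555u);
  X = ((X & 0x33333333u) << 2) | ((X >> 2) & 0x33333333u);
  X = ((X & 0x0F0F0F0Fu) << 4) | ((X >> 4) & 0x0F0F0F0Fu);
  X = ((X & 0x00FF00FFu) << 8) | ((X >> 8) & 0x00FF00FFu);
  return (X << 16) | (X >> 16);
}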

View File

@ -163,7 +163,7 @@ getOrCreateJumpTableInfo(unsigned EntryKind) {
}
/// Should we be emitting segmented stack stuff for the function
bool MachineFunction::shouldSplitStack() {
bool MachineFunction::shouldSplitStack() const {
return getFunction()->hasFnAttribute("split-stack");
}

View File

@ -377,22 +377,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
}
}
/// Add logical or fast math flag values to FoldingSetNodeID value.
static void AddNodeIDFlags(FoldingSetNodeID &ID, unsigned Opcode,
const SDNodeFlags *Flags) {
if (!isBinOpWithFlags(Opcode))
return;
unsigned RawFlags = 0;
if (Flags)
RawFlags = Flags->getRawFlags();
ID.AddInteger(RawFlags);
}
static void AddNodeIDFlags(FoldingSetNodeID &ID, const SDNode *N) {
AddNodeIDFlags(ID, N->getOpcode(), N->getFlags());
}
static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
SDVTList VTList, ArrayRef<SDValue> OpList) {
AddNodeIDOpcode(ID, OpC);
@ -528,8 +512,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
}
} // end switch (N->getOpcode())
AddNodeIDFlags(ID, N);
// Target specific memory nodes could also have address spaces to check.
if (N->isTargetMemoryOpcode())
ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
@ -851,6 +833,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
return Node;
}
@ -869,6 +854,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
return Node;
}
@ -886,6 +874,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, N->getDebugLoc(), InsertPos);
if (Node)
if (const SDNodeFlags *Flags = N->getFlags())
Node->intersectFlagsWith(Flags);
return Node;
}
@ -3892,10 +3883,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
SDValue Ops[] = {N1, N2};
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
AddNodeIDFlags(ID, Opcode, Flags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP))
if (SDNode *E = FindNodeOrInsertPos(ID, DL.getDebugLoc(), IP)) {
if (Flags)
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
@ -6249,10 +6242,12 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
AddNodeIDFlags(ID, Opcode, Flags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP))
if (SDNode *E = FindNodeOrInsertPos(ID, DebugLoc(), IP)) {
if (Flags)
E->intersectFlagsWith(Flags);
return E;
}
}
return nullptr;
}
@ -6948,6 +6943,11 @@ const SDNodeFlags *SDNode::getFlags() const {
return nullptr;
}
void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) {
if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
FlagsNode->Flags.intersectWith(Flags);
}
SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
assert(N->getNumValues() == 1 &&
"Can't unroll a vector with multiple results!");

View File

@ -10133,6 +10133,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AArch64::GPR64RegClass.contains(*I))
@ -10152,13 +10153,13 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
NewVR)
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
*I)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
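Note: the same two-part fix is applied to ARM and X86 below: the entry-block copies are now emitted in order at the iterator captured above, instead of each being inserted at Entry->begin() (which reversed their order), and the copy-back lands immediately before the exit block's terminator rather than at the top of the block.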

View File

@ -112,9 +112,21 @@ class AArch64ELFStreamer : public MCELFStreamer {
MCELFStreamer::EmitInstruction(Inst, STI);
}
/// Emit a 32-bit value as an instruction. This is only used for the .inst
/// directive, EmitInstruction should be used in other cases.
void emitInst(uint32_t Inst) {
char Buffer[4];
// We can't just use EmitIntValue here, as that will emit a data mapping
// symbol, and swap the endianness on big-endian systems (instructions are
// always little-endian).
for (unsigned I = 0; I < 4; ++I) {
Buffer[I] = uint8_t(Inst);
Inst >>= 8;
}
EmitA64MappingSymbol();
MCELFStreamer::EmitIntValue(Inst, 4);
MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
}
/// This is one of the functions used to emit data into an ELF section, so the
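A self-contained illustration of the byte split (the A64 NOP encoding 0xd503201f is assumed here; it is not part of the commit):

#include <cstdint>
#include <cstdio>
int main() {
  uint32_t Inst = 0xd503201f; // A64 NOP
  unsigned char Buffer[4];
  for (unsigned I = 0; I < 4; ++I) { // same loop as emitInst above
    Buffer[I] = uint8_t(Inst);
    Inst >>= 8;
  }
  // Prints "1f 20 03 d5": the little-endian instruction bytes, independent
  // of whatever byte swapping EmitIntValue would apply for the target's
  // data endianness.
  std::printf("%02x %02x %02x %02x\n", Buffer[0], Buffer[1], Buffer[2],
              Buffer[3]);
  return 0;
}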

View File

@ -12423,6 +12423,7 @@ void ARMTargetLowering::insertCopiesSplitCSR(
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (ARM::GPRRegClass.contains(*I))
@ -12442,13 +12443,13 @@ void ARMTargetLowering::insertCopiesSplitCSR(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
NewVR)
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
*I)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}

View File

@ -832,10 +832,10 @@ def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
R8, R9, R10, R11)>;
// CSRs that are handled by prologue, epilogue.
def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>;
def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>;
// CSRs that are handled explicitly via copies.
def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>;
def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>;
// All GPRs - except r11
def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,

View File

@ -2031,6 +2031,10 @@ void X86FrameLowering::adjustForSegmentedStacks(
unsigned TlsReg, TlsOffset;
DebugLoc DL;
// To support shrink-wrapping we would need to insert the new blocks
// at the right place and update the branches to PrologueMBB.
assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
"Scratch register is live-in");
@ -2271,6 +2275,11 @@ void X86FrameLowering::adjustForHiPEPrologue(
MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
DebugLoc DL;
// To support shrink-wrapping we would need to insert the new blocks
// at the right place and update the branches to PrologueMBB.
assert(&(*MF.begin()) == &PrologueMBB && "Shrink-wrapping not supported yet");
// HiPE-specific values
const unsigned HipeLeafWords = 24;
const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
@ -2584,7 +2593,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// If we may need to emit frameless compact unwind information, give
// up as this is currently broken: PR25614.
return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF);
return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
// The lowering of segmented stack and HiPE only support entry blocks
// as prologue blocks: PR26107.
// This limitation may be lifted if we fix:
// - adjustForSegmentedStacks
// - adjustForHiPEPrologue
MF.getFunction()->getCallingConv() != CallingConv::HiPE &&
!MF.shouldSplitStack();
}
MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(

View File

@ -28908,6 +28908,7 @@ void X86TargetLowering::insertCopiesSplitCSR(
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (X86::GR64RegClass.contains(*I))
@ -28925,13 +28926,13 @@ void X86TargetLowering::insertCopiesSplitCSR(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
NewVR)
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
*I)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}

View File

@ -17,6 +17,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@ -1565,190 +1566,18 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
/// Analyze the specified subexpression and see if it is capable of providing
/// pieces of a bswap or bitreverse. The subexpression provides a potential
/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
/// the output of the expression came from a corresponding bit in some other
/// value. This function is recursive, and the end result is a mapping of
/// (value, bitnumber) to bitnumber. It is the caller's responsibility to
/// validate that all `value`s are identical and that the bitnumber to bitnumber
/// mapping is correct for a bswap or bitreverse.
///
/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
/// that the expression deposits the low byte of %X into the high byte of the
/// result and that all other bits are zero. This expression is accepted,
/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7].
///
/// This function returns true if the match was unsuccessful and false if it succeeded.
/// On entry to the function the "OverallLeftShift" is a signed integer value
/// indicating the number of bits that the subexpression is later shifted. For
/// example, if the expression is later right shifted by 16 bits, the
/// OverallLeftShift value would be -16 on entry. This is used to specify which
/// bits of BitValues are actually being set.
///
/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding
/// bit is masked to zero by a user. For example, in (X & 255), X will be
/// processed with a bytemask of 255. BitMask is always in the local
/// (OverallLeftShift) coordinate space.
///
static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask,
SmallVectorImpl<Value *> &BitValues,
SmallVectorImpl<int> &BitProvenance) {
if (Instruction *I = dyn_cast<Instruction>(V)) {
// If this is an or instruction, it may be an inner node of the bswap.
if (I->getOpcode() == Instruction::Or)
return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
BitValues, BitProvenance) ||
CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask,
BitValues, BitProvenance);
// If this is a logical shift by a constant, recurse with OverallLeftShift
// and BitMask adjusted.
if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
unsigned ShAmt =
cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
// Ensure the shift amount is defined.
if (ShAmt > BitValues.size())
return true;
unsigned BitShift = ShAmt;
if (I->getOpcode() == Instruction::Shl) {
// X << C -> collect(X, +C)
OverallLeftShift += BitShift;
BitMask = BitMask.lshr(BitShift);
} else {
// X >>u C -> collect(X, -C)
OverallLeftShift -= BitShift;
BitMask = BitMask.shl(BitShift);
}
if (OverallLeftShift >= (int)BitValues.size())
return true;
if (OverallLeftShift <= -(int)BitValues.size())
return true;
return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
BitValues, BitProvenance);
}
// If this is a logical 'and' with a mask that clears bits, clear the
// corresponding bits in BitMask.
if (I->getOpcode() == Instruction::And &&
isa<ConstantInt>(I->getOperand(1))) {
unsigned NumBits = BitValues.size();
APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) {
// If this bit is masked out by a later operation, we don't care what
// the and mask is.
if (BitMask[i] == 0)
continue;
// If the AndMask is zero for this bit, clear the bit.
APInt MaskB = AndMask & Bit;
if (MaskB == 0) {
BitMask.clearBit(i);
continue;
}
// Otherwise, this bit is kept.
}
return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask,
BitValues, BitProvenance);
}
}
// Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
// the input value to the bswap/bitreverse. To be part of a bswap or
// bitreverse we must be demanding a contiguous range of bits from it.
unsigned InputBitLen = BitMask.countPopulation();
unsigned InputBitNo = BitMask.countTrailingZeros();
if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo !=
InputBitLen)
// Not a contiguous set range of bits!
return true;
// We know we're moving a contiguous range of bits from the input to the
// output. Record which bits in the output came from which bits in the input.
unsigned DestBitNo = InputBitNo + OverallLeftShift;
for (unsigned I = 0; I < InputBitLen; ++I)
BitProvenance[DestBitNo + I] = InputBitNo + I;
// If the destination bit value is already defined, the values are or'd
// together, which isn't a bswap/bitreverse (unless it's an or of the same
// bits).
if (BitValues[DestBitNo] && BitValues[DestBitNo] != V)
return true;
for (unsigned I = 0; I < InputBitLen; ++I)
BitValues[DestBitNo + I] = V;
return false;
}
static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
unsigned BitWidth) {
if (From % 8 != To % 8)
return false;
// Convert from bit indices to byte indices and check for a byte reversal.
From >>= 3;
To >>= 3;
BitWidth >>= 3;
return From == BitWidth - To - 1;
}
static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
unsigned BitWidth) {
return From == BitWidth - To - 1;
}
/// Given an OR instruction, check to see if this is a bswap or bitreverse
/// idiom. If so, insert the new intrinsic and return it.
Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) {
IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
if (!ITy)
return nullptr; // Can't do vectors.
unsigned BW = ITy->getBitWidth();
/// We keep track of which bit (BitProvenance) inside which value (BitValues)
/// defines each bit in the result.
SmallVector<Value *, 8> BitValues(BW, nullptr);
SmallVector<int, 8> BitProvenance(BW, -1);
// Try to find all the pieces corresponding to the bswap.
APInt BitMask = APInt::getAllOnesValue(BitValues.size());
if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance))
SmallVector<Instruction*, 4> Insts;
if (!recognizeBitReverseOrBSwapIdiom(&I, true, false, Insts))
return nullptr;
Instruction *LastInst = Insts.pop_back_val();
LastInst->removeFromParent();
// Check to see if all of the bits come from the same value.
Value *V = BitValues[0];
if (!V) return nullptr; // Didn't find a bit? Must be zero.
if (!std::all_of(BitValues.begin(), BitValues.end(),
[&](const Value *X) { return X == V; }))
return nullptr;
// Now, is the bit permutation correct for a bswap or a bitreverse? We can
// only byteswap values with an even number of bytes.
bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
for (unsigned i = 0, e = BitValues.size(); i != e; ++i) {
OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
OKForBitReverse &=
bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
}
Intrinsic::ID Intrin;
if (OKForBSwap)
Intrin = Intrinsic::bswap;
else if (OKForBitReverse)
Intrin = Intrinsic::bitreverse;
else
return nullptr;
Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy);
return CallInst::Create(F, V);
for (auto *Inst : Insts)
Worklist.Add(Inst);
return LastInst;
}
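Note: the pop_back_val()/removeFromParent() pair serves InstCombine's driver convention: recognizeBitReverseOrBSwapIdiom inserts the intrinsic call before I, but a visitor must return an instruction that is not yet in a basic block so the driver can splice it in as I's replacement. Any other instructions the matcher added (currently none; it only inserts the call) are simply queued on the worklist.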
/// We have an expression of the form (A&C)|(B&D). Check if A is (cond?-1:0)

View File

@ -179,13 +179,244 @@ void LandingPadInliningInfo::forwardResume(
RI->eraseFromParent();
}
/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
static Value *getParentPad(Value *EHPad) {
if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
return FPI->getParentPad();
return cast<CatchSwitchInst>(EHPad)->getParentPad();
}
typedef DenseMap<Instruction *, Value *> UnwindDestMemoTy;
/// Helper for getUnwindDestToken that does the descendant-ward part of
/// the search.
static Value *getUnwindDestTokenHelper(Instruction *EHPad,
UnwindDestMemoTy &MemoMap) {
SmallVector<Instruction *, 8> Worklist(1, EHPad);
while (!Worklist.empty()) {
Instruction *CurrentPad = Worklist.pop_back_val();
// We only put pads on the worklist that aren't in the MemoMap. When
// we find an unwind dest for a pad we may update its ancestors, but
// the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
// so they should never get updated while queued on the worklist.
assert(!MemoMap.count(CurrentPad));
Value *UnwindDestToken = nullptr;
if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
if (CatchSwitch->hasUnwindDest()) {
UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
} else {
// Catchswitch doesn't have a 'nounwind' variant, and one might be
// annotated as "unwinds to caller" when really it's nounwind (see
// e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
// parent's unwind dest from this. We can check its catchpads'
// descendants, since they might include a cleanuppad with an
// "unwinds to caller" cleanupret, which can be trusted.
for (auto HI = CatchSwitch->handler_begin(),
HE = CatchSwitch->handler_end();
HI != HE && !UnwindDestToken; ++HI) {
BasicBlock *HandlerBlock = *HI;
auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
for (User *Child : CatchPad->users()) {
// Intentionally ignore invokes here -- since the catchswitch is
// marked "unwind to caller", it would be a verifier error if it
// contained an invoke which unwinds out of it, so any invoke we'd
// encounter must unwind to some child of the catch.
if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
continue;
Instruction *ChildPad = cast<Instruction>(Child);
auto Memo = MemoMap.find(ChildPad);
if (Memo == MemoMap.end()) {
// Haven't figured out this child pad yet; queue it.
Worklist.push_back(ChildPad);
continue;
}
// We've already checked this child, but might have found that
// it offers no proof either way.
Value *ChildUnwindDestToken = Memo->second;
if (!ChildUnwindDestToken)
continue;
// We already know the child's unwind dest, which can either
// be ConstantTokenNone to indicate unwind to caller, or can
// be another child of the catchpad. Only the former indicates
// the unwind dest of the catchswitch.
if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
UnwindDestToken = ChildUnwindDestToken;
break;
}
assert(getParentPad(ChildUnwindDestToken) == CatchPad);
}
}
}
} else {
auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
for (User *U : CleanupPad->users()) {
if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
UnwindDestToken = RetUnwindDest->getFirstNonPHI();
else
UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
break;
}
Value *ChildUnwindDestToken;
if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
} else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
Instruction *ChildPad = cast<Instruction>(U);
auto Memo = MemoMap.find(ChildPad);
if (Memo == MemoMap.end()) {
// Haven't resolved this child yet; queue it and keep searching.
Worklist.push_back(ChildPad);
continue;
}
// We've checked this child, but still need to ignore it if it
// had no proof either way.
ChildUnwindDestToken = Memo->second;
if (!ChildUnwindDestToken)
continue;
} else {
// Not a relevant user of the cleanuppad
continue;
}
// In a well-formed program, the child/invoke must either unwind to
// an(other) child of the cleanup, or exit the cleanup. In the
// first case, continue searching.
if (isa<Instruction>(ChildUnwindDestToken) &&
getParentPad(ChildUnwindDestToken) == CleanupPad)
continue;
UnwindDestToken = ChildUnwindDestToken;
break;
}
}
// If we haven't found an unwind dest for CurrentPad, we may have queued its
// children, so move on to the next in the worklist.
if (!UnwindDestToken)
continue;
// Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
// any ancestors of CurrentPad up to but not including UnwindDestToken's
// parent pad. Record this in the memo map, and check to see if the
// original EHPad being queried is one of the ones exited.
Value *UnwindParent;
if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
UnwindParent = getParentPad(UnwindPad);
else
UnwindParent = nullptr;
bool ExitedOriginalPad = false;
for (Instruction *ExitedPad = CurrentPad;
ExitedPad && ExitedPad != UnwindParent;
ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
// Skip over catchpads since they just follow their catchswitches.
if (isa<CatchPadInst>(ExitedPad))
continue;
MemoMap[ExitedPad] = UnwindDestToken;
ExitedOriginalPad |= (ExitedPad == EHPad);
}
if (ExitedOriginalPad)
return UnwindDestToken;
// Continue the search.
}
// No definitive information is contained within this funclet.
return nullptr;
}
/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
/// return that pad instruction. If it unwinds to caller, return
/// ConstantTokenNone. If it does not have a definitive unwind destination,
/// return nullptr.
///
/// This routine gets invoked for calls in funclets in inlinees when inlining
/// an invoke. Since many funclets don't have calls inside them, it's queried
/// on-demand rather than building a map of pads to unwind dests up front.
/// Determining a funclet's unwind dest may require recursively searching its
/// descendants, and also ancestors and cousins if the descendants don't provide
/// an answer. Since most funclets will have their unwind dest immediately
/// available as the unwind dest of a catchswitch or cleanupret, this routine
/// searches top-down from the given pad and then up. To avoid worst-case
/// quadratic run-time given that approach, it uses a memo map to avoid
/// re-processing funclet trees. The callers that rewrite the IR as they go
/// take advantage of this, for correctness, by checking/forcing rewritten
/// pads' entries to match the original callee view.
static Value *getUnwindDestToken(Instruction *EHPad,
UnwindDestMemoTy &MemoMap) {
// Catchpads unwind to the same place as their catchswitch;
// redirect any queries on catchpads so the code below can
// deal with just catchswitches and cleanuppads.
if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
EHPad = CPI->getCatchSwitch();
// Check if we've already determined the unwind dest for this pad.
auto Memo = MemoMap.find(EHPad);
if (Memo != MemoMap.end())
return Memo->second;
// Search EHPad and, if necessary, its descendants.
Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
if (UnwindDestToken)
return UnwindDestToken;
// No information is available for this EHPad from itself or any of its
// descendants. An unwind all the way out to a pad in the caller would
// need also to agree with the unwind dest of the parent funclet, so
// search up the chain to try to find a funclet with information. Put
// null entries in the memo map to avoid re-processing as we go up.
MemoMap[EHPad] = nullptr;
Instruction *LastUselessPad = EHPad;
Value *AncestorToken;
for (AncestorToken = getParentPad(EHPad);
auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
AncestorToken = getParentPad(AncestorToken)) {
// Skip over catchpads since they just follow their catchswitches.
if (isa<CatchPadInst>(AncestorPad))
continue;
assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
auto AncestorMemo = MemoMap.find(AncestorPad);
if (AncestorMemo == MemoMap.end()) {
UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
} else {
UnwindDestToken = AncestorMemo->second;
}
if (UnwindDestToken)
break;
LastUselessPad = AncestorPad;
}
// Since the whole tree under LastUselessPad has no information, it all must
// match UnwindDestToken; record that to avoid repeating the search.
SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
while (!Worklist.empty()) {
Instruction *UselessPad = Worklist.pop_back_val();
assert(!MemoMap.count(UselessPad) || MemoMap[UselessPad] == nullptr);
MemoMap[UselessPad] = UnwindDestToken;
if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
for (BasicBlock *HandlerBlock : CatchSwitch->handlers())
for (User *U : HandlerBlock->getFirstNonPHI()->users())
if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
Worklist.push_back(cast<Instruction>(U));
} else {
assert(isa<CleanupPadInst>(UselessPad));
for (User *U : UselessPad->users())
if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
Worklist.push_back(cast<Instruction>(U));
}
}
return UnwindDestToken;
}
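Note: the simplest instances of the contract, sketched for illustration: for "%pad = cleanuppad within none []" followed by "cleanupret from %pad unwind to caller", the helper returns ConstantTokenNone (the pad definitively unwinds to the caller); if the cleanupret instead says "unwind label %dest", the token is the EH pad at %dest, and every ancestor pad exited on the way there is memoized to the same token.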
/// When we inline a basic block into an invoke,
/// we have to turn all of the calls that can throw into invokes.
/// This function analyzes BB to see if there are any calls, and if so,
/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
/// nodes in that block with the values specified in InvokeDestPHIValues.
static BasicBlock *
HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
BasicBlock *BB, BasicBlock *UnwindEdge,
UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *I = &*BBI++;
@ -196,6 +427,31 @@ HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, BasicBlock *UnwindEdge) {
if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
continue;
if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
// This call is nested inside a funclet. If that funclet has an unwind
// destination within the inlinee, then unwinding out of this call would
// be UB. Rewriting this call to an invoke which targets the inlined
// invoke's unwind dest would give the call's parent funclet multiple
// unwind destinations, which is something that subsequent EH table
// generation can't handle and that the verifier rejects. So when we
// see such a call, leave it as a call.
auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
Value *UnwindDestToken =
getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
continue;
#ifndef NDEBUG
Instruction *MemoKey;
if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
MemoKey = CatchPad->getCatchSwitch();
else
MemoKey = FuncletPad;
assert(FuncletUnwindMap->count(MemoKey) &&
(*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
"must get memoized to avoid confusing later searches");
#endif // NDEBUG
}
// Convert this function call into an invoke instruction. First, split the
// basic block.
BasicBlock *Split =
@ -328,13 +584,23 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
// This connects all the instructions which 'unwind to caller' to the invoke
// destination.
UnwindDestMemoTy FuncletUnwindMap;
for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
BB != E; ++BB) {
if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
if (CRI->unwindsToCaller()) {
CleanupReturnInst::Create(CRI->getCleanupPad(), UnwindDest, CRI);
auto *CleanupPad = CRI->getCleanupPad();
CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
CRI->eraseFromParent();
UpdatePHINodes(&*BB);
// Finding a cleanupret with an unwind destination would confuse
// subsequent calls to getUnwindDestToken, so map the cleanuppad
// to short-circuit any such calls and recognize this as an "unwind
// to caller" cleanup.
assert(!FuncletUnwindMap.count(CleanupPad) ||
isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
FuncletUnwindMap[CleanupPad] =
ConstantTokenNone::get(Caller->getContext());
}
}
@ -345,12 +611,41 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
Instruction *Replacement = nullptr;
if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
if (CatchSwitch->unwindsToCaller()) {
Value *UnwindDestToken;
if (auto *ParentPad =
dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
// This catchswitch is nested inside another funclet. If that
// funclet has an unwind destination within the inlinee, then
// unwinding out of this catchswitch would be UB. Rewriting this
// catchswitch to unwind to the inlined invoke's unwind dest would
// give the parent funclet multiple unwind destinations, which is
// something that subsequent EH table generation can't handle and
// that the verifier rejects. So when we see such a call, leave it
// as "unwind to caller".
UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
continue;
} else {
// This catchswitch has no parent to inherit constraints from, and
// none of its descendants can have an unwind edge that exits it and
// targets another funclet in the inlinee. It may or may not have a
// descendant that definitively has an unwind to caller. In either
// case, we'll have to assume that any unwinds out of it may need to
// be routed to the caller, so treat it as though it has a definitive
// unwind to caller.
UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
}
auto *NewCatchSwitch = CatchSwitchInst::Create(
CatchSwitch->getParentPad(), UnwindDest,
CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
CatchSwitch);
for (BasicBlock *PadBB : CatchSwitch->handlers())
NewCatchSwitch->addHandler(PadBB);
// Propagate info for the old catchswitch over to the new one in
// the unwind map. This also serves to short-circuit any subsequent
// checks for the unwind dest of this catchswitch, which would get
// confused if they found the outer handler in the callee.
FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
Replacement = NewCatchSwitch;
}
} else if (!isa<FuncletPadInst>(I)) {
@ -369,8 +664,8 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
for (Function::iterator BB = FirstNewBlock->getIterator(),
E = Caller->end();
BB != E; ++BB)
if (BasicBlock *NewBB =
HandleCallsInBlockInlinedThroughInvoke(&*BB, UnwindDest))
if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
&*BB, UnwindDest, &FuncletUnwindMap))
// Update any PHI nodes in the exceptional block to indicate that there
// is now a new entry in them.
UpdatePHINodes(NewBB);
@ -1415,6 +1710,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
}
// If we are inlining for an invoke instruction, we must make sure to rewrite
// any call instructions into invoke instructions. This is sensitive to which
// funclet pads were top-level in the inlinee, so must be done before
// rewriting the "parent pad" links.
if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
BasicBlock *UnwindDest = II->getUnwindDest();
Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
if (isa<LandingPadInst>(FirstNonPHI)) {
HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
} else {
HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
}
}
// Update the lexical scopes of the new funclets and callsites.
// Anything that had 'none' as its parent is now nested inside the callsite's
// EHPad.
@ -1472,18 +1781,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
}
// If we are inlining for an invoke instruction, we must make sure to rewrite
// any call instructions into invoke instructions.
if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
BasicBlock *UnwindDest = II->getUnwindDest();
Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
if (isa<LandingPadInst>(FirstNonPHI)) {
HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
} else {
HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
}
}
// Handle any inlined musttail call sites. In order for a new call site to be
// musttail, the source of the clone and the inlined call site must have been
// musttail. Therefore it's safe to return without merging control into the

View File

@ -1592,3 +1592,205 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
return false;
}
/// A potential constituent of a bitreverse or bswap expression. See
/// collectBitParts for a fuller explanation.
struct BitPart {
BitPart(Value *P, unsigned BW) : Provider(P) {
Provenance.resize(BW);
}
/// The Value that this is a bitreverse/bswap of.
Value *Provider;
/// The "provenance" of each bit. Provenance[A] = B means that bit A
/// in Provider becomes bit B in the result of this expression.
SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
enum { Unset = -1 };
};
/// Analyze the specified subexpression and see if it is capable of providing
/// pieces of a bswap or bitreverse. The subexpression provides a potential
/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in
/// the output of the expression came from a corresponding bit in some other
/// value. This function is recursive, and the end result is a mapping of
/// bitnumber to bitnumber. It is the caller's responsibility to validate that
/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
///
/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
/// that the expression deposits the low byte of %X into the high byte of the
/// result and that all other bits are zero. This expression is accepted and a
/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
/// [0-7].
///
/// To avoid revisiting values, the BitPart results are memoized into the
/// provided map. To avoid unnecessary copying of BitParts, BitParts are
/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
/// store BitParts objects, not pointers. As we need the concept of a null
/// BitPart (the Value has been analyzed and the analysis failed), we use an
/// Optional type instead to provide the same functionality.
///
/// Because we pass around references into \c BPS, we must use a container that
/// does not invalidate internal references (std::map instead of DenseMap).
///
static const Optional<BitPart> &
collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
std::map<Value *, Optional<BitPart>> &BPS) {
auto I = BPS.find(V);
if (I != BPS.end())
return I->second;
auto &Result = BPS[V] = None;
auto BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
if (Instruction *I = dyn_cast<Instruction>(V)) {
// If this is an or instruction, it may be an inner node of the bswap.
if (I->getOpcode() == Instruction::Or) {
auto &A = collectBitParts(I->getOperand(0), MatchBSwaps,
MatchBitReversals, BPS);
auto &B = collectBitParts(I->getOperand(1), MatchBSwaps,
MatchBitReversals, BPS);
if (!A || !B)
return Result;
// Try and merge the two together.
if (!A->Provider || A->Provider != B->Provider)
return Result;
Result = BitPart(A->Provider, BitWidth);
for (unsigned i = 0; i < A->Provenance.size(); ++i) {
if (A->Provenance[i] != BitPart::Unset &&
B->Provenance[i] != BitPart::Unset &&
A->Provenance[i] != B->Provenance[i])
return Result = None;
if (A->Provenance[i] == BitPart::Unset)
Result->Provenance[i] = B->Provenance[i];
else
Result->Provenance[i] = A->Provenance[i];
}
return Result;
}
// If this is a logical shift by a constant, recurse then shift the result.
if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
unsigned BitShift =
cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
// Ensure the shift amount is defined.
if (BitShift > BitWidth)
return Result;
auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
MatchBitReversals, BPS);
if (!Res)
return Result;
Result = Res;
// Perform the "shift" on BitProvenance.
auto &P = Result->Provenance;
if (I->getOpcode() == Instruction::Shl) {
P.erase(std::prev(P.end(), BitShift), P.end());
P.insert(P.begin(), BitShift, BitPart::Unset);
} else {
P.erase(P.begin(), std::next(P.begin(), BitShift));
P.insert(P.end(), BitShift, BitPart::Unset);
}
return Result;
}
// If this is a logical 'and' with a mask that clears bits, recurse then
// unset the appropriate bits.
if (I->getOpcode() == Instruction::And &&
isa<ConstantInt>(I->getOperand(1))) {
APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1);
const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
// Check that the mask allows a multiple of 8 bits for a bswap, for an
// early exit.
unsigned NumMaskedBits = AndMask.countPopulation();
if (!MatchBitReversals && NumMaskedBits % 8 != 0)
return Result;
auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
MatchBitReversals, BPS);
if (!Res)
return Result;
Result = Res;
for (unsigned i = 0; i < BitWidth; ++i, Bit <<= 1)
// If the AndMask is zero for this bit, clear the bit.
if ((AndMask & Bit) == 0)
Result->Provenance[i] = BitPart::Unset;
return Result;
}
}
// Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
// the input value to the bswap/bitreverse.
Result = BitPart(V, BitWidth);
for (unsigned i = 0; i < BitWidth; ++i)
Result->Provenance[i] = i;
return Result;
}
static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
unsigned BitWidth) {
if (From % 8 != To % 8)
return false;
// Convert from bit indices to byte indices and check for a byte reversal.
From >>= 3;
To >>= 3;
BitWidth >>= 3;
return From == BitWidth - To - 1;
}
static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
unsigned BitWidth) {
return From == BitWidth - To - 1;
}
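A worked trace of the provenance bookkeeping (hypothetical i16 halfword swap, checked against the code above):

// For "(X >> 8) | (X << 8)" on i16, with U = BitPart::Unset:
//   lshr X, 8 -> Provenance = [8..15, U,U,U,U,U,U,U,U]
//   shl  X, 8 -> Provenance = [U,U,U,U,U,U,U,U, 0..7]
//   or        -> Provenance = [8..15, 0..7]
// Every (From, To) pair keeps its position within the byte while the bytes
// themselves are reversed, so bitTransformIsCorrectForBSwap holds for all
// 16 bits and recognizeBitReverseOrBSwapIdiom emits llvm.bswap.i16.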
/// Given an OR instruction, check to see if this is a bitreverse
/// idiom. If so, insert the new intrinsic and return true.
bool llvm::recognizeBitReverseOrBSwapIdiom(
Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
SmallVectorImpl<Instruction *> &InsertedInsts) {
if (Operator::getOpcode(I) != Instruction::Or)
return false;
if (!MatchBSwaps && !MatchBitReversals)
return false;
IntegerType *ITy = dyn_cast<IntegerType>(I->getType());
if (!ITy || ITy->getBitWidth() > 128)
return false; // Can't do vectors or integers > 128 bits.
unsigned BW = ITy->getBitWidth();
// Try to find all the pieces corresponding to the bswap.
std::map<Value *, Optional<BitPart>> BPS;
auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS);
if (!Res)
return false;
auto &BitProvenance = Res->Provenance;
// Now, is the bit permutation correct for a bswap or a bitreverse? We can
// only byteswap values with an even number of bytes.
bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;
for (unsigned i = 0; i < BW; ++i) {
OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW);
OKForBitReverse &=
bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW);
}
Intrinsic::ID Intrin;
if (OKForBSwap && MatchBSwaps)
Intrin = Intrinsic::bswap;
else if (OKForBitReverse && MatchBitReversals)
Intrin = Intrinsic::bitreverse;
else
return false;
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy);
InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I));
return true;
}

View File

@ -970,15 +970,34 @@ static Value *valueHasFloatPrecision(Value *Val) {
return nullptr;
}
//===----------------------------------------------------------------------===//
// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
/// Any floating-point library function that we're trying to simplify will have
/// a signature of the form: fptype foo(fptype param1, fptype param2, ...).
/// CheckDoubleTy indicates that 'fptype' must be 'double'.
static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams,
bool CheckDoubleTy) {
FunctionType *FT = F->getFunctionType();
if (FT->getNumParams() != NumParams)
return false;
// The return type must match what we're looking for.
Type *RetTy = FT->getReturnType();
if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy())
return false;
// Each parameter must match the return type, and therefore, match every
// other parameter too.
for (const Type *ParamTy : FT->params())
if (ParamTy != RetTy)
return false;
return true;
}
Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
bool CheckRetType) {
/// Shrink double -> float for unary functions like 'floor'.
static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
bool CheckRetType) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
!FT->getParamType(0)->isDoubleTy())
if (!matchesFPLibFunctionSignature(Callee, 1, true))
return nullptr;
if (CheckRetType) {
@ -1013,15 +1032,10 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
return B.CreateFPExt(V, B.getDoubleTy());
}
// Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax'
Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
/// Shrink double -> float for binary functions like 'fmin/fmax'.
static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 2 arguments of the same FP type, which match the
// result type.
if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
FT->getParamType(0) != FT->getParamType(1) ||
!FT->getParamType(0)->isFloatingPointTy())
if (!matchesFPLibFunctionSignature(Callee, 2, true))
return nullptr;
// If this is something like 'fmin((double)floatval1, (double)floatval2)',
@ -1394,12 +1408,21 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||
Callee->getIntrinsicID() == Intrinsic::sqrt))
Ret = optimizeUnaryDoubleFP(CI, B, true);
// FIXME: Refactor - this check is repeated all over this file and even in the
// preceding call to shrink double -> float.
// Make sure this has 1 argument of FP type, which matches the result type.
FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isFloatingPointTy())
return Ret;
if (!CI->hasUnsafeAlgebra())
return Ret;

View File

@ -3548,14 +3548,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
E = CGF.EmitAnyExpr(EExpr);
CGF.EmitOMPAtomicSimpleUpdateExpr(
X, E, BO, /*IsXLHSInRHSPart=*/true, llvm::Monotonic, Loc,
[&CGF, UpExpr, VD, IPriv](RValue XRValue) {
[&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) {
CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
PrivateScope.addPrivate(VD, [&CGF, VD, XRValue]() -> Address {
Address LHSTemp = CGF.CreateMemTemp(VD->getType());
CGF.EmitStoreThroughLValue(
XRValue, CGF.MakeAddrLValue(LHSTemp, VD->getType()));
return LHSTemp;
});
PrivateScope.addPrivate(
VD, [&CGF, VD, XRValue, Loc]() -> Address {
Address LHSTemp = CGF.CreateMemTemp(VD->getType());
CGF.emitOMPSimpleStore(
CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue,
VD->getType().getNonReferenceType(), Loc);
return LHSTemp;
});
(void)PrivateScope.Privatize();
return CGF.EmitAnyExpr(UpExpr);
});

View File

@ -2163,17 +2163,17 @@ static void emitSimpleAtomicStore(CodeGenFunction &CGF, bool IsSeqCst,
}
}
static void emitSimpleStore(CodeGenFunction &CGF, LValue LVal, RValue RVal,
QualType RValTy, SourceLocation Loc) {
switch (CGF.getEvaluationKind(LVal.getType())) {
void CodeGenFunction::emitOMPSimpleStore(LValue LVal, RValue RVal,
QualType RValTy, SourceLocation Loc) {
switch (getEvaluationKind(LVal.getType())) {
case TEK_Scalar:
CGF.EmitStoreThroughLValue(RValue::get(convertToScalarValue(
CGF, RVal, RValTy, LVal.getType(), Loc)),
LVal);
EmitStoreThroughLValue(RValue::get(convertToScalarValue(
*this, RVal, RValTy, LVal.getType(), Loc)),
LVal);
break;
case TEK_Complex:
CGF.EmitStoreOfComplex(
convertToComplexValue(CGF, RVal, RValTy, LVal.getType(), Loc), LVal,
EmitStoreOfComplex(
convertToComplexValue(*this, RVal, RValTy, LVal.getType(), Loc), LVal,
/*isInit=*/false);
break;
case TEK_Aggregate:
@ -2201,7 +2201,7 @@ static void EmitOMPAtomicReadExpr(CodeGenFunction &CGF, bool IsSeqCst,
// list.
if (IsSeqCst)
CGF.CGM.getOpenMPRuntime().emitFlush(CGF, llvm::None, Loc);
emitSimpleStore(CGF, VLValue, Res, X->getType().getNonReferenceType(), Loc);
CGF.emitOMPSimpleStore(VLValue, Res, X->getType().getNonReferenceType(), Loc);
}
static void EmitOMPAtomicWriteExpr(CodeGenFunction &CGF, bool IsSeqCst,
@ -2459,7 +2459,7 @@ static void EmitOMPAtomicCaptureExpr(CodeGenFunction &CGF, bool IsSeqCst,
}
}
// Emit post-update store to 'v' of old/new 'x' value.
emitSimpleStore(CGF, VLValue, NewVVal, NewVValType, Loc);
CGF.emitOMPSimpleStore(VLValue, NewVVal, NewVValType, Loc);
// OpenMP, 2.12.6, atomic Construct
// Any atomic construct with a seq_cst clause forces the atomically
// performed operation to include an implicit flush operation without a

View File

@ -2211,6 +2211,8 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S);
void GenerateOpenMPCapturedVars(const CapturedStmt &S,
SmallVectorImpl<llvm::Value *> &CapturedVars);
void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,
SourceLocation Loc);
/// \brief Perform element by element copying of arrays with type \a
/// OriginalType from \a SrcAddr to \a DestAddr using copying procedure
/// generated by \a CopyGen.

View File

@ -16,6 +16,7 @@
#include "OrcLazyJIT.h"
#include "RemoteJITUtils.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
@ -741,11 +742,11 @@ std::unique_ptr<FDRPCChannel> launchRemote() {
ChildPath.reset(new char[ChildExecPath.size() + 1]);
std::copy(ChildExecPath.begin(), ChildExecPath.end(), &ChildPath[0]);
ChildPath[ChildExecPath.size()] = '\0';
std::string ChildInStr = std::to_string(PipeFD[0][0]);
std::string ChildInStr = utostr(PipeFD[0][0]);
ChildIn.reset(new char[ChildInStr.size() + 1]);
std::copy(ChildInStr.begin(), ChildInStr.end(), &ChildIn[0]);
ChildIn[ChildInStr.size()] = '\0';
std::string ChildOutStr = std::to_string(PipeFD[1][1]);
std::string ChildOutStr = utostr(PipeFD[1][1]);
ChildOut.reset(new char[ChildOutStr.size() + 1]);
std::copy(ChildOutStr.begin(), ChildOutStr.end(), &ChildOut[0]);
ChildOut[ChildOutStr.size()] = '\0';
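Note: the switch from std::to_string to utostr (and the StringExtras.h include added above) is presumably a host-portability fix: some host C++ standard libraries supported at the time did not provide std::to_string, whereas utostr is LLVM's own helper.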

View File

@ -7,4 +7,4 @@
#define CLANG_VENDOR "FreeBSD "
#define SVN_REVISION "257836"
#define SVN_REVISION "258549"