Upgrade our copy of clang and llvm to the 3.6.1 release.

This release contains the following cherry-picked revisions from
upstream trunk:

  226124 226151 226164 226165 226166 226407 226408 226409 226652
  226905 226983 227084 227087 227089 227208 227209 227210 227211
  227212 227213 227214 227269 227430 227482 227503 227519 227574
  227822 227986 227987 227988 227989 227990 228037 228038 228039
  228040 228188 228189 228190 228273 228372 228373 228374 228403
  228765 228848 228918 229223 229225 229226 229227 229228 229230
  229234 229235 229236 229238 229239 229413 229507 229680 229750
  229751 229752 229911 230146 230147 230235 230253 230255 230469
  230500 230564 230603 230657 230742 230748 230956 231219 231237
  231245 231259 231280 231451 231563 231601 231658 231659 231662
  231984 231986 232046 232085 232142 232176 232179 232189 232382
  232386 232389 232425 232438 232443 232675 232786 232797 232943
  232957 233075 233080 233351 233353 233409 233410 233508 233584
  233819 233904 234629 234636 234891 234975 234977 235524 235641
  235662 235931 236099 236306 236307

Please note that from 3.5.0 onwards, clang and llvm require C++11
support to build; see UPDATING for more information.
Commit ef6fa9e26d by Dimitry Andric, 2015-05-25 13:43:03 +00:00
Notes (svn2git, 2020-12-20 02:59:44 +00:00): svn path=/head/; revision=283526
117 changed files with 2075 additions and 4172 deletions

View File

@ -38,6 +38,67 @@
# xargs -n1 | sort | uniq -d;
# done
# 20150525: new clang import which bumps version from 3.6.0 to 3.6.1.
OLD_FILES+=usr/lib/clang/3.6.0/include/__stddef_max_align_t.h
OLD_FILES+=usr/lib/clang/3.6.0/include/__wmmintrin_aes.h
OLD_FILES+=usr/lib/clang/3.6.0/include/__wmmintrin_pclmul.h
OLD_FILES+=usr/lib/clang/3.6.0/include/adxintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/altivec.h
OLD_FILES+=usr/lib/clang/3.6.0/include/ammintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/arm_acle.h
OLD_FILES+=usr/lib/clang/3.6.0/include/arm_neon.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx2intrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx512bwintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx512erintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx512fintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx512vlbwintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avx512vlintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/avxintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/bmi2intrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/bmiintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/cpuid.h
OLD_FILES+=usr/lib/clang/3.6.0/include/emmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/f16cintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/fma4intrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/fmaintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/ia32intrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/immintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/lzcntintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/mm3dnow.h
OLD_FILES+=usr/lib/clang/3.6.0/include/mm_malloc.h
OLD_FILES+=usr/lib/clang/3.6.0/include/mmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/module.modulemap
OLD_FILES+=usr/lib/clang/3.6.0/include/nmmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/pmmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/popcntintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/prfchwintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/rdseedintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/rtmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/shaintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/smmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/tbmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/tmmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/wmmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/x86intrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/xmmintrin.h
OLD_FILES+=usr/lib/clang/3.6.0/include/xopintrin.h
OLD_DIRS+=usr/lib/clang/3.6.0/include
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.asan-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.asan-x86_64.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.asan_cxx-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.asan_cxx-x86_64.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.profile-arm.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.profile-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.profile-x86_64.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.san-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.san-x86_64.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.ubsan-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.ubsan-x86_64.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.ubsan_cxx-i386.a
OLD_FILES+=usr/lib/clang/3.6.0/lib/freebsd/libclang_rt.ubsan_cxx-x86_64.a
OLD_DIRS+=usr/lib/clang/3.6.0/lib/freebsd
OLD_DIRS+=usr/lib/clang/3.6.0/lib
OLD_DIRS+=usr/lib/clang/3.6.0
# 20150521
OLD_FILES+=usr/bin/demandoc
OLD_FILES+=usr/share/man/man1/demandoc.1.gz

View File

@ -31,6 +31,11 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 11.x IS SLOW:
disable the most expensive debugging functionality run
"ln -s 'abort:false,junk:false' /etc/malloc.conf".)
20150525:
Clang and llvm have been upgraded to the 3.6.1 release. Please see the
20141231 entry below for information about prerequisites and upgrading,
if you are not already using 3.5.0 or higher.
20150521:
TI platform code switched to using vendor DTS files and this update
may break existing systems running on Beaglebone, Beaglebone Black,

View File

@ -134,6 +134,8 @@ namespace ISD {
/// Index original Function's argument.
unsigned OrigArgIndex;
/// Sentinel value for implicit machine-level input arguments.
static const unsigned NoArgIndex = UINT_MAX;
/// Offset in bytes of current input value relative to the beginning of
/// original argument. E.g. if argument was splitted into four 32 bit
@ -147,6 +149,15 @@ namespace ISD {
VT = vt.getSimpleVT();
ArgVT = argvt;
}
bool isOrigArg() const {
return OrigArgIndex != NoArgIndex;
}
unsigned getOrigArgIndex() const {
assert(OrigArgIndex != NoArgIndex && "Implicit machine-level argument");
return OrigArgIndex;
}
};
/// OutputArg - This struct carries flags and a value for a
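The hunk above adds a sentinel index plus guarded accessors to ISD::InputArg. As a standalone illustration of the intent (a simplified model, not the LLVM struct itself): implicit machine-level inputs such as a demoted sret return carry NoArgIndex, and lowering code must test isOrigArg() before asking for the original IR argument index, which is exactly the pattern the target-specific hunks later in this commit adopt.

#include <cassert>
#include <climits>

// Simplified model of ISD::InputArg's new bookkeeping (illustration only).
struct InputArgModel {
  static const unsigned NoArgIndex = UINT_MAX;  // sentinel for implicit inputs
  unsigned OrigArgIndex = NoArgIndex;

  bool isOrigArg() const { return OrigArgIndex != NoArgIndex; }
  unsigned getOrigArgIndex() const {
    assert(isOrigArg() && "Implicit machine-level argument");
    return OrigArgIndex;
  }
};

int main() {
  InputArgModel ImplicitSRet;               // e.g. the hidden sret input
  InputArgModel FirstRealArg;
  FirstRealArg.OrigArgIndex = 0;            // maps back to IR argument #0

  assert(!ImplicitSRet.isOrigArg());        // must be skipped when advancing
                                            // the Function argument iterator
  assert(FirstRealArg.isOrigArg() && FirstRealArg.getOrigArgIndex() == 0);
  return 0;
}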

View File

@ -2806,6 +2806,11 @@ class TargetLowering : public TargetLoweringBase {
virtual bool useLoadStackGuardNode() const {
return false;
}
/// Returns true if arguments should be sign-extended in lib calls.
virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
return IsSigned;
}
};
/// Given an LLVM IR type and return type attributes, compute the return value

View File

@ -1776,9 +1776,12 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
<< *IsomorphicInc << '\n');
Value *NewInc = OrigInc;
if (OrigInc->getType() != IsomorphicInc->getType()) {
Instruction *IP = isa<PHINode>(OrigInc)
? (Instruction*)L->getHeader()->getFirstInsertionPt()
: OrigInc->getNextNode();
Instruction *IP = nullptr;
if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
IP = PN->getParent()->getFirstInsertionPt();
else
IP = OrigInc->getNextNode();
IRBuilder<> Builder(IP);
Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
NewInc = Builder.

View File

@ -75,10 +75,9 @@ MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg,
I != E; ++I) {
unsigned MappedDef = *I;
// Source of copy is no longer available for propagation.
if (AvailCopyMap.erase(MappedDef)) {
for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR)
AvailCopyMap.erase(*SR);
}
AvailCopyMap.erase(MappedDef);
for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR)
AvailCopyMap.erase(*SR);
}
}
}

View File

@ -1160,13 +1160,6 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
LegalOperations = Level >= AfterLegalizeVectorOps;
LegalTypes = Level >= AfterLegalizeTypes;
// Early exit if this basic block is in an optnone function.
AttributeSet FnAttrs =
DAG.getMachineFunction().getFunction()->getAttributes();
if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::OptimizeNone))
return;
// Add all the dag nodes to the worklist.
for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
E = DAG.allnodes_end(); I != E; ++I)
@ -2788,9 +2781,13 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
SplatBitSize = SplatBitSize * 2)
SplatValue |= SplatValue.shl(SplatBitSize);
Constant = APInt::getAllOnesValue(BitWidth);
for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
// Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
// multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
if (SplatBitSize % BitWidth == 0) {
Constant = APInt::getAllOnesValue(BitWidth);
for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
}
}
}
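A minimal standalone sketch of the hazard that comment describes, with plain 64-bit integers standing in for APInt (illustration only): when SplatBitSize does not divide evenly by BitWidth, the chunked AND inspects only part of the repeating pattern and can produce a per-element mask that is too permissive.

#include <cassert>
#include <cstdint>

// Mimics the chunked AND above with 64-bit math (illustration only).
uint64_t chunkedMask(uint64_t SplatValue, unsigned SplatBitSize, unsigned BitWidth) {
  uint64_t ElemMask = (BitWidth == 64) ? ~0ull : ((1ull << BitWidth) - 1);
  uint64_t Constant = ElemMask;                       // "all ones" start value
  for (unsigned i = 0, n = SplatBitSize / BitWidth; i < n; ++i)
    Constant &= (SplatValue >> (i * BitWidth)) & ElemMask;
  return Constant;
}

int main() {
  // A repeating 24-bit pattern 0x00FFFF seen through 16-bit elements: the
  // lanes are not all 0xFFFF, yet n = 24/16 truncates to 1, so only the low
  // chunk is examined and the computed mask is too permissive. The new
  // (SplatBitSize % BitWidth == 0) guard skips exactly this case.
  assert(chunkedMask(0x00FFFFull, 24, 16) == 0xFFFF);
  return 0;
}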
@ -11043,7 +11040,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
} else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
// If the input vector is too large, try to split it.
// We don't support having two input vectors that are too large.
if (VecIn2.getNode())
// If the zero vector was used, we can not split the vector,
// since we'd need 3 inputs.
if (UsesZeroVector || VecIn2.getNode())
return SDValue();
if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
@ -11055,7 +11054,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy()));
VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
DAG.getConstant(0, TLI.getVectorIdxTy()));
UsesZeroVector = false;
} else
return SDValue();
}

View File

@ -497,7 +497,7 @@ bool FastISel::selectGetElementPtr(const User *I) {
OI != E; ++OI) {
const Value *Idx = *OI;
if (auto *StTy = dyn_cast<StructType>(Ty)) {
unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
uint64_t Field = cast<ConstantInt>(Idx)->getZExtValue();
if (Field) {
// N = N + Offset
TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
@ -518,8 +518,8 @@ bool FastISel::selectGetElementPtr(const User *I) {
if (CI->isZero())
continue;
// N = N + Offset
TotalOffs +=
DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
uint64_t IdxN = CI->getValue().sextOrTrunc(64).getSExtValue();
TotalOffs += DL.getTypeAllocSize(Ty) * IdxN;
if (TotalOffs >= MaxOffs) {
N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
if (!N) // Unhandled operand. Halt "fast" selection and bail.
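The widened Field type and the sextOrTrunc(64) step both appear aimed at constant GEP indices that do not fit the previous conversions (for example indices of unusually wide integer types); a minimal sketch of the normalization using LLVM's APInt API, under that assumption:

#include "llvm/ADT/APInt.h"

// Bring a constant index of any width to a signed 64-bit value first: narrow
// indices are sign-extended, over-wide ones are truncated to 64 bits, so the
// subsequent getSExtValue() is always asked about a value that fits.
static int64_t normalizeGEPIndex(const llvm::APInt &Idx) {
  return Idx.sextOrTrunc(64).getSExtValue();
}

The loop above then multiplies this normalized value by the element allocation size, matching the TotalOffs accumulation shown in the hunk.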

View File

@ -658,7 +658,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
NVT, N->getOperand(0));
return TLI.makeLibCall(DAG, LC,
TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
&Op, 1, false, dl).first;
&Op, 1, Signed, dl).first;
}

View File

@ -1423,9 +1423,10 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
// If one or more successors has been unscheduled, then the current
// node is no longer available.
if (!TrySU->isAvailable)
if (!TrySU->isAvailable || !TrySU->NodeQueueId)
CurSU = AvailableQueue->pop();
else {
// Available and in AvailableQueue
AvailableQueue->remove(TrySU);
CurSU = TrySU;
}

View File

@ -3399,30 +3399,21 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
Ty = StTy->getElementType(Field);
} else {
Ty = cast<SequentialType>(Ty)->getElementType();
MVT PtrTy = DAG.getTargetLoweringInfo().getPointerTy(AS);
unsigned PtrSize = PtrTy.getSizeInBits();
APInt ElementSize(PtrSize, DL->getTypeAllocSize(Ty));
// If this is a constant subscript, handle it quickly.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero()) continue;
uint64_t Offs =
DL->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
SDValue OffsVal;
EVT PTy = TLI.getPointerTy(AS);
unsigned PtrBits = PTy.getSizeInBits();
if (PtrBits < 64)
OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy,
DAG.getConstant(Offs, MVT::i64));
else
OffsVal = DAG.getConstant(Offs, PTy);
N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N,
OffsVal);
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
if (CI->isZero())
continue;
APInt Offs = ElementSize * CI->getValue().sextOrTrunc(PtrSize);
SDValue OffsVal = DAG.getConstant(Offs, PtrTy);
N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N, OffsVal);
continue;
}
// N = N + Idx * ElementSize;
APInt ElementSize =
APInt(TLI.getPointerSizeInBits(AS), DL->getTypeAllocSize(Ty));
SDValue IdxN = getValue(Idx);
// If the index is smaller or larger than intptr_t, truncate or extend
@ -5727,6 +5718,11 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// Skip the first return-type Attribute to get to params.
Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
Args.push_back(Entry);
// If we have an explicit sret argument that is an Instruction, (i.e., it
// might point to function-local memory), we can't meaningfully tail-call.
if (Entry.isSRet && isa<Instruction>(V))
isTailCall = false;
}
// Check if target-independent constraints permit a tail call here.
@ -7353,6 +7349,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Entry.Alignment = Align;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
// sret demotion isn't compatible with tail-calls, since the sret argument
// points into the callers stack frame.
CLI.IsTailCall = false;
} else {
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
@ -7638,7 +7638,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
ISD::ArgFlagsTy Flags;
Flags.setSRet();
MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
ISD::InputArg::NoArgIndex, 0);
Ins.push_back(RetArg);
}
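Both sret-related hunks above encode the same constraint. As a rough plain-C++ illustration, not LLVM code, of why a call whose hidden sret destination lives in the caller's frame cannot be tail-called:

// The caller supplies the result slot from its own stack frame and must
// still own that frame while the callee writes through the hidden pointer,
// so the call cannot be emitted as a tail call.
struct Big { int data[64]; };

Big make_big();                  // lowered roughly as: void make_big(Big *sret)

int first_element() {
  Big tmp = make_big();          // conceptually make_big(&tmp); &tmp is local
  return tmp.data[0];            // caller's frame is needed after the call
}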

View File

@ -96,18 +96,19 @@ TargetLowering::makeLibCall(SelectionDAG &DAG,
for (unsigned i = 0; i != NumOps; ++i) {
Entry.Node = Ops[i];
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
Entry.isSExt = shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned);
Entry.isZExt = !shouldSignExtendTypeInLibCall(Ops[i].getValueType(), isSigned);
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy());
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
.setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args), 0)
.setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
.setSExtResult(isSigned).setZExtResult(!isSigned);
.setSExtResult(signExtend).setZExtResult(!signExtend);
return LowerCallTo(CLI);
}

View File

@ -177,25 +177,30 @@ bool RuntimeDyldMachO::isCompatibleFile(const object::ObjectFile &Obj) const {
}
template <typename Impl>
void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &ObjImg,
void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) {
unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
ObjSectionToIDMap::iterator i, e;
for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
const SectionRef &Section = i->first;
for (const auto &Section : Obj.sections()) {
StringRef Name;
Section.getName(Name);
if (Name == "__eh_frame")
EHFrameSID = i->second;
else if (Name == "__text")
TextSID = i->second;
// Force emission of the __text, __eh_frame, and __gcc_except_tab sections
// if they're present. Otherwise call down to the impl to handle other
// sections that have already been emitted.
if (Name == "__text")
TextSID = findOrEmitSection(Obj, Section, true, SectionMap);
else if (Name == "__eh_frame")
EHFrameSID = findOrEmitSection(Obj, Section, false, SectionMap);
else if (Name == "__gcc_except_tab")
ExceptTabSID = i->second;
else
impl().finalizeSection(ObjImg, i->second, Section);
ExceptTabSID = findOrEmitSection(Obj, Section, true, SectionMap);
else {
auto I = SectionMap.find(Section);
if (I != SectionMap.end())
impl().finalizeSection(Obj, I->second, Section);
}
}
UnregisteredEHFrameSections.push_back(
EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID));
@ -238,7 +243,8 @@ unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P,
}
static int64_t computeDelta(SectionEntry *A, SectionEntry *B) {
int64_t ObjDistance = A->ObjAddress - B->ObjAddress;
int64_t ObjDistance =
static_cast<int64_t>(A->ObjAddress) - static_cast<int64_t>(B->ObjAddress);
int64_t MemDistance = A->LoadAddress - B->LoadAddress;
return ObjDistance - MemDistance;
}
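The computeDelta change is the classic unsigned-subtraction fix; a small self-contained example of the behaviour it avoids, assuming ObjAddress is an unsigned 64-bit quantity (which the added casts imply):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t A = 0x1000, B = 0x2000;          // two object addresses, A < B

  // Plain unsigned subtraction wraps around instead of going negative:
  assert(A - B == 0xFFFFFFFFFFFFF000ull);

  // Casting to int64_t first yields the intended signed distance:
  assert(static_cast<int64_t>(A) - static_cast<int64_t>(B) == -0x1000);
  return 0;
}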

View File

@ -1120,27 +1120,18 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
return ConstantInt::get(CI1->getContext(), C1V | C2V);
case Instruction::Xor:
return ConstantInt::get(CI1->getContext(), C1V ^ C2V);
case Instruction::Shl: {
uint32_t shiftAmt = C2V.getZExtValue();
if (shiftAmt < C1V.getBitWidth())
return ConstantInt::get(CI1->getContext(), C1V.shl(shiftAmt));
else
return UndefValue::get(C1->getType()); // too big shift is undef
}
case Instruction::LShr: {
uint32_t shiftAmt = C2V.getZExtValue();
if (shiftAmt < C1V.getBitWidth())
return ConstantInt::get(CI1->getContext(), C1V.lshr(shiftAmt));
else
return UndefValue::get(C1->getType()); // too big shift is undef
}
case Instruction::AShr: {
uint32_t shiftAmt = C2V.getZExtValue();
if (shiftAmt < C1V.getBitWidth())
return ConstantInt::get(CI1->getContext(), C1V.ashr(shiftAmt));
else
return UndefValue::get(C1->getType()); // too big shift is undef
}
case Instruction::Shl:
if (C2V.ult(C1V.getBitWidth()))
return ConstantInt::get(CI1->getContext(), C1V.shl(C2V));
return UndefValue::get(C1->getType()); // too big shift is undef
case Instruction::LShr:
if (C2V.ult(C1V.getBitWidth()))
return ConstantInt::get(CI1->getContext(), C1V.lshr(C2V));
return UndefValue::get(C1->getType()); // too big shift is undef
case Instruction::AShr:
if (C2V.ult(C1V.getBitWidth()))
return ConstantInt::get(CI1->getContext(), C1V.ashr(C2V));
return UndefValue::get(C1->getType()); // too big shift is undef
}
}
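The rewritten folds compare the shift amount against the bit width with APInt::ult instead of converting it to a 64-bit integer first, which keeps very wide shift amounts out of the conversion; a minimal sketch of the guard, assuming that is the intent:

#include "llvm/ADT/APInt.h"

// A shift is only folded when the amount is strictly smaller than the bit
// width of the value being shifted; anything larger folds to undef. Doing
// the comparison on the APInt itself avoids calling getZExtValue() on a
// shift amount that may not fit in 64 bits.
static bool isDefinedShiftAmount(const llvm::APInt &Val, const llvm::APInt &Amt) {
  return Amt.ult(Val.getBitWidth());
}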

View File

@ -263,10 +263,12 @@ bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
// required to combine the edge counts that are contained in the GCDA file.
for (uint32_t BlockNo = 0; Count > 0; ++BlockNo) {
// The last block is always reserved for exit block
if (BlockNo >= Blocks.size()-1) {
if (BlockNo >= Blocks.size()) {
errs() << "Unexpected number of edges (in " << Name << ").\n";
return false;
}
if (BlockNo == Blocks.size() - 1)
errs() << "(" << Name << ") has arcs from exit block.\n";
GCOVBlock &Block = *Blocks[BlockNo];
for (size_t EdgeNo = 0, End = Block.getNumDstEdges(); EdgeNo < End;
++EdgeNo) {

View File

@ -333,23 +333,12 @@ void Memory::InvalidateInstructionCache(const void *Addr,
for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
asm volatile("icbi 0, %0" : : "r"(Line));
asm volatile("isync");
# elif (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
# elif (defined(__arm__) || defined(__aarch64__) || defined(__mips__)) && \
defined(__GNUC__)
// FIXME: Can we safely always call this for __GNUC__ everywhere?
const char *Start = static_cast<const char *>(Addr);
const char *End = Start + Len;
__clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
# elif defined(__mips__)
const char *Start = static_cast<const char *>(Addr);
# if defined(ANDROID)
// The declaration of "cacheflush" in Android bionic:
// extern int cacheflush(long start, long end, long flags);
const char *End = Start + Len;
long LStart = reinterpret_cast<long>(const_cast<char *>(Start));
long LEnd = reinterpret_cast<long>(const_cast<char *>(End));
cacheflush(LStart, LEnd, BCACHE);
# else
cacheflush(const_cast<char *>(Start), Len, BCACHE);
# endif
# endif
#endif // end apple
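For context, the __clear_cache call that now also covers MIPS corresponds to a compiler builtin that can be exercised directly; a minimal JIT-style sketch, assuming GCC or Clang:

// After writing freshly generated machine code into [begin, end), flush the
// instruction cache so the CPU does not execute stale instructions. This is
// the same runtime facility the diff above now relies on for MIPS.
extern "C" void flushCodeRange(char *begin, char *end) {
  __builtin___clear_cache(begin, end);
}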

View File

@ -10,9 +10,15 @@
#ifdef HAVE___CHKSTK
EXPLICIT_SYMBOL(__chkstk)
#endif
#ifdef HAVE___CHKSTK_MS
EXPLICIT_SYMBOL(__chkstk_ms)
#endif
#ifdef HAVE____CHKSTK
EXPLICIT_SYMBOL(___chkstk)
#endif
#ifdef HAVE____CHKSTK_MS
EXPLICIT_SYMBOL(___chkstk_ms)
#endif
#ifdef HAVE___MAIN
EXPLICIT_SYMBOL(__main) // FIXME: Don't call it.
#endif

View File

@ -2031,18 +2031,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[i].OrigArgIndex;
// Get type of the original argument.
EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
if (Ins[i].isOrigArg()) {
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);

View File

@ -3092,8 +3092,11 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
if (Ins[VA.getValNo()].isOrigArg()) {
std::advance(CurOrigArg,
Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
}
// Arguments stored in registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
@ -3173,7 +3176,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
assert(VA.isMemLoc());
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
int index = ArgLocs[i].getValNo();
int index = VA.getValNo();
// Some Ins[] entries become multiple ArgLoc[] entries.
// Process them only once.
@ -3186,6 +3189,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Since they could be overwritten by lowering of arguments in case of
// a tail call.
if (Flags.isByVal()) {
assert(Ins[index].isOrigArg() &&
"Byval arguments cannot be implicit");
unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());

View File

@ -3667,43 +3667,44 @@ bool MipsAsmParser::parseDirectiveModule() {
return false;
}
if (Lexer.is(AsmToken::Identifier)) {
StringRef Option = Parser.getTok().getString();
Parser.Lex();
if (Option == "oddspreg") {
getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false;
} else if (Option == "nooddspreg") {
if (!isABI_O32()) {
Error(L, "'.module nooddspreg' requires the O32 ABI");
return false;
}
getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false;
} else if (Option == "fp") {
return parseDirectiveModuleFP();
}
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
StringRef Option;
if (Parser.parseIdentifier(Option)) {
reportParseError("expected .module option identifier");
return false;
}
return false;
if (Option == "oddspreg") {
getTargetStreamer().emitDirectiveModuleOddSPReg(true, isABI_O32());
clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "nooddspreg") {
if (!isABI_O32()) {
Error(L, "'.module nooddspreg' requires the O32 ABI");
return false;
}
getTargetStreamer().emitDirectiveModuleOddSPReg(false, isABI_O32());
setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
// If this is not the end of the statement, report an error.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
reportParseError("unexpected token, expected end of statement");
return false;
}
return false; // parseDirectiveModule has finished successfully.
} else if (Option == "fp") {
return parseDirectiveModuleFP();
} else {
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
}
}
/// parseDirectiveModuleFP

View File

@ -259,6 +259,11 @@ static DecodeStatus DecodeCacheOp(MCInst &Inst,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@ -304,6 +309,10 @@ static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeFMemCop2R6(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@ -1118,6 +1127,23 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst,
return MCDisassembler::Success;
}
static DecodeStatus DecodeCacheOpR6(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
int Offset = fieldFromInstruction(Insn, 7, 9);
unsigned Hint = fieldFromInstruction(Insn, 16, 5);
unsigned Base = fieldFromInstruction(Insn, 21, 5);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
Inst.addOperand(MCOperand::CreateReg(Base));
Inst.addOperand(MCOperand::CreateImm(Offset));
Inst.addOperand(MCOperand::CreateImm(Hint));
return MCDisassembler::Success;
}
static DecodeStatus DecodeSyncI(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@ -1354,6 +1380,23 @@ static DecodeStatus DecodeFMem3(MCInst &Inst,
return MCDisassembler::Success;
}
static DecodeStatus DecodeFMemCop2R6(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
int Offset = SignExtend32<11>(Insn & 0x07ff);
unsigned Reg = fieldFromInstruction(Insn, 16, 5);
unsigned Base = fieldFromInstruction(Insn, 11, 5);
Reg = getReg(Decoder, Mips::COP2RegClassID, Reg);
Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
Inst.addOperand(MCOperand::CreateReg(Reg));
Inst.addOperand(MCOperand::CreateReg(Base));
Inst.addOperand(MCOperand::CreateImm(Offset));
return MCDisassembler::Success;
}
static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
unsigned Insn,
uint64_t Address,

View File

@ -58,15 +58,15 @@ def MipsInstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
def FeatureNoABICalls : SubtargetFeature<"noabicalls", "NoABICalls", "true",
"Disable SVR4-style position-independent code.">;
"Disable SVR4-style position-independent code">;
def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true",
"General Purpose Registers are 64-bit wide.">;
"General Purpose Registers are 64-bit wide">;
def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true",
"Support 64-bit FP registers.">;
"Support 64-bit FP registers">;
def FeatureFPXX : SubtargetFeature<"fpxx", "IsFPXX", "true",
"Support for FPXX.">;
"Support for FPXX">;
def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
"IEEE 754-2008 NaN encoding.">;
"IEEE 754-2008 NaN encoding">;
def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
"true", "Only supports single precision float">;
def FeatureO32 : SubtargetFeature<"o32", "ABI", "MipsABIInfo::O32()",
@ -81,7 +81,7 @@ def FeatureNoOddSPReg : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
"Disable odd numbered single-precision "
"registers">;
def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU",
"true", "Enable vector FPU instructions.">;
"true", "Enable vector FPU instructions">;
def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
"Mips I ISA Support [highly experimental]">;
def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",

View File

@ -293,6 +293,9 @@ void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
if (Amount == 0)
return;
if (isInt<16>(Amount)) // need to change to addiu sp, ....and isInt<16>
BuildAddiuSpImm(MBB, I, Amount);
else

View File

@ -379,7 +379,6 @@ class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
list<dag> Pattern = [];
bit isTerminator = 1;
bit hasDelaySlot = 0;
string DecoderMethod = "DecodeSimm16";
}
class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
@ -550,6 +549,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd,
dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint);
string AsmString = !strconcat(instr_asm, "\t$hint, $addr");
list<dag> Pattern = [];
string DecoderMethod = "DecodeCacheOpR6";
}
class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>;
@ -561,6 +561,7 @@ class COP2LD_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
bit mayLoad = 1;
string DecoderMethod = "DecodeFMemCop2R6";
}
class LDC2_R6_DESC : COP2LD_DESC_BASE<"ldc2", COP2Opnd>;
@ -572,6 +573,7 @@ class COP2ST_DESC_BASE<string instr_asm, RegisterOperand COPOpnd> {
string AsmString = !strconcat(instr_asm, "\t$rt, $addr");
list<dag> Pattern = [];
bit mayStore = 1;
string DecoderMethod = "DecodeFMemCop2R6";
}
class SDC2_R6_DESC : COP2ST_DESC_BASE<"sdc2", COP2Opnd>;

View File

@ -132,8 +132,8 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
continue;
}
assert(Ins[i].OrigArgIndex < MF.getFunction()->arg_size());
std::advance(FuncArg, Ins[i].OrigArgIndex);
assert(Ins[i].getOrigArgIndex() < MF.getFunction()->arg_size());
std::advance(FuncArg, Ins[i].getOrigArgIndex());
OriginalArgWasF128.push_back(
originalTypeIsF128(FuncArg->getType(), nullptr));

View File

@ -123,7 +123,7 @@ def CC_MipsN_SoftFloat : CallingConv<[
]>;
def CC_MipsN : CallingConv<[
CCIfType<[i8, i16, i32],
CCIfType<[i8, i16, i32, i64],
CCIfSubtargetNot<"isLittle()",
CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
@ -159,6 +159,10 @@ def CC_MipsN : CallingConv<[
// N32/64 variable arguments.
// All arguments are passed in integer registers.
def CC_MipsN_VarArg : CallingConv<[
CCIfType<[i8, i16, i32, i64],
CCIfSubtargetNot<"isLittle()",
CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
// All integers are promoted to 64-bit.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,

View File

@ -619,6 +619,33 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue ValueIfTrue = N->getOperand(0), ValueIfFalse = N->getOperand(2);
ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(ValueIfFalse);
if (!FalseC || FalseC->getZExtValue())
return SDValue();
// Since RHS (False) is 0, we swap the order of the True/False operands
// (obviously also inverting the condition) so that we can
// take advantage of conditional moves using the $0 register.
// Example:
// return (a != 0) ? x : 0;
// load $reg, x
// movz $reg, $0, a
unsigned Opc = (N->getOpcode() == MipsISD::CMovFP_T) ? MipsISD::CMovFP_F :
MipsISD::CMovFP_T;
SDValue FCC = N->getOperand(1), Glue = N->getOperand(3);
return DAG.getNode(Opc, SDLoc(N), ValueIfFalse.getValueType(),
ValueIfFalse, FCC, ValueIfTrue, Glue);
}
static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
@ -752,6 +779,9 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performDivRemCombine(N, DAG, DCI, Subtarget);
case ISD::SELECT:
return performSELECTCombine(N, DAG, DCI, Subtarget);
case MipsISD::CMovFP_F:
case MipsISD::CMovFP_T:
return performCMovFPCombine(N, DAG, DCI, Subtarget);
case ISD::AND:
return performANDCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
@ -2039,7 +2069,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
DAG.getConstant(0x20, MVT::i32));
DAG.getConstant(VT.getSizeInBits(), MVT::i32));
Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond,
DAG.getConstant(0, VT), ShiftLeftLo);
Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftLeftLo, Or);
@ -2078,11 +2108,12 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL,
DL, VT, Hi, Shamt);
SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
DAG.getConstant(0x20, MVT::i32));
SDValue Shift31 = DAG.getNode(ISD::SRA, DL, VT, Hi, DAG.getConstant(31, VT));
DAG.getConstant(VT.getSizeInBits(), MVT::i32));
SDValue Ext = DAG.getNode(ISD::SRA, DL, VT, Hi,
DAG.getConstant(VT.getSizeInBits() - 1, VT));
Lo = DAG.getNode(ISD::SELECT, DL, VT, Cond, ShiftRightHi, Or);
Hi = DAG.getNode(ISD::SELECT, DL, VT, Cond,
IsSRA ? Shift31 : DAG.getConstant(0, VT), ShiftRightHi);
IsSRA ? Ext : DAG.getConstant(0, VT), ShiftRightHi);
SDValue Ops[2] = {Lo, Hi};
return DAG.getMergeValues(Ops, DL);
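These two hunks generalize the double-width shift lowering from a hard-coded 32-bit part size (the 0x20 mask and the shift-by-31) to VT.getSizeInBits(). For orientation, a plain-C++ model of the 32-bit shift-left case (illustration only, not the target code):

#include <cassert>
#include <cstdint>

// Build a 64-bit left shift out of 32-bit halves. When the amount has the
// "part size" bit set (>= 32), the low half becomes zero and the high half
// takes the shifted low half; otherwise the halves are combined as usual.
uint64_t shl64ViaParts(uint32_t Lo, uint32_t Hi, unsigned Shamt) {
  Shamt &= 63;
  uint32_t ShiftLeftLo  = Lo << (Shamt & 31);
  uint32_t ShiftRightLo = (Shamt & 31) ? (Lo >> (32 - (Shamt & 31))) : 0;
  uint32_t OrHi         = (Hi << (Shamt & 31)) | ShiftRightLo;
  bool     GE32         = (Shamt & 32) != 0;        // the "0x20" test above
  uint32_t NewLo        = GE32 ? 0 : ShiftLeftLo;
  uint32_t NewHi        = GE32 ? ShiftLeftLo : OrHi;
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  uint64_t X = 0x1122334455667788ull;
  for (unsigned S = 0; S < 64; ++S)
    assert(shl64ViaParts(uint32_t(X), uint32_t(X >> 32), S) == (X << S));
  return 0;
}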
@ -2902,13 +2933,16 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[i].OrigArgIndex;
if (Ins[i].isOrigArg()) {
std::advance(FuncArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
}
EVT ValVT = VA.getValVT();
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool IsRegLoc = VA.isRegLoc();
if (Flags.isByVal()) {
assert(Ins[i].isOrigArg() && "Byval arguments cannot be implicit");
unsigned FirstByValReg, LastByValReg;
unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
@ -3029,6 +3063,15 @@ MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
return CCInfo.CheckReturn(Outs, RetCC_Mips);
}
bool
MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
if (Subtarget.hasMips3() && Subtarget.abiUsesSoftFloat()) {
if (Type == MVT::i32)
return true;
}
return IsSigned;
}
SDValue
MipsTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool IsVarArg,

View File

@ -472,6 +472,8 @@ namespace llvm {
const SmallVectorImpl<SDValue> &OutVals,
SDLoc dl, SelectionDAG &DAG) const override;
bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override;
// Inline asm support
ConstraintType
getConstraintType(const std::string &Constraint) const override;

View File

@ -458,42 +458,42 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
MADDS_FM<4, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
MADDS_FM<5, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
let AdditionalPredicates = [NoNaNsFPMath] in {
def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
MADDS_FM<6, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
MADDS_FM<7, 0>, ISA_MIPS32R2_NOT_32R6_64R6;
MADDS_FM<7, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
}
def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
let AdditionalPredicates = [NoNaNsFPMath] in {
def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_32;
MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
}
let isCodeGenOnly=1 in {
let DecoderNamespace = "Mips64" in {
def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
MADDS_FM<4, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
MADDS_FM<5, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
}
let AdditionalPredicates = [NoNaNsFPMath],
isCodeGenOnly=1 in {
DecoderNamespace = "Mips64" in {
def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
MADDS_FM<7, 1>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
MADDS_FM<7, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
}
//===----------------------------------------------------------------------===//

View File

@ -388,6 +388,8 @@ def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
(sequence "W%u", 0, 31)>;
def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
(sequence "W%u", 0, 31)>;
def MSA128WEvens: RegisterClass<"Mips", [v4i32, v4f32], 128,
(decimate (sequence "W%u", 0, 31), 2)>;
def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;

View File

@ -258,8 +258,12 @@ SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
CurDAG->getTargetConstant(Mips::sub_32, VT));
}
SDNode *AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT,
SDValue(Carry, 0), RHS);
// Generate a second addition only if we know that RHS is not a
// constant-zero node.
SDNode *AddCarry = Carry;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
if (!C || C->getZExtValue())
AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
SDValue(AddCarry, 0));

View File

@ -2883,10 +2883,21 @@ emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
unsigned Ws = MI->getOperand(1).getReg();
unsigned Lane = MI->getOperand(2).getImm();
if (Lane == 0)
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
else {
unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
if (Lane == 0) {
unsigned Wt = Ws;
if (!Subtarget.useOddSPReg()) {
// We must copy to an even-numbered MSA register so that the
// single-precision sub-register is also guaranteed to be even-numbered.
Wt = RegInfo.createVirtualRegister(&Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Wt).addReg(Ws);
}
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
} else {
unsigned Wt = RegInfo.createVirtualRegister(
Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
&Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(Lane);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
@ -2948,7 +2959,9 @@ MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
unsigned Wd_in = MI->getOperand(1).getReg();
unsigned Lane = MI->getOperand(2).getImm();
unsigned Fs = MI->getOperand(3).getReg();
unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
unsigned Wt = RegInfo.createVirtualRegister(
Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass :
&Mips::MSA128WEvensRegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
.addImm(0)

View File

@ -364,6 +364,9 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
if (Amount == 0)
return;
if (isInt<16>(Amount))// addi sp, sp, amount
BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
else { // Expand immediate that doesn't fit in 16-bit.

View File

@ -2688,9 +2688,10 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
unsigned ObjSize = ObjectVT.getStoreSize();
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[ArgNo].OrigArgIndex;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
/* Respect alignment of argument on the stack. */
unsigned Align =
CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
@ -2704,6 +2705,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
@ -3064,9 +3067,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
unsigned ArgSize = ObjSize;
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
CurArgIdx = Ins[ArgNo].OrigArgIndex;
if (Ins[ArgNo].isOrigArg()) {
std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[ArgNo].getOrigArgIndex();
}
unsigned CurArgOffset = ArgOffset;
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
@ -3087,6 +3091,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
// FIXME the codegen can be much improved in some cases.
// We do not have to keep everything in memory.
if (Flags.isByVal()) {
assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;

View File

@ -97,6 +97,11 @@ def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"true",
"Enable spilling of VGPRs to scratch memory">;
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
"VI SGPR initilization bug requiring a fixed SGPR allocation size">;
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",

View File

@ -40,7 +40,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
std::vector<Function*> FuncsToClone;
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
Function &F = *I;
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty())
if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
!F.hasFnAttribute(Attribute::NoInline))
FuncsToClone.push_back(&F);
}
@ -54,7 +55,7 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
Function &F = *I;
if (F.hasLocalLinkage()) {
if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) {
F.addFnAttr(Attribute::AlwaysInline);
}
}

View File

@ -343,6 +343,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.NumSGPR = MaxSGPR + 1;
if (STM.hasSGPRInitBug()) {
if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)
llvm_unreachable("Too many SGPRs used with the SGPR init bug");
ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode

View File

@ -439,6 +439,31 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
case ISD::STORE: {
// Handle i64 stores here for the same reason mentioned above for loads.
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Value = ST->getValue();
if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
break;
SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
MVT::v2i32, Value);
SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
ST->getBasePtr(), ST->getMemOperand());
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
if (NewValue.getOpcode() == ISD::BITCAST) {
Select(NewStore.getNode());
return SelectCode(NewValue.getNode());
}
// getNode() may fold the bitcast if its input was another bitcast. If that
// happens we should only select the new store.
N = NewStore.getNode();
break;
}
case AMDGPUISD::REGISTER_LOAD: {
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
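As a memory-level picture of what the i64 to v2i32 store rewrite above amounts to (plain C++, illustration only):

#include <cstdint>
#include <cstring>

// Store a 64-bit value as two 32-bit words through a bit-level
// reinterpretation; the BITCAST + v2i32 store has the same effect in memory.
void storeI64AsTwoI32(uint64_t Value, uint32_t *Dst) {
  uint32_t Words[2];
  std::memcpy(Words, &Value, sizeof(Words));
  Dst[0] = Words[0];
  Dst[1] = Words[1];
}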
@ -761,6 +786,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
}
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
SDLoc SL(N);
EVT VT = N->getValueType(0);
@ -770,19 +797,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
unsigned Opc
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
SDValue Ops[] = {
Zero, // src0_modifiers
N->getOperand(0), // src0
Zero, // src1_modifiers
N->getOperand(1), // src1
Zero, // src2_modifiers
N->getOperand(2), // src2
False, // clamp
Zero // omod
};
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
SDValue Ops[8];
SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
}

View File

@ -141,9 +141,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::i64, Promote);
AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@ -162,9 +159,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
// Custom lowering of vector stores is required for local address space
// stores.
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
// XXX: Native v2i32 local address space stores are possible, but not
// currently implemented.
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@ -832,11 +826,9 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
DAG.ExtractVectorElements(A, Args);
DAG.ExtractVectorElements(B, Args);
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
@ -881,9 +873,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return LowerIntrinsicIABS(Op, DAG);
case AMDGPUIntrinsic::AMDGPU_lrp:
return LowerIntrinsicLRP(Op, DAG);
case AMDGPUIntrinsic::AMDGPU_fract:
case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_clamp:
case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
@ -913,10 +902,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::AMDGPU_div_fmas:
// FIXME: Dropping bool parameter. Work is needed to support the implicit
// read from VCC.
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
Op.getOperand(4));
case Intrinsic::AMDGPU_div_fixup:
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,

View File

@ -140,6 +140,12 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
/// not exist. If Opcode is not a pseudo instruction, this is identity.
int pseudoToMCOpcode(int Opcode) const;
/// \brief Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const {
return get(pseudoToMCOpcode(Opcode));
}
//===---------------------------------------------------------------------===//
// Pure virtual funtions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//

View File

@ -35,6 +35,11 @@ def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
// float, float, float, vcc
def AMDGPUFmasOp : SDTypeProfile<1, 4,
[SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
>;
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
@ -153,7 +158,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
// Single or double precision division fixup.
// Special case divide fixup and flags(src0 = Quotient, src1 =

View File

@ -164,10 +164,6 @@ class PrivateStore <SDPatternOperator op> : PrivateMemOp <
(ops node:$value, node:$ptr), (op node:$value, node:$ptr)
>;
def extloadi8_private : PrivateLoad <extloadi8>;
def sextloadi8_private : PrivateLoad <sextloadi8>;
def extloadi16_private : PrivateLoad <extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
def load_private : PrivateLoad <load>;
def truncstorei8_private : PrivateStore <truncstorei8>;
@ -231,6 +227,9 @@ def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
def extloadi8_private : PrivateLoad <az_extloadi8>;
def sextloadi8_private : PrivateLoad <sextloadi8>;
def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
@ -267,6 +266,9 @@ def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
def extloadi16_private : PrivateLoad <az_extloadi16>;
def sextloadi16_private : PrivateLoad <sextloadi16>;
def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
@ -649,17 +651,10 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
(RcpInst $src)
>;
multiclass RsqPat<Instruction RsqInst, ValueType vt> {
def : Pat <
(fdiv FP_ONE, (fsqrt vt:$src)),
(RsqInst $src)
>;
def : Pat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
}
class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
(AMDGPUrcp (fsqrt vt:$src)),
(RsqInst $src)
>;
include "R600Instructions.td"
include "R700Instructions.td"

View File

@ -68,6 +68,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
}

View File

@ -80,7 +80,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
FlatAddressSpace(false), EnableIRStructurizer(true),
EnablePromoteAlloca(false), EnableIfCvt(true),
EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false),
EnableVGPRSpilling(false),SGPRInitBug(false),
DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
FrameLowering(TargetFrameLowering::StackGrowsUp,
64 * 16, // Maximum stack alignment (long16)

View File

@ -45,6 +45,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
VOLCANIC_ISLANDS,
};
enum {
FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
};
private:
std::string DevName;
bool Is64bit;
@ -66,6 +70,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
bool CFALUBug;
int LocalMemorySize;
bool EnableVGPRSpilling;
bool SGPRInitBug;
const DataLayout DL;
AMDGPUFrameLowering FrameLowering;
@ -203,6 +208,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
return LocalMemorySize;
}
bool hasSGPRInitBug() const {
return SGPRInitBug;
}
unsigned getAmdKernelCodeChipID() const;
bool enableMachineScheduler() const override {

View File

@ -46,7 +46,7 @@ def SIN_cm : SIN_Common<0x8D>;
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;

View File

@ -69,7 +69,7 @@ def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;

View File

@ -291,6 +291,8 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printImmediate64(Op.getImm(), O);
else
llvm_unreachable("Invalid register class size");
} else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
printImmediate32(Op.getImm(), O);
} else {
// We hit this for the immediate instruction bits that don't yet have a
// custom printer.

View File

@ -113,8 +113,12 @@ def : ProcessorModel<"mullins", SIQuarterSpeedModel, [FeatureSeaIslands]>;
// Volcanic Islands
//===----------------------------------------------------------------------===//
def : ProcessorModel<"tonga", SIFullSpeedModel, [FeatureVolcanicIslands]>;
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureSGPRInitBug]
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureSGPRInitBug]
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>;

View File

@ -838,6 +838,10 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::AMDGPU_rsq:
// XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_fract:
case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
@ -1694,7 +1698,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
// XXX - I think PartOffset should give you this, but it seems to give the
// size of the register which isn't useful.
unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
unsigned Offset = 36 + VA.getLocMemOffset();

View File

@ -1193,7 +1193,7 @@ let Predicates = [isR600] in {
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
def : FROUNDPat <CNDGE_r600, CNDGT_r600>;

View File

@ -83,7 +83,7 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
Value *handleLoopCondition(Value *Cond, PHINode *Broken);
Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L);
void handleLoop(BranchInst *Term);
@ -207,8 +207,17 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
}
/// \brief Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken) {
if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
llvm::Loop *L) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
// nodes outside of the loop which depend on values defined inside the loop.
// This will break the module with
// 'Instruction does not dominate all users!' errors.
PHINode *Phi = nullptr;
if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
BasicBlock *Parent = Phi->getParent();
PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
Value *Ret = NewPhi;
@ -223,7 +232,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
}
Phi->setIncomingValue(i, BoolFalse);
Value *PhiArg = handleLoopCondition(Incoming, Broken);
Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
NewPhi->addIncoming(PhiArg, From);
}
@ -253,7 +262,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
} else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
TerminatorInst *Insert = Parent->getTerminator();
Instruction *Insert;
if (L->contains(Inst)) {
Insert = Parent->getTerminator();
} else {
Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
}
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
@ -265,14 +279,15 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken)
/// \brief Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
Value *Arg = handleLoopCondition(Cond, Broken);
Value *Arg = handleLoopCondition(Cond, Broken, L);
BasicBlock *BB = Term->getParent();
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {

View File

@ -35,7 +35,8 @@ enum {
SMRD = 1 << 16,
DS = 1 << 17,
MIMG = 1 << 18,
FLAT = 1 << 19
FLAT = 1 << 19,
WQM = 1 << 20
};
}

View File

@ -209,7 +209,12 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
APInt Imm;
if (FoldingImm) {
const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
unsigned UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC
= TargetRegisterInfo::isVirtualRegister(UseReg) ?
MRI.getRegClass(UseReg) :
TRI.getRegClass(UseReg);
Imm = APInt(64, OpToFold.getImm());
// Split 64-bit constants into 32-bits for folding.
@ -228,8 +233,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
if (UseMI->getOpcode() == AMDGPU::COPY) {
unsigned MovOp = TII->getMovOpcode(
MRI.getRegClass(UseMI->getOperand(0).getReg()));
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
MRI.getRegClass(DestReg) :
TRI.getRegClass(DestReg);
unsigned MovOp = TII->getMovOpcode(DestRC);
if (MovOp == AMDGPU::COPY)
continue;

View File

@ -89,8 +89,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@ -158,8 +156,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
@ -214,6 +210,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
}
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
@ -314,9 +311,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple() || VT == MVT::Other)
return false;
// XXX - CI changes say "Support for unaligned memory accesses" but I don't
// see what for specifically. The wording everywhere else seems to be the
// same.
// TODO - CI+ supports unaligned memory accesses, but this requires driver
// support.
// XXX - The only mention I see of this in the ISA manual is for LDS direct
// reads the "byte address and must be dword aligned". Is it also true for the
@ -328,12 +324,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return Align % 4 == 0;
}
// Smaller than dword value must be aligned.
// FIXME: This should be allowed on CI+
if (VT.bitsLT(MVT::i32))
return false;
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
// byte-address are ignored, thus forcing Dword alignment.
// This applies to private, global, and constant memory.
if (IsFast)
*IsFast = true;
return VT.bitsGT(MVT::i32);
return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
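A standalone model of the final return above (a sketch of the intent, not the real TargetLowering hook): accesses wider than a dword count as fast whenever they are dword-aligned, because the two LSBs of the byte address are ignored, while sub-dword accesses are never treated as fast when misaligned. The function name and parameters are illustrative.

// sizeInBits ~ VT.getSizeInBits(), alignInBytes ~ the Align parameter.
bool allowsMisalignedGlobalAccess(unsigned sizeInBits, unsigned alignInBytes) {
  if (sizeInBits < 32)      // smaller-than-dword values must be naturally aligned
    return false;
  // 8.1.6: for dword or larger accesses the two address LSBs are ignored,
  // so only dword alignment matters.
  return sizeInBits > 32 && alignInBytes % 4 == 0;
}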
EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
@ -448,7 +450,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eigth.
Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned j = 0; j != NumElements; ++j) {
@ -531,7 +533,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Offset, Ins[i].Flags.isSExt());
const PointerType *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
@ -566,7 +568,7 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Arg.VT.isVector()) {
// Build a vector from the registers
Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
SmallVector<SDValue, 4> Regs;
@ -919,6 +921,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
case AMDGPUIntrinsic::AMDGPU_fract:
case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
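The SI lowering above expands the AMDGPU_fract intrinsic (and its legacy AMDIL_fraction alias) inline, while the R600 hunk earlier in this diff selects the dedicated FRACT node; the semantics are the ordinary fractional part either way. A standalone sketch of the expansion, with the function name chosen for illustration:

#include <cmath>

// fract(x) = x - floor(x), matching the FSUB/FFLOOR expansion above.
float fract(float x) {
  return x - std::floor(x);
}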
@ -1104,7 +1112,70 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
if (DAG.getTarget().Options.UnsafeFPMath)
return LowerFastFDIV(Op, DAG);
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
NegDivScale0, Mul, DivScale1);
SDValue Scale;
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
const SDValue Hi = DAG.getConstant(1, MVT::i32);
// Figure out if the scale to use for div_fmas.
SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
SDValue Scale0Hi
= DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
SDValue Scale1Hi
= DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
} else {
Scale = DivScale1.getValue(1);
}
SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
Fma4, Fma3, Mul, Scale);
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
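Read as plain math, the FMA chain in LowerFDIV64 above is two Newton–Raphson refinements of the reciprocal of the scaled denominator followed by one refinement of the quotient (a hedged reading; div_scale, div_fmas and div_fixup additionally handle exponent scaling and special values). With $d$ the scaled denominator (DivScale0), $n$ the scaled numerator (DivScale1) and $r_0 = \mathrm{rcp}(d)$:

$r_{k+1} = r_k + r_k\,(1 - d\,r_k)$   (Fma0/Fma1 give $r_1$; Fma2/Fma3 give $r_2$)
$q = n\,r_2$   (Mul)
$q' = q + r_2\,(n - d\,q)$   (Fma4, consumed by v_div_fmas; v_div_fixup then rescales and patches special cases)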
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
@ -1125,11 +1196,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Store->getMemoryVT();
// These stores are legal.
if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
VT.isVector() && VT.getVectorNumElements() == 2 &&
VT.getVectorElementType() == MVT::i32)
return SDValue();
if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
if (VT.isVector() && VT.getVectorNumElements() > 4)
return ScalarizeVectorStore(Op, DAG);
@ -1524,6 +1590,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::UMAX:
case AMDGPUISD::UMIN: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMin3Max3Combine(N, DCI);
break;

View File

@ -82,6 +82,8 @@ class SIInsertWaits : public MachineFunctionPass {
/// \brief Type of the last opcode.
InstType LastOpcodeType;
bool LastInstWritesM0;
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@ -106,6 +108,9 @@ class SIInsertWaits : public MachineFunctionPass {
/// \brief Resolve all operand dependencies to counter requirements
Counters handleOperands(MachineInstr &MI);
/// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
public:
SIInsertWaits(TargetMachine &tm) :
MachineFunctionPass(ID),
@ -269,6 +274,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// Insert a NOP to break the clause.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
.addImm(0);
LastInstWritesM0 = false;
}
if (TII->isSMRD(I->getOpcode()))
@ -362,6 +368,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
((Counts.Named.LGKM & 0x7) << 8));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
return true;
}
@ -403,6 +410,30 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
return Result;
}
void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return;
// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
LastInstWritesM0 = false;
return;
}
// Set whether this instruction sets M0
LastInstWritesM0 = false;
unsigned NumOperands = I->getNumOperands();
for (unsigned i = 0; i < NumOperands; i++) {
const MachineOperand &Op = I->getOperand(i);
if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
LastInstWritesM0 = true;
}
}
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@ -417,6 +448,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
WaitedOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@ -433,7 +465,9 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
Changes |= insertWait(MBB, I, LastIssued);
else
Changes |= insertWait(MBB, I, handleOperands(*I));
pushInstruction(MBB, I);
handleSendMsg(MBB, I);
}
// Wait for everything at the end of the MBB

View File

@ -38,6 +38,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> DS = 0;
field bits<1> MIMG = 0;
field bits<1> FLAT = 0;
field bits<1> WQM = 0;
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
@ -64,6 +65,7 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
let TSFlags{17} = DS;
let TSFlags{18} = MIMG;
let TSFlags{19} = FLAT;
let TSFlags{20} = WQM;
// Most instructions require adjustments after selection to satisfy
// operand requirements.
@ -295,18 +297,32 @@ class VOP1e <bits<8> op> : Enc32 {
}
class VOP2e <bits<6> op> : Enc32 {
bits<8> vdst;
bits<9> src0;
bits<8> src1;
bits<8> VDST;
bits<9> SRC0;
bits<8> VSRC1;
let Inst{8-0} = SRC0;
let Inst{16-9} = VSRC1;
let Inst{24-17} = VDST;
let Inst{8-0} = src0;
let Inst{16-9} = src1;
let Inst{24-17} = vdst;
let Inst{30-25} = op;
let Inst{31} = 0x0; //encoding
}
class VOP2_MADKe <bits<6> op> : Enc64 {
bits<8> vdst;
bits<9> src0;
bits<8> vsrc1;
bits<32> src2;
let Inst{8-0} = src0;
let Inst{16-9} = vsrc1;
let Inst{24-17} = vdst;
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
let Inst{63-32} = src2;
}
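VOP2_MADKe carries a full 32-bit literal in the upper dword, which is why v_madmk_f32 and v_madak_f32 switch to the dedicated VOP2MADK class later in this diff. A standalone packer that follows the field layout above; it is an illustration, not the real MC code emitter:

#include <cstdint>

uint64_t encodeVOP2MADK(unsigned op, unsigned vdst, unsigned src0,
                        unsigned vsrc1, uint32_t literal) {
  uint64_t inst = 0;
  inst |= uint64_t(src0  & 0x1FF);         // Inst{8-0}
  inst |= uint64_t(vsrc1 & 0xFF)  << 9;    // Inst{16-9}
  inst |= uint64_t(vdst  & 0xFF)  << 17;   // Inst{24-17}
  inst |= uint64_t(op    & 0x3F)  << 25;   // Inst{30-25}
  // Inst{31} = 0: VOP2 encoding bit.
  inst |= uint64_t(literal) << 32;         // Inst{63-32}: the 32-bit literal K
  return inst;
}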
class VOP3e <bits<9> op> : Enc64 {
bits<8> dst;
@ -554,9 +570,6 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
VOP2Common <outs, ins, asm, pattern>, VOP2e<op>;
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
VOPCCommon <ins, asm, pattern>, VOPCe <op>;
@ -585,9 +598,6 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> :
let SchedRW = [WriteLDS];
}
class DS_si <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
DS <outs, ins, asm, pattern>, DSe<op>;
class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {

View File

@ -121,12 +121,20 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
const ConstantSDNode *Load0Offset =
dyn_cast<ConstantSDNode>(Load0->getOperand(1));
const ConstantSDNode *Load1Offset =
dyn_cast<ConstantSDNode>(Load1->getOperand(1));
if (!Load0Offset || !Load1Offset)
return false;
// Check chain.
if (findChainOperand(Load0) != findChainOperand(Load1))
return false;
Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
Offset0 = Load0Offset->getZExtValue();
Offset1 = Load1Offset->getZExtValue();
return true;
}
@ -333,6 +341,21 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
} else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
@ -408,11 +431,15 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
int NewOpc;
// Try to map original to commuted opcode
if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1)
NewOpc = AMDGPU::getCommuteRev(Opcode);
// Check if the commuted (REV) opcode exists on the target.
if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
return NewOpc;
// Try to map commuted to original opcode
if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1)
NewOpc = AMDGPU::getCommuteOrig(Opcode);
// Check if the original (non-REV) opcode exists on the target.
if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1)
return NewOpc;
return Opcode;
@ -1121,6 +1148,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
return false;
}
int RegClass = Desc.OpInfo[i].RegClass;
switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:
if (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) {
@ -1131,7 +1160,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
case AMDGPU::OPERAND_REG_IMM32:
break;
case AMDGPU::OPERAND_REG_INLINE_C:
if (MI->getOperand(i).isImm() && !isInlineConstant(MI->getOperand(i))) {
if (isLiteralConstant(MI->getOperand(i))) {
ErrInfo = "Illegal immediate value for operand.";
return false;
}
@ -1152,7 +1181,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
if (!MI->getOperand(i).isReg())
continue;
int RegClass = Desc.OpInfo[i].RegClass;
if (RegClass != -1) {
unsigned Reg = MI->getOperand(i).getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg))
@ -1197,31 +1225,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
// Verify SRC1 for VOP2 and VOPC
if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
const MachineOperand &Src1 = MI->getOperand(Src1Idx);
if (Src1.isImm()) {
ErrInfo = "VOP[2C] src1 cannot be an immediate.";
return false;
}
}
// Verify VOP3
if (isVOP3(Opcode)) {
if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
ErrInfo = "VOP3 src0 cannot be a literal constant.";
return false;
}
if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
ErrInfo = "VOP3 src1 cannot be a literal constant.";
return false;
}
if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
ErrInfo = "VOP3 src2 cannot be a literal constant.";
return false;
}
}
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@ -1292,6 +1295,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
}
}
@ -2043,6 +2047,24 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHL_B64:
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHLREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_ASHR_I64:
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_ASHRREV_I64;
swapOperands(Inst);
}
break;
case AMDGPU::S_LSHR_B64:
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
NewOpcode = AMDGPU::V_LSHRREV_B64;
swapOperands(Inst);
}
break;
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
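The new S_LSHL_B64/S_ASHR_I64/S_LSHR_B64 cases above pick the *REV VALU forms on VI and call swapOperands() because those opcodes take the shift amount as their first source. A standalone model of the operand order only; the function names mirror the mnemonics but are otherwise illustrative:

#include <cstdint>

uint64_t s_lshl_b64(uint64_t src, uint32_t amount)    { return src << (amount & 63); }
uint64_t v_lshlrev_b64(uint32_t amount, uint64_t src) { return src << (amount & 63); }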

View File

@ -204,6 +204,10 @@ class SIInstrInfo : public AMDGPUInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
bool isWQM(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO) const;
bool isLiteralConstant(const MachineOperand &MO) const;
@ -243,7 +247,27 @@ class SIInstrInfo : public AMDGPUInstrInfo {
/// the register class of its machine operand.
/// to infer the correct register class base on the other operands.
const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
unsigned OpNo) const;\
unsigned OpNo) const;
/// \brief Return the size in bytes of the operand OpNo on the given
// instruction opcode.
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
if (OpInfo.RegClass == -1) {
// If this is an immediate operand, this must be a 32-bit literal.
assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
return 4;
}
return RI.getRegClass(OpInfo.RegClass)->getSize();
}
/// \brief This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
return getOpRegClass(MI, OpNo)->getSize();
}
/// \returns true if it is legal for the operand at index \p OpNo
/// to read a VGPR.

File diff suppressed because it is too large

View File

@ -152,9 +152,11 @@ defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
[(set i32:$dst, (ctlz_zero_undef i32:$src0))]
>;
//defm S_FLBIT_I32_B64 : SOP1_32 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", []>;
//defm S_FLBIT_I32_I64 : SOP1_32 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32",
[(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))]
>;
defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>;
defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8",
[(set i32:$dst, (sext_inreg i32:$src0, i8))]
>;
@ -764,88 +766,88 @@ defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <vopc<0xb8, 0x13>, "v_cmpx_class_f64">;
//===----------------------------------------------------------------------===//
def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>;
def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>;
defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
defm DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>;
defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>;
defm DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VGPR_32>;
defm DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VGPR_32>;
def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">;
defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">;
defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">;
defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">;
defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">;
defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">;
defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">;
defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">;
defm DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>;
//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2_b32">;
//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2_rtn_b32", VGPR_32, "ds_wrxchg2st64_b32">;
def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
defm DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
let SubtargetPredicate = isCI in {
def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
} // End isCI
def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
defm DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
defm DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_f64", VReg_64, "ds_min_f64">;
def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_f64", VReg_64, "ds_max_f64">;
defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
//let SubtargetPredicate = isCI in {
// DS_CONDXCHG32_RTN_B64
@ -874,123 +876,120 @@ defm DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg
defm DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
defm DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
defm DS_READ2_B64 : DS_Load2_Helper <0x00000075, "ds_read2_b64", VReg_128>;
defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000076, "ds_read2st64_b64", VReg_128>;
defm DS_READ2_B64 : DS_Load2_Helper <0x00000077, "ds_read2_b64", VReg_128>;
defm DS_READ2ST64_B64 : DS_Load2_Helper <0x00000078, "ds_read2st64_b64", VReg_128>;
//===----------------------------------------------------------------------===//
// MUBUF Instructions
//===----------------------------------------------------------------------===//
let SubtargetPredicate = isSICI in {
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <mubuf<0x00>, "buffer_load_format_x", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <mubuf<0x01>, "buffer_load_format_xy", []>;
//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <mubuf<0x02>, "buffer_load_format_xyz", []>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <mubuf<0x03>, "buffer_load_format_xyzw", VReg_128>;
//def BUFFER_STORE_FORMAT_X : MUBUF_ <mubuf<0x04>, "buffer_store_format_x", []>;
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <mubuf<0x05>, "buffer_store_format_xy", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <mubuf<0x06>, "buffer_store_format_xyz", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <mubuf<0x07>, "buffer_store_format_xyzw", []>;
defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
0x00000008, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
0x00000009, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
>;
defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
0x0000000a, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
0x0000000b, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
0x0000000c, "buffer_load_dword", VGPR_32, i32, global_load
mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
0x00000018, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
0x0000001a, "buffer_store_short", VGPR_32, i32, truncstorei16_global
mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
0x0000001c, "buffer_store_dword", VGPR_32, i32, global_store
mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
0x00000030, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>;
defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
0x00000032, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
0x00000033, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
>;
//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
//def BUFFER_ATOMIC_RSUB : MUBUF_ <mubuf<0x34>, "buffer_atomic_rsub", []>; // isn't on CI & VI
defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
0x00000035, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
0x00000036, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
0x00000037, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
0x00000038, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
>;
defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
0x00000039, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
>;
defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
0x0000003a, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
0x0000003b, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
>;
//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
} // End SubtargetPredicate = isSICI
//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>;
//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>;
//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>;
//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>;
//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>;
//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>;
//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>;
//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>;
//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>;
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;
//===----------------------------------------------------------------------===//
// MTBUF Instructions
@ -1037,63 +1036,63 @@ defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "image_sample">;
defm IMAGE_SAMPLE_CL : MIMG_Sampler <0x00000021, "image_sample_cl">;
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">;
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">;
defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">;
defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">;
defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "image_sample_b">;
defm IMAGE_SAMPLE_B_CL : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">;
defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">;
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">;
defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "image_sample_c">;
defm IMAGE_SAMPLE_C_CL : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">;
defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">;
defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">;
defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">;
defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
defm IMAGE_SAMPLE_O : MIMG_Sampler <0x00000030, "image_sample_o">;
defm IMAGE_SAMPLE_CL_O : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">;
defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">;
defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">;
defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">;
defm IMAGE_SAMPLE_B_O : MIMG_Sampler <0x00000035, "image_sample_b_o">;
defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">;
defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">;
defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
defm IMAGE_SAMPLE_C_O : MIMG_Sampler <0x00000038, "image_sample_c_o">;
defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">;
defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">;
defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">;
defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">;
defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
defm IMAGE_GATHER4 : MIMG_Gather <0x00000040, "image_gather4">;
defm IMAGE_GATHER4_CL : MIMG_Gather <0x00000041, "image_gather4_cl">;
defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">;
defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">;
defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">;
defm IMAGE_GATHER4_B : MIMG_Gather <0x00000045, "image_gather4_b">;
defm IMAGE_GATHER4_B_CL : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">;
defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">;
defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">;
defm IMAGE_GATHER4_C : MIMG_Gather <0x00000048, "image_gather4_c">;
defm IMAGE_GATHER4_C_CL : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">;
defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">;
defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
defm IMAGE_GATHER4_C_B : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
defm IMAGE_GATHER4_C_B_CL : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">;
defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">;
defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
defm IMAGE_GATHER4_O : MIMG_Gather <0x00000050, "image_gather4_o">;
defm IMAGE_GATHER4_CL_O : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">;
defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">;
defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">;
defm IMAGE_GATHER4_B_O : MIMG_Gather <0x00000055, "image_gather4_b_o">;
defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">;
defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
defm IMAGE_GATHER4_C_O : MIMG_Gather <0x00000058, "image_gather4_c_o">;
defm IMAGE_GATHER4_C_CL_O : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">;
defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">;
defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
defm IMAGE_GATHER4_C_B_O : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">;
defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">;
defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "image_get_lod">;
defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">;
defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">;
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
@ -1445,53 +1444,37 @@ defm V_MIN_F32 : VOP2Inst <vop2<0xf, 0xa>, "v_min_f32", VOP_F32_F32_F32,
fminnum>;
defm V_MAX_F32 : VOP2Inst <vop2<0x10, 0xb>, "v_max_f32", VOP_F32_F32_F32,
fmaxnum>;
defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32,
AMDGPUsmin
>;
defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32,
AMDGPUsmax
>;
defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32,
AMDGPUumin
>;
defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32,
AMDGPUumax
>;
defm V_MIN_I32 : VOP2Inst <vop2<0x11, 0xc>, "v_min_i32", VOP_I32_I32_I32>;
defm V_MAX_I32 : VOP2Inst <vop2<0x12, 0xd>, "v_max_i32", VOP_I32_I32_I32>;
defm V_MIN_U32 : VOP2Inst <vop2<0x13, 0xe>, "v_min_u32", VOP_I32_I32_I32>;
defm V_MAX_U32 : VOP2Inst <vop2<0x14, 0xf>, "v_max_u32", VOP_I32_I32_I32>;
// No non-Rev Op on VI
defm V_LSHRREV_B32 : VOP2Inst <
vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag,
"v_lshr_b32", "v_lshrrev_b32"
"v_lshr_b32"
>;
// No non-Rev OP on VI
defm V_ASHRREV_I32 : VOP2Inst <
vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag,
"v_ashr_i32", "v_ashrrev_i32"
"v_ashr_i32"
>;
// No non-Rev OP on VI
defm V_LSHLREV_B32 : VOP2Inst <
vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag,
"v_lshl_b32", "v_lshlrev_b32"
"v_lshl_b32"
>;
defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32",
VOP_I32_I32_I32, and>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32",
VOP_I32_I32_I32, or
>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32",
VOP_I32_I32_I32, xor
>;
defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>;
defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>;
defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>;
defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>;
} // End isCommutable = 1
defm V_MADMK_F32 : VOP2Inst <vop2<0x20, 0x17>, "v_madmk_f32", VOP_F32_F32_F32>;
defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">;
let isCommutable = 1 in {
defm V_MADAK_F32 : VOP2Inst <vop2<0x21, 0x18>, "v_madak_f32", VOP_F32_F32_F32>;
defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
} // End isCommutable = 1
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
@ -1503,9 +1486,7 @@ let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
VOP_I32_I32_I32, add
>;
defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32",
VOP_I32_I32_I32, sub
>;
defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>;
defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
VOP_I32_I32_I32, null_frag, "v_sub_i32"
@ -1513,10 +1494,10 @@ defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
let Uses = [VCC] in { // Carry-in comes from VCC
defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
VOP_I32_I32_I32_VCC, adde
VOP_I32_I32_I32_VCC
>;
defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
VOP_I32_I32_I32_VCC, sube
VOP_I32_I32_I32_VCC
>;
defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
@ -1529,47 +1510,41 @@ defm V_READLANE_B32 : VOP2SI_3VI_m <
vop3 <0x001, 0x289>,
"v_readlane_b32",
(outs SReg_32:$vdst),
(ins VGPR_32:$src0, SSrc_32:$vsrc1),
"v_readlane_b32 $vdst, $src0, $vsrc1"
(ins VGPR_32:$src0, SCSrc_32:$src1),
"v_readlane_b32 $vdst, $src0, $src1"
>;
defm V_WRITELANE_B32 : VOP2SI_3VI_m <
vop3 <0x002, 0x28a>,
"v_writelane_b32",
(outs VGPR_32:$vdst),
(ins SReg_32:$src0, SSrc_32:$vsrc1),
"v_writelane_b32 $vdst, $src0, $vsrc1"
(ins SReg_32:$src0, SCSrc_32:$src1),
"v_writelane_b32 $vdst, $src0, $src1"
>;
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
VOP_F32_F32_F32
>;
} // End isCommutable = 1
defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
defm V_MIN_LEGACY_F32 : VOP2InstSI <vop2<0xd>, "v_min_legacy_f32",
VOP_F32_F32_F32, AMDGPUfmin_legacy
>;
defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
defm V_MAX_LEGACY_F32 : VOP2InstSI <vop2<0xe>, "v_max_legacy_f32",
VOP_F32_F32_F32, AMDGPUfmax_legacy
>;
let isCommutable = 1 in {
defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
VOP_I32_I32_I32, sra
>;
let hasPostISelHook = 1 in {
defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
}
defm V_LSHR_B32 : VOP2InstSI <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32>;
defm V_ASHR_I32 : VOP2InstSI <vop2<0x17>, "v_ashr_i32", VOP_I32_I32_I32>;
defm V_LSHL_B32 : VOP2InstSI <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32>;
} // End isCommutable = 1
} // End let SubtargetPredicate = SICI
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst <vop23<0x6, 0x28e>, "v_mac_legacy_f32",
VOP_F32_F32_F32
>;
} // End isCommutable = 1
defm V_BFM_B32 : VOP2_VI3_Inst <vop23<0x1e, 0x293>, "v_bfm_b32", VOP_I32_I32_I32,
AMDGPUbfm
>;
@ -1586,14 +1561,25 @@ defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
VOP_F32_F32_I32, AMDGPUldexp
>;
////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
VOP_I32_F32_F32, int_SI_packf16
defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst <vop23<0x2c, 0x1f0>, "v_cvt_pkaccum_u8_f32",
VOP_I32_F32_I32>; // TODO: set "Uses = dst"
defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst <vop23<0x2d, 0x294>, "v_cvt_pknorm_i16_f32",
VOP_I32_F32_F32
>;
defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst <vop23<0x2e, 0x295>, "v_cvt_pknorm_u16_f32",
VOP_I32_F32_F32
>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst <vop23<0x2f, 0x296>, "v_cvt_pkrtz_f16_f32",
VOP_I32_F32_F32, int_SI_packf16
>;
defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst <vop23<0x30, 0x297>, "v_cvt_pk_u16_u32",
VOP_I32_I32_I32
>;
defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst <vop23<0x31, 0x298>, "v_cvt_pk_i16_i32",
VOP_I32_I32_I32
>;
////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
//===----------------------------------------------------------------------===//
// VOP3 Instructions
@ -1659,27 +1645,34 @@ defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f, 0x1cf>, "v_alignbyte_b32",
VOP_I32_I32_I32_I32
>;
defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
defm V_MIN3_F32 : VOP3Inst <vop3<0x151, 0x1d0>, "v_min3_f32",
VOP_F32_F32_F32_F32, AMDGPUfmin3>;
defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32",
defm V_MIN3_I32 : VOP3Inst <vop3<0x152, 0x1d1>, "v_min3_i32",
VOP_I32_I32_I32_I32, AMDGPUsmin3
>;
defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32",
defm V_MIN3_U32 : VOP3Inst <vop3<0x153, 0x1d2>, "v_min3_u32",
VOP_I32_I32_I32_I32, AMDGPUumin3
>;
defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32",
defm V_MAX3_F32 : VOP3Inst <vop3<0x154, 0x1d3>, "v_max3_f32",
VOP_F32_F32_F32_F32, AMDGPUfmax3
>;
defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32",
defm V_MAX3_I32 : VOP3Inst <vop3<0x155, 0x1d4>, "v_max3_i32",
VOP_I32_I32_I32_I32, AMDGPUsmax3
>;
defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32",
defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32",
VOP_I32_I32_I32_I32, AMDGPUumax3
>;
//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>;
//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>;
//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>;
defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32",
VOP_F32_F32_F32_F32
>;
defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32",
VOP_I32_I32_I32_I32
>;
defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32",
VOP_I32_I32_I32_I32
>;
//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
@ -1742,21 +1735,36 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteFloatFMA, WriteSALU] in {
defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
}
let SchedRW = [WriteDouble] in {
let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
} // let SchedRW = [WriteDouble]
let isCommutable = 1 in {
defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
let isCommutable = 1, Uses = [VCC] in {
// v_div_fmas_f32:
// result = src0 * src1 + src2
// if (vcc)
// result *= 2^32
//
defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
let SchedRW = [WriteDouble] in {
defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
// v_div_fmas_f64:
// result = src0 * src1 + src2
// if (vcc)
// result *= 2^64
//
defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
} // End SchedRW = [WriteDouble]
} // End isCommutable = 1
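As an aside (illustrative only, not part of this change): the comments above describe v_div_fmas as a fused multiply-add followed by a conditional scale by 2^32 (f32) or 2^64 (f64). A minimal host-side C++ sketch of that behaviour, with made-up sample inputs, could be:

#include <cmath>
#include <cstdio>

// Illustrative only: models the documented v_div_fmas_f64 behaviour on the host.
static double div_fmas_f64(double src0, double src1, double src2, bool vcc) {
  double result = std::fma(src0, src1, src2); // result = src0 * src1 + src2
  if (vcc)
    result = std::ldexp(result, 64);          // result *= 2^64
  return result;
}

int main() {
  std::printf("%g\n", div_fmas_f64(2.0, 3.0, 1.0, false)); // 7
  std::printf("%g\n", div_fmas_f64(2.0, 3.0, 1.0, true));  // 7 * 2^64
  return 0;
}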
@ -1774,23 +1782,29 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
VOP_I64_I64_I32, shl
>;
defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
VOP_I64_I64_I32, srl
>;
defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
VOP_I64_I64_I32, sra
>;
defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64", VOP_I64_I64_I32>;
defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64", VOP_I64_I64_I32>;
defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64", VOP_I64_I64_I32>;
defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
VOP_F32_F32_F32_F32>;
} // End SubtargetPredicate = isSICI
let SubtargetPredicate = isVI in {
defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64",
VOP_I64_I32_I64
>;
defm V_LSHRREV_B64 : VOP3Inst <vop3<0, 0x290>, "v_lshrrev_b64",
VOP_I64_I32_I64
>;
defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64",
VOP_I64_I32_I64
>;
} // End SubtargetPredicate = isVI
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
@ -1809,8 +1823,8 @@ def SGPR_USE : InstSI <(outs),(ins), "", []>;
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
Uses = [EXEC], Defs = [EXEC] in {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
let Uses = [EXEC], Defs = [EXEC] in {
let isBranch = 1, isTerminator = 1 in {
@ -1867,15 +1881,18 @@ def SI_END_CF : InstSI <
[(int_SI_end_cf i64:$saved)]
>;
} // End Uses = [EXEC], Defs = [EXEC]
let Uses = [EXEC], Defs = [EXEC,VCC] in {
def SI_KILL : InstSI <
(outs),
(ins VSrc_32:$src),
"si_kill $src",
[(int_AMDGPU_kill f32:$src)]
>;
} // End Uses = [EXEC], Defs = [EXEC,VCC]
} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
// Uses = [EXEC], Defs = [EXEC]
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
@ -2020,16 +2037,12 @@ def : Pat <
(SI_KILL 0xbf800000)
>;
let Predicates = [isSICI] in {
/* int_SI_vs_load_input */
def : Pat<
(SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
} // End Predicates = [isSICI]
/* int_SI_export */
def : Pat <
(int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
@ -2156,9 +2169,13 @@ def : Pat <
//===----------------------------------------------------------------------===//
let Predicates = [UnsafeFPMath] in {
def : RcpPat<V_RCP_F64_e32, f64>;
defm : RsqPat<V_RSQ_F64_e32, f64>;
defm : RsqPat<V_RSQ_F32_e32, f32>;
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
}
//===----------------------------------------------------------------------===//
@ -2675,13 +2692,6 @@ def : Pat <
(V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
>;
def : Pat<
(fdiv f64:$src0, f64:$src1),
(V_MUL_F64 0 /* src0_modifiers */, $src0,
0 /* src1_modifiers */, (V_RCP_F64_e32 $src1),
0 /* clamp */, 0 /* omod */)
>;
def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
@ -2716,16 +2726,12 @@ class Ext32Pat <SDNode ext> : Pat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
let Predicates = [isSICI] in {
// Offset in an 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
} // End Predicates = [isSICI]
// The multiplication scales from [0,1] to the unsigned integer range
def : Pat <
(AMDGPUurecip i32:$src0),
@ -2907,7 +2913,6 @@ class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
(Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
>;
let Predicates = [isSICI] in {
def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
@ -2915,7 +2920,6 @@ def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
} // End Predicates = [isSICI]
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
@ -2954,14 +2958,12 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
>;
}
let Predicates = [isSICI] in {
defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN,
BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>;
defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN,
BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>;
defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
} // End Predicates = [isSICI]
class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
@ -2969,13 +2971,11 @@ class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
(Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
>;
let Predicates = [isSICI] in {
def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
} // End Predicates = [isSICI]
/*
class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
@ -3245,6 +3245,12 @@ def : Pat <
(V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
>;
def : Pat <
(i1 (trunc i64:$a)),
(V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1),
(EXTRACT_SUBREG $a, sub0)), 1)
>;
def : Pat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 0x00ff00ff),
@ -3257,6 +3263,28 @@ def : Pat <
(V_CNDMASK_B32_e64 $src0, $src1, $src2)
>;
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
let Predicates = [isCI] in {
// Convert (x - floor(x)) to fract(x)
def : Pat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
(V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
// Convert (x + (-floor(x))) to fract(x)
def : Pat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
} // End Predicates = [isCI]
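Purely as an illustration (not part of the diff): the fract patterns above rely on the identity fract(x) = x - floor(x). A throwaway C++ check of that identity on a few sample values might read:

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative only: x - floor(x) is the fractional part the patterns match.
  const double xs[] = {1.75, -0.25, 3.0};
  for (double x : xs)
    std::printf("x=%+.2f  x-floor(x)=%.2f\n", x, x - std::floor(x));
  return 0;
}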
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//

View File

@ -88,7 +88,8 @@ class SILowerControlFlowPass : public MachineFunctionPass {
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
void IndirectSrc(MachineInstr &MI);
void IndirectDst(MachineInstr &MI);
@ -323,7 +324,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
@ -333,8 +334,14 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
unsigned Idx = MI.getOperand(3).getReg();
if (AMDGPU::SReg_32RegClass.contains(Idx)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
if (Offset) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
.addReg(Idx)
.addImm(Offset);
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addReg(Idx);
}
MBB.insert(I, MovRel);
} else {
@ -363,6 +370,11 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
.addReg(AMDGPU::VCC);
if (Offset) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
.addReg(AMDGPU::M0)
.addImm(Offset);
}
// Do the actual move
MBB.insert(I, MovRel);
@ -384,6 +396,33 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
MI.eraseFromParent();
}
/// \param @VecReg The register which holds element zero of the vector
/// being addressed into.
/// \param[out] @Reg The base register to use in the indirect addressing instruction.
/// \param[in,out] @Offset As an input, this is the constant offset part of the
// indirect Index. e.g. v0 = v[VecReg + Offset]
// As an output, this is a constant value that needs
// to be added to the value stored in M0.
void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
unsigned &Reg,
int &Offset) {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
if (RegIdx < 0) {
Offset = RegIdx;
RegIdx = 0;
} else {
Offset = 0;
}
Reg = RC->getRegister(RegIdx);
}
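To make the offset folding above concrete, here is a small self-contained C++ sketch (illustrative only; the plain integer "hardware index" stands in for the register class machinery): a negative combined index is pushed back into the runtime offset and the base register is clamped to element zero, otherwise the whole index is folded into the base register.

#include <cstdio>

// Illustrative only: mirrors the RegIdx/Offset folding done in
// computeIndirectRegAndOffset, using a plain integer hardware index.
static void computeRegAndOffset(int HWIndexOfSub0, int &RegIdx, int &Offset) {
  RegIdx = HWIndexOfSub0 + Offset;
  if (RegIdx < 0) {
    Offset = RegIdx; // negative part stays in M0 at runtime
    RegIdx = 0;      // base register clamps to the first element
  } else {
    Offset = 0;      // whole index folded into the base register
  }
}

int main() {
  int RegIdx, Offset;

  Offset = 3;                              // vector starts at hw index 8
  computeRegAndOffset(8, RegIdx, Offset);
  std::printf("RegIdx=%d Offset=%d\n", RegIdx, Offset); // RegIdx=11 Offset=0

  Offset = -5;                             // vector starts at hw index 2
  computeRegAndOffset(2, RegIdx, Offset);
  std::printf("RegIdx=%d Offset=%d\n", RegIdx, Offset); // RegIdx=0 Offset=-3
  return 0;
}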
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
@ -391,18 +430,18 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vec = MI.getOperand(2).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
if (!SubReg)
SubReg = Vec;
int Off = MI.getOperand(4).getImm();
unsigned Reg;
computeIndirectRegAndOffset(Vec, Reg, Off);
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SubReg + Off)
.addReg(Reg)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
LoadM0(MI, MovRel);
LoadM0(MI, MovRel, Off);
}
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
@ -411,20 +450,20 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Off = MI.getOperand(4).getImm();
int Off = MI.getOperand(4).getImm();
unsigned Val = MI.getOperand(5).getReg();
unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
if (!SubReg)
SubReg = Dst;
unsigned Reg;
computeIndirectRegAndOffset(Dst, Reg, Off);
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(SubReg + Off, RegState::Define)
.addReg(Reg, RegState::Define)
.addReg(Val)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
LoadM0(MI, MovRel);
LoadM0(MI, MovRel, Off);
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
@ -447,7 +486,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
if (TII->isDS(MI.getOpcode()))
if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
@ -513,12 +552,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_DST_V16:
IndirectDst(MI);
break;
case AMDGPU::V_INTERP_P1_F32:
case AMDGPU::V_INTERP_P2_F32:
case AMDGPU::V_INTERP_MOV_F32:
NeedWQM = true;
break;
}
}
}

View File

@ -14,7 +14,6 @@
#include "SIRegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@ -47,13 +46,31 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::VGPR255);
Reserved.set(AMDGPU::VGPR254);
// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
// Assume XNACK_MASK is unused.
unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4;
for (unsigned i = Limit; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true);
for (; R.isValid(); ++R)
Reserved.set(*R);
}
}
return Reserved;
}
unsigned SIRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
// FIXME: We should adjust the max number of waves based on LDS size.
unsigned SGPRLimit = getNumSGPRsAllowed(ST.getMaxWavesPerCU());
unsigned SGPRLimit = getNumSGPRsAllowed(ST.getGeneration(),
ST.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(ST.getMaxWavesPerCU());
for (regclass_iterator I = regclass_begin(), E = regclass_end();
@ -204,7 +221,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Ctx.emitError("Ran out of VGPRs for spilling SGPR");
}
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
.addReg(SubReg)
.addImm(Spill.Lane);
@ -236,7 +255,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (isM0)
SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane)
.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
@ -245,7 +266,22 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(SubReg);
}
}
TII->insertNOPs(MI, 3);
// TODO: only do this when it is needed
switch (ST.getGeneration()) {
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
// "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI
TII->insertNOPs(MI, 3);
break;
case AMDGPUSubtarget::SEA_ISLANDS:
break;
default: // VOLCANIC_ISLANDS and later
// "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI
// and later. This also applies to VALUs which write VCC, but we're
// unlikely to see VMEM use VCC.
TII->insertNOPs(MI, 4);
}
MI->eraseFromParent();
break;
}
@ -490,14 +526,24 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
}
}
unsigned SIRegisterInfo::getNumSGPRsAllowed(unsigned WaveCount) const {
switch(WaveCount) {
case 10: return 48;
case 9: return 56;
case 8: return 64;
case 7: return 72;
case 6: return 80;
case 5: return 96;
default: return 103;
unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
unsigned WaveCount) const {
if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
switch (WaveCount) {
case 10: return 80;
case 9: return 80;
case 8: return 96;
default: return 102;
}
} else {
switch(WaveCount) {
case 10: return 48;
case 9: return 56;
case 8: return 64;
case 7: return 72;
case 6: return 80;
case 5: return 96;
default: return 103;
}
}
}

View File

@ -17,6 +17,7 @@
#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Support/Debug.h"
namespace llvm {
@ -111,7 +112,8 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
/// concurrent waves.
unsigned getNumSGPRsAllowed(unsigned WaveCount) const;
unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
unsigned WaveCount) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;

View File

@ -209,7 +209,9 @@ def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
}
class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
let OperandNamespace = "AMDGPU";

View File

@ -136,6 +136,32 @@ class VOP3e_vi <bits<10> op> : Enc64 {
let Inst{63} = src2_modifiers{0};
}
class VOP3be_vi <bits<10> op> : Enc64 {
bits<8> vdst;
bits<2> src0_modifiers;
bits<9> src0;
bits<2> src1_modifiers;
bits<9> src1;
bits<2> src2_modifiers;
bits<9> src2;
bits<7> sdst;
bits<2> omod;
bits<1> clamp;
let Inst{7-0} = vdst;
let Inst{14-8} = sdst;
let Inst{15} = clamp;
let Inst{25-16} = op;
let Inst{31-26} = 0x34; //encoding
let Inst{40-32} = src0;
let Inst{49-41} = src1;
let Inst{58-50} = src2;
let Inst{60-59} = omod;
let Inst{61} = src0_modifiers{0};
let Inst{62} = src1_modifiers{0};
let Inst{63} = src2_modifiers{0};
}
class EXPe_vi : EXPe {
let Inst{31-26} = 0x31; //encoding
}

View File

@ -9,18 +9,6 @@
// Instruction definitions for VI and newer.
//===----------------------------------------------------------------------===//
let SubtargetPredicate = isVI in {
defm BUFFER_LOAD_DWORD_VI : MUBUF_Load_Helper_vi <
0x14, "buffer_load_dword", VGPR_32, i32, global_load
>;
defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi <
0x03, "buffer_load_format_xyzw", VReg_128
>;
} // End SubtargetPredicate = isVI
//===----------------------------------------------------------------------===//
// SMEM Patterns
@ -28,37 +16,10 @@ defm BUFFER_LOAD_FORMAT_XYZW_VI : MUBUF_Load_Helper_vi <
let Predicates = [isVI] in {
// 1. Offset as 8bit DWORD immediate
// 1. Offset as 20bit DWORD immediate
def : Pat <
(SIload_constant v4i32:$sbase, IMM20bit:$offset),
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
//===----------------------------------------------------------------------===//
// MUBUF Patterns
//===----------------------------------------------------------------------===//
// Offset in an 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
// Offset in an 32Bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_VI_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
>;
/* int_SI_vs_load_input */
def : Pat<
(SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_VI_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
>;
defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_VI_OFFSET,
BUFFER_LOAD_DWORD_VI_OFFEN,
BUFFER_LOAD_DWORD_VI_IDXEN,
BUFFER_LOAD_DWORD_VI_BOTHEN>;
} // End Predicates = [isVI]

View File

@ -392,12 +392,25 @@ static bool usesTheStack(const MachineFunction &MF) {
return false;
}
void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
unsigned &CallOp,
const char *&Symbol) {
CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32;
void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
DebugLoc DL) {
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
bool Is64Bit = STI.is64Bit();
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
if (STI.is64Bit()) {
unsigned CallOp;
if (Is64Bit)
CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
else
CallOp = X86::CALLpcrel32;
const char *Symbol;
if (Is64Bit) {
if (STI.isTargetCygMing()) {
Symbol = "___chkstk_ms";
} else {
@ -407,6 +420,37 @@ void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
Symbol = "_alloca";
else
Symbol = "_chkstk";
MachineInstrBuilder CI;
// All current stack probes take AX and SP as input, clobber flags, and
// preserve all registers. x86_64 probes leave RSP unmodified.
if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
// For the large code model, we have to call through a register. Use R11,
// as it is scratch in all supported calling conventions.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
.addExternalSymbol(Symbol);
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
} else {
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
}
unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
CI.addReg(AX, RegState::Implicit)
.addReg(SP, RegState::Implicit)
.addReg(AX, RegState::Define | RegState::Implicit)
.addReg(SP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
if (Is64Bit) {
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
// themselves. It also does not clobber %rax so we can reuse it when
// adjusting %rsp.
BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
}
}
/// emitPrologue - Push callee-saved registers onto the stack, which
@ -739,11 +783,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
if (NumBytes >= StackProbeSize && UseStackProbe) {
const char *StackProbeSymbol;
unsigned CallOp;
getStackProbeFunction(STI, CallOp, StackProbeSymbol);
// Check whether EAX is livein for this function.
bool isEAXAlive = isEAXLiveIn(MF);
@ -772,22 +811,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(MBB, MBBI, DL,
TII.get(CallOp))
.addExternalSymbol(StackProbeSymbol)
.addReg(StackPtr, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
// Save a pointer to the MI where we set AX.
MachineBasicBlock::iterator SetRAX = MBBI;
--SetRAX;
// Call __chkstk, __chkstk_ms, or __alloca.
emitStackProbeCall(MF, MBB, MBBI, DL);
// Apply the frame setup flag to all inserted instrs.
for (; SetRAX != MBBI; ++SetRAX)
SetRAX->setFlag(MachineInstr::FrameSetup);
if (Is64Bit) {
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
// themself. It also does not clobber %rax so we can reuse it when
// adjusting %rsp.
BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
.addReg(StackPtr)
.addReg(X86::RAX)
.setMIFlag(MachineInstr::FrameSetup);
}
if (isEAXAlive) {
// Restore EAX
MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),

View File

@ -27,9 +27,11 @@ class X86FrameLowering : public TargetFrameLowering {
explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
: TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
static void getStackProbeFunction(const X86Subtarget &STI,
unsigned &CallOp,
const char *&Symbol);
/// Emit a call to the target's stack probe function. This is required for all
/// large stack allocations on Windows. The caller is required to materialize
/// the number of bytes to probe in RAX/EAX.
static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc DL);
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,

View File

@ -15,6 +15,7 @@
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
@ -10094,12 +10095,12 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
VT.getVectorNumElements() / 2);
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1);
if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
@ -10112,7 +10113,15 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
// Otherwise form a 128-bit permutation.
// FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
int MaskLO = Mask[0];
if (MaskLO == SM_SentinelUndef)
MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
int MaskHI = Mask[2];
if (MaskHI == SM_SentinelUndef)
MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, MVT::i8));
}
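For illustration only (not part of the change): the immediate built above packs the low and high 128-bit lane selectors into one byte, with an undef lane falling back to its neighbour or to lane 0. A tiny standalone C++ sketch of that computation, using -1 for an undef mask element, might be:

#include <cstdio>

// Illustrative only: packs two 128-bit lane selectors into a VPERM2X128-style
// immediate, treating -1 as "undef" the way the patched code does.
static unsigned permMask(int m0, int m1, int m2, int m3) {
  int MaskLO = (m0 == -1) ? ((m1 == -1) ? 0 : m1) : m0;
  int MaskHI = (m2 == -1) ? ((m3 == -1) ? 0 : m3) : m2;
  return unsigned(MaskLO / 2) | unsigned(MaskHI / 2) << 4;
}

int main() {
  std::printf("0x%02x\n", permMask(0, 1, 6, 7));   // 0x30: low lane from V1, high lane from V2
  std::printf("0x%02x\n", permMask(2, 3, -1, -1)); // 0x01: undef high lane defaults to lane 0
  return 0;
}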
@ -17172,6 +17181,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
// but second operand for node/instruction.
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
case Intrinsic::x86_avx512_mask_valign_q_512:
case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
@ -21076,47 +21092,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
assert(!Subtarget->isTargetMachO());
// The lowering is pretty easy: we're just emitting the call to _alloca. The
// non-trivial part is impdef of ESP.
if (Subtarget->isTargetWin64()) {
if (Subtarget->isTargetCygMing()) {
// ___chkstk(Mingw64):
// Clobbers R10, R11, RAX and EFLAGS.
// Updates RSP.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol("___chkstk")
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::RSP, RegState::Implicit)
.addReg(X86::RAX, RegState::Define | RegState::Implicit)
.addReg(X86::RSP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
} else {
// __chkstk(MSVCRT): does not update stack pointer.
// Clobbers R10, R11 and EFLAGS.
BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
.addExternalSymbol("__chkstk")
.addReg(X86::RAX, RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
// RAX has the offset to be subtracted from RSP.
BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
.addReg(X86::RSP)
.addReg(X86::RAX);
}
} else {
const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
Subtarget->isTargetWindowsItanium())
? "_chkstk"
: "_alloca";
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
.addReg(X86::EAX, RegState::Implicit)
.addReg(X86::ESP, RegState::Implicit)
.addReg(X86::EAX, RegState::Define | RegState::Implicit)
.addReg(X86::ESP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
}
X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@ -25558,45 +25534,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
LHS.getValueType(), RHS, LHS.getOperand(1));
return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
LHS.getOperand(1));
return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
DAG.getConstant(0, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
RHS.getValueType(), LHS, RHS.getOperand(1));
return DAG.getSetCC(SDLoc(N), N->getValueType(0),
addV, DAG.getConstant(0, addV.getValueType()), CC);
SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
RHS.getOperand(1));
return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
DAG.getConstant(0, addV.getValueType()), CC);
}
if (VT.getScalarType() == MVT::i1) {
bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
if (!IsSEXT0 && !IsVZero0)
return SDValue();
bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
(RHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
(LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
if (!IsSEXT1 && !IsVZero1)
return SDValue();
if (!IsSEXT0 || !IsVZero1) {
// Swap the operands and update the condition code.
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
}
if (IsSEXT0 && IsVZero1) {
assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
if (CC == ISD::SETEQ)
assert(VT == LHS.getOperand(0).getValueType() &&
"Uexpected operand type");
if (CC == ISD::SETGT)
return DAG.getConstant(0, VT);
if (CC == ISD::SETLE)
return DAG.getConstant(1, VT);
if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
"Unexpected condition code!");
return LHS.getOperand(0);
}
if (IsSEXT1 && IsVZero0) {
assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
if (CC == ISD::SETEQ)
return DAG.getNOT(DL, RHS.getOperand(0), VT);
return RHS.getOperand(0);
}
}
return SDValue();
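As an aside (not part of the change): the folds added above follow from the fact that sign-extending an i1 yields either 0 or -1, so comparisons of that value against zero collapse to constants or to the boolean (possibly negated). A minimal C++ check of those identities could be:

#include <cassert>

// Illustrative only: s = sign-extend of an i1 value b, so s is 0 or -1.
int main() {
  for (int b = 0; b <= 1; ++b) {
    const int s = b ? -1 : 0;     // sext(i1 b)
    assert((s >  0) == false);    // SETGT -> constant 0
    assert((s <= 0) == true);     // SETLE -> constant 1
    assert((s == 0) == !b);       // SETEQ -> NOT b
    assert((s >= 0) == !b);       // SETGE -> NOT b
    assert((s != 0) == (b != 0)); // SETNE -> b
    assert((s <  0) == (b != 0)); // SETLT -> b
  }
  return 0;
}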

View File

@ -279,7 +279,8 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
}
let isCall = 1, isCodeGenOnly = 1 in
// __chkstk(MSVC): clobber R10, R11 and EFLAGS.
// __chkstk(MSVC): clobber R10, R11 and EFLAGS
// ___chkstk_ms(Mingw64): clobber R10, R11 and EFLAGS
// ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
let Defs = [RAX, R10, R11, RSP, EFLAGS],
Uses = [RSP] in {

View File

@ -175,8 +175,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),

View File

@ -47,6 +47,8 @@ using namespace llvm;
static cl::opt<std::string>
DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
cl::ValueRequired);
static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body",
cl::init(false), cl::Hidden);
GCOVOptions GCOVOptions::getDefault() {
GCOVOptions Options;
@ -312,7 +314,7 @@ namespace {
class GCOVFunction : public GCOVRecord {
public:
GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
bool UseCfgChecksum)
bool UseCfgChecksum, bool ExitBlockBeforeBody)
: SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),
ReturnBlock(1, os) {
this->os = os;
@ -322,11 +324,13 @@ namespace {
uint32_t i = 0;
for (auto &BB : *F) {
// Skip index 1 (0, 2, 3, 4, ...) because that's assigned to the
// ReturnBlock.
bool first = i == 0;
Blocks.insert(std::make_pair(&BB, GCOVBlock(i++ + !first, os)));
// Skip index 1 if it's assigned to the ReturnBlock.
if (i == 1 && ExitBlockBeforeBody)
++i;
Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os)));
}
if (!ExitBlockBeforeBody)
ReturnBlock.Number = i;
std::string FunctionNameAndLine;
raw_string_ostream FNLOS(FunctionNameAndLine);
@ -469,7 +473,7 @@ static bool functionHasLines(Function *F) {
if (Loc.isUnknown()) continue;
// Artificial lines such as calls to the global constructors.
if (Loc.getLine() == 0) continue;
if (Loc.getLine() == 0) continue;
return true;
}
@ -513,7 +517,8 @@ void GCOVProfiler::emitProfileNotes() {
EntryBlock.splitBasicBlock(It);
Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++,
Options.UseCfgChecksum));
Options.UseCfgChecksum,
DefaultExitBlockBeforeBody));
GCOVFunction &Func = *Funcs.back();
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {

View File

@ -2183,12 +2183,16 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS,
// Handle the floating point versions of equality comparisons too.
if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
(isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
// Floating point -0.0 and 0.0 compare equal, so we can't
// propagate a constant based on that comparison.
// Floating point -0.0 and 0.0 compare equal, so we can only
// propagate values if we know that we have a constant and that
// its value is non-zero.
// FIXME: We should do this optimization if 'no signed zeros' is
// applicable via an instruction-level fast-math-flag or some other
// indicator that relaxed FP semantics are being used.
if (!isa<ConstantFP>(Op1) || !cast<ConstantFP>(Op1)->isZero())
if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
Worklist.push_back(std::make_pair(Op0, Op1));
}
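As a worked aside (not part of the change): the guard matters because +0.0 and -0.0 compare equal yet remain distinguishable, so an equality against a zero constant must not be used to substitute one value for the other. A tiny C++ demonstration:

#include <cmath>
#include <cstdio>

int main() {
  double x = -0.0;
  // x == 0.0 is true, yet x and 0.0 are distinguishable values, so a naive
  // "x equals the constant 0.0, substitute 0.0" rewrite would be observable.
  std::printf("x == 0.0     : %d\n", x == 0.0);               // 1
  std::printf("signbit(x)   : %d\n", (int)std::signbit(x));   // 1
  std::printf("signbit(0.0) : %d\n", (int)std::signbit(0.0)); // 0
  return 0;
}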

View File

@ -288,14 +288,11 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
IntegerType *IT = cast<IntegerType>(IVOperand->getType());
Value *OtherOperand = nullptr;
int OtherOperandIdx = -1;
if (BO->getOperand(0) == IVOperand) {
OtherOperand = BO->getOperand(1);
OtherOperandIdx = 1;
} else {
assert(BO->getOperand(1) == IVOperand && "only other use!");
OtherOperand = BO->getOperand(0);
OtherOperandIdx = 0;
}
bool Changed = false;

View File

@ -1,11 +1,11 @@
This is a set of individual patches, which contain all the customizations to
llvm/clang currently in the FreeBSD base system. These can be applied in
alphabetical order to a pristine llvm/clang 3.6.0 source tree, for example by
alphabetical order to a pristine llvm/clang 3.6.1 source tree, for example by
doing:
svn co https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/final llvm-3.6.0
svn co https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/final llvm-3.6.0/tools/clang
cd llvm-3.6.0
svn co https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_361/final llvm-3.6.1
svn co https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_361/final llvm-3.6.1/tools/clang
cd llvm-3.6.1
for p in /usr/src/contrib/llvm/patches/patch-*.diff; do
patch -p0 -f -F0 -E -i $p -s || break
done

View File

@ -44,7 +44,7 @@ Index: tools/clang/include/clang/Sema/Sema.h
===================================================================
--- tools/clang/include/clang/Sema/Sema.h
+++ tools/clang/include/clang/Sema/Sema.h
@@ -8566,6 +8566,7 @@ class Sema {
@@ -8567,6 +8567,7 @@ class Sema {
FST_Strftime,
FST_Strfmon,
FST_Kprintf,
@ -230,7 +230,7 @@ Index: tools/clang/lib/Sema/SemaChecking.cpp
===================================================================
--- tools/clang/lib/Sema/SemaChecking.cpp
+++ tools/clang/lib/Sema/SemaChecking.cpp
@@ -2584,6 +2584,7 @@ Sema::FormatStringType Sema::GetFormatStringType(c
@@ -2603,6 +2603,7 @@ Sema::FormatStringType Sema::GetFormatStringType(c
.Case("strftime", FST_Strftime)
.Case("strfmon", FST_Strfmon)
.Cases("kprintf", "cmn_err", "vcmn_err", "zcmn_err", FST_Kprintf)
@ -238,7 +238,7 @@ Index: tools/clang/lib/Sema/SemaChecking.cpp
.Default(FST_Unknown);
}
@@ -3365,6 +3366,43 @@ CheckPrintfHandler::HandlePrintfSpecifier(const an
@@ -3384,6 +3385,43 @@ CheckPrintfHandler::HandlePrintfSpecifier(const an
CoveredArgs.set(argIndex);
}
@ -282,7 +282,7 @@ Index: tools/clang/lib/Sema/SemaChecking.cpp
// Check for using an Objective-C specific conversion specifier
// in a non-ObjC literal.
if (!ObjCContext && CS.isObjCArg()) {
@@ -3988,7 +4026,8 @@ void Sema::CheckFormatString(const StringLiteral *
@@ -4007,7 +4045,8 @@ void Sema::CheckFormatString(const StringLiteral *
return;
}
@ -292,7 +292,7 @@ Index: tools/clang/lib/Sema/SemaChecking.cpp
CheckPrintfHandler H(*this, FExpr, OrigFormatExpr, firstDataArg,
numDataArgs, (Type == FST_NSString),
Str, HasVAListArg, Args, format_idx,
@@ -3996,7 +4035,8 @@ void Sema::CheckFormatString(const StringLiteral *
@@ -4015,7 +4054,8 @@ void Sema::CheckFormatString(const StringLiteral *
if (!analyze_format_string::ParsePrintfString(H, Str, Str + StrLen,
getLangOpts(),

View File

@ -596,7 +596,7 @@ Index: lib/Target/X86/X86FrameLowering.cpp
static unsigned getLEArOpcode(unsigned IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
@@ -1848,100 +1865,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi
@@ -1882,100 +1899,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi
#endif
}
@ -697,7 +697,7 @@ Index: lib/Target/X86/X86FrameLowering.cpp
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -1956,7 +1879,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -1990,7 +1913,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
bool IsLP64 = STI.isTarget64BitLP64();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
@ -706,7 +706,7 @@ Index: lib/Target/X86/X86FrameLowering.cpp
I = MBB.erase(I);
if (!reserveCallFrame) {
@@ -1976,24 +1899,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -2010,24 +1933,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
MachineInstr *New = nullptr;
@ -740,7 +740,7 @@ Index: lib/Target/X86/X86FrameLowering.cpp
unsigned Opc = getADDriOpcode(IsLP64, Amount);
New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr).addImm(Amount);
@@ -2011,13 +1928,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -2045,13 +1962,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
return;
}
@ -761,7 +761,7 @@ Index: lib/Target/X86/X86FrameLowering.h
===================================================================
--- lib/Target/X86/X86FrameLowering.h
+++ lib/Target/X86/X86FrameLowering.h
@@ -64,6 +64,8 @@ class X86FrameLowering : public TargetFrameLowerin
@@ -66,6 +66,8 @@ class X86FrameLowering : public TargetFrameLowerin
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;

File diff suppressed because it is too large

View File

@ -67,7 +67,7 @@ Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
}
}
@@ -7696,11 +7696,8 @@ void SelectionDAGISel::LowerArguments(const Functi
@@ -7697,11 +7697,8 @@ void SelectionDAGISel::LowerArguments(const Functi
}
if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
Flags.setNest();
@ -80,7 +80,7 @@ Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Flags.setOrigAlign(OriginalAlignment);
MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7715,6 +7712,8 @@ void SelectionDAGISel::LowerArguments(const Functi
@@ -7716,6 +7713,8 @@ void SelectionDAGISel::LowerArguments(const Functi
MyFlags.Flags.setOrigAlign(1);
Ins.push_back(MyFlags);
}
@ -287,7 +287,7 @@ Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -11280,7 +11280,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas
@@ -11285,7 +11285,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas
return (Members > 0 && Members <= 4);
}
@ -298,7 +298,7 @@ Index: lib/Target/ARM/ARMISelLowering.cpp
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
if (getEffectiveCallingConv(CallConv, isVarArg) !=
@@ -11289,7 +11291,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse
@@ -11294,7 +11296,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse
HABaseType Base = HA_UNKNOWN;
uint64_t Members = 0;

View File

@ -1,55 +0,0 @@
Pull in r230058 from upstream llvm trunk (by Benjamin Kramer):
LoopRotate: When reconstructing loop simplify form don't split edges
from indirectbrs.
Yet another chapter in the endless story. While this looks like we
leave the loop in a non-canonical state this replicates the logic in
LoopSimplify so it doesn't diverge from the canonical form in any way.
PR21968
This fixes a "Cannot split critical edge from IndirectBrInst" assertion
failure when building the devel/radare2 port.
Introduced here: https://svnweb.freebsd.org/changeset/base/279161
Index: lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRotation.cpp
+++ lib/Transforms/Scalar/LoopRotation.cpp
@@ -535,6 +535,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool Simplifi
Loop *PredLoop = LI->getLoopFor(*PI);
if (!PredLoop || PredLoop->contains(Exit))
continue;
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ continue;
SplitLatchEdge |= L->getLoopLatch() == *PI;
BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this);
ExitSplit->moveBefore(Exit);
Index: test/Transforms/LoopRotate/crash.ll
===================================================================
--- test/Transforms/LoopRotate/crash.ll
+++ test/Transforms/LoopRotate/crash.ll
@@ -153,3 +153,21 @@ entry:
"5": ; preds = %"3", %entry
ret void
}
+
+; PR21968
+define void @test8(i1 %C, i8* %P) #0 {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ br i1 %C, label %l_bad, label %for.body
+
+for.body: ; preds = %for.cond
+ indirectbr i8* %P, [label %for.inc, label %l_bad]
+
+for.inc: ; preds = %for.body
+ br label %for.cond
+
+l_bad: ; preds = %for.body, %for.cond
+ ret void
+}

View File

@ -1,811 +0,0 @@
Pull in r231227 from upstream llvm trunk (by Kristof Beyls):
Fix PR22408 - LLVM producing AArch64 TLS relocations that GNU linkers
cannot handle yet.
As is described at http://llvm.org/bugs/show_bug.cgi?id=22408, the
GNU linkers ld.bfd and ld.gold currently only support a subset of the
whole range of AArch64 ELF TLS relocations. Furthermore, they assume
that some of the code sequences to access thread-local variables are
produced in a very specific sequence. When the sequence is not as the
linker expects, it can silently mis-relaxe/mis-optimize the
instructions.
Even if that wouldn't be the case, it's good to produce the exact
sequence, as that ensures that linkers can perform optimizing
relaxations.
This patch:
* implements support for 16MiB TLS area size instead of 4GiB TLS area
size. Ideally clang would grow an -mtls-size option to allow
support for both, but that's not part of this patch.
* by default doesn't produce local dynamic access patterns, as even
modern ld.bfd and ld.gold linkers do not support the associated
relocations. An option (-aarch64-elf-ldtls-generation) is added to
enable generation of local dynamic code sequence, but is off by
default.
* makes sure that the exact expected code sequence for local dynamic
and general dynamic accesses is produced, by making use of a new
pseudo instruction. The patch also removes two
(AArch64ISD::TLSDESC_BLR, AArch64ISD::TLSDESC_CALL) pre-existing
AArch64-specific pseudo SDNode instructions that are superseded by
the new one (TLSDESC_CALLSEQ).
Introduced here: https://svnweb.freebsd.org/changeset/base/280865
Index: lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -12,6 +12,8 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "AArch64.h"
#include "AArch64MCInstLower.h"
#include "AArch64MachineFunctionInfo.h"
@@ -494,12 +496,47 @@ void AArch64AsmPrinter::EmitInstruction(const Mach
EmitToStreamer(OutStreamer, TmpInst);
return;
}
- case AArch64::TLSDESC_BLR: {
- MCOperand Callee, Sym;
- MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
- MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
+ case AArch64::TLSDESC_CALLSEQ: {
+ /// lower this to:
+ /// adrp x0, :tlsdesc:var
+ /// ldr x1, [x0, #:tlsdesc_lo12:var]
+ /// add x0, x0, #:tlsdesc_lo12:var
+ /// .tlsdesccall var
+ /// blr x1
+ /// (TPIDR_EL0 offset now in x0)
+ const MachineOperand &MO_Sym = MI->getOperand(0);
+ MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
+ MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
+ MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+ MCInstLowering.lowerOperand(MO_Sym, Sym);
+ MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+ MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);
- // First emit a relocation-annotation. This expands to no code, but requests
+ MCInst Adrp;
+ Adrp.setOpcode(AArch64::ADRP);
+ Adrp.addOperand(MCOperand::CreateReg(AArch64::X0));
+ Adrp.addOperand(SymTLSDesc);
+ EmitToStreamer(OutStreamer, Adrp);
+
+ MCInst Ldr;
+ Ldr.setOpcode(AArch64::LDRXui);
+ Ldr.addOperand(MCOperand::CreateReg(AArch64::X1));
+ Ldr.addOperand(MCOperand::CreateReg(AArch64::X0));
+ Ldr.addOperand(SymTLSDescLo12);
+ Ldr.addOperand(MCOperand::CreateImm(0));
+ EmitToStreamer(OutStreamer, Ldr);
+
+ MCInst Add;
+ Add.setOpcode(AArch64::ADDXri);
+ Add.addOperand(MCOperand::CreateReg(AArch64::X0));
+ Add.addOperand(MCOperand::CreateReg(AArch64::X0));
+ Add.addOperand(SymTLSDescLo12);
+ Add.addOperand(MCOperand::CreateImm(AArch64_AM::getShiftValue(0)));
+ EmitToStreamer(OutStreamer, Add);
+
+ // Emit a relocation-annotation. This expands to no code, but requests
// the following instruction gets an R_AARCH64_TLSDESC_CALL.
MCInst TLSDescCall;
TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
@@ -506,12 +543,10 @@ void AArch64AsmPrinter::EmitInstruction(const Mach
TLSDescCall.addOperand(Sym);
EmitToStreamer(OutStreamer, TLSDescCall);
- // Other than that it's just a normal indirect call to the function loaded
- // from the descriptor.
- MCInst BLR;
- BLR.setOpcode(AArch64::BLR);
- BLR.addOperand(Callee);
- EmitToStreamer(OutStreamer, BLR);
+ MCInst Blr;
+ Blr.setOpcode(AArch64::BLR);
+ Blr.addOperand(MCOperand::CreateReg(AArch64::X1));
+ EmitToStreamer(OutStreamer, Blr);
return;
}
Index: lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
===================================================================
--- lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -62,10 +62,10 @@ struct LDTLSCleanup : public MachineFunctionPass {
for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
++I) {
switch (I->getOpcode()) {
- case AArch64::TLSDESC_BLR:
+ case AArch64::TLSDESC_CALLSEQ:
// Make sure it's a local dynamic access.
- if (!I->getOperand(1).isSymbol() ||
- strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+ if (!I->getOperand(0).isSymbol() ||
+ strcmp(I->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_"))
break;
if (TLSBaseAddrReg)
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -64,10 +64,18 @@ EnableAArch64ExtrGeneration("aarch64-extr-generati
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
+ cl::desc("Allow AArch64 SLI/SRI formation"),
+ cl::init(false));
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+ "aarch64-elf-ldtls-generation", cl::Hidden,
+ cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+ cl::init(false));
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
: TargetLowering(TM) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
@@ -760,7 +768,7 @@ const char *AArch64TargetLowering::getTargetNodeNa
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL";
+ case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
case AArch64ISD::ADC: return "AArch64ISD::ADC";
case AArch64ISD::SBC: return "AArch64ISD::SBC";
case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
@@ -3049,61 +3057,34 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
-/// is a function pointer to carry out the resolution. This function takes the
-/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, NZCV) are preserved.
+/// is a function pointer to carry out the resolution.
///
-/// Thus, the ideal call sequence on AArch64 is:
+/// The sequence is:
+/// adrp x0, :tlsdesc:var
+/// ldr x1, [x0, #:tlsdesc_lo12:var]
+/// add x0, x0, #:tlsdesc_lo12:var
+/// .tlsdesccall var
+/// blr x1
+/// (TPIDR_EL0 offset now in x0)
///
-/// adrp x0, :tlsdesc:thread_var
-/// ldr x8, [x0, :tlsdesc_lo12:thread_var]
-/// add x0, x0, :tlsdesc_lo12:thread_var
-/// .tlsdesccall thread_var
-/// blr x8
-/// (TPIDR_EL0 offset now in x0).
-///
-/// The ".tlsdesccall" directive instructs the assembler to insert a particular
-/// relocation to help the linker relax this sequence if it turns out to be too
-/// conservative.
-///
-/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
-/// is harmless.
-SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
- SDValue DescAddr, SDLoc DL,
- SelectionDAG &DAG) const {
+/// The above sequence must be produced unscheduled, to enable the linker to
+/// optimize/relax this sequence.
+/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
+/// above sequence, and expanded really late in the compilation flow, to ensure
+/// the sequence is produced as per above.
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+ SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
- // The function we need to call is simply the first entry in the GOT for this
- // descriptor, load it in preparation.
- SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- // TLS calls preserve all registers except those that absolutely must be
- // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
- // silly).
- const TargetRegisterInfo *TRI =
- getTargetMachine().getSubtargetImpl()->getRegisterInfo();
- const AArch64RegisterInfo *ARI =
- static_cast<const AArch64RegisterInfo *>(TRI);
- const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
- // The function takes only one argument: the address of the descriptor itself
- // in X0.
- SDValue Glue, Chain;
- Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
- Glue = Chain.getValue(1);
-
- // We're now ready to populate the argument list, as with a normal call:
- SmallVector<SDValue, 6> Ops;
+ SmallVector<SDValue, 2> Ops;
Ops.push_back(Chain);
- Ops.push_back(Func);
Ops.push_back(SymAddr);
- Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
- Ops.push_back(DAG.getRegisterMask(Mask));
- Ops.push_back(Glue);
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
- Glue = Chain.getValue(1);
+ Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
+ SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
@@ -3114,9 +3095,18 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SD
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
"ELF TLS only supported in small memory model");
+ // Different choices can be made for the maximum size of the TLS area for a
+ // module. For the small address model, the default TLS size is 16MiB and the
+ // maximum TLS size is 4GiB.
+ // FIXME: add -mtls-size command line option and make it control the 16MiB
+ // vs. 4GiB code sequence generation.
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+ if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
+ if (Model == TLSModel::LocalDynamic)
+ Model = TLSModel::GeneralDynamic;
+ }
SDValue TPOff;
EVT PtrVT = getPointerTy();
@@ -3127,17 +3117,20 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SD
if (Model == TLSModel::LocalExec) {
SDValue HiVar = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
- AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
- DAG.getTargetConstant(16, MVT::i32)),
- 0);
- TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)),
- 0);
+ SDValue TPWithOff_lo =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+ HiVar, DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ SDValue TPWithOff =
+ SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
+ LoVar, DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ return TPWithOff;
} else if (Model == TLSModel::InitialExec) {
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
@@ -3152,19 +3145,6 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SD
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
- // Accesses used in this sequence go via the TLS descriptor which lives in
- // the GOT. Prepare an address we can use to handle this.
- SDValue HiDesc = DAG.getTargetExternalSymbol(
- "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
- SDValue LoDesc = DAG.getTargetExternalSymbol(
- "_TLS_MODULE_BASE_", PtrVT,
- AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- // First argument to the descriptor call is the address of the descriptor
- // itself.
- SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
- DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
@@ -3173,40 +3153,23 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SD
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
- TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
- GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
- AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- SDValue DTPOff =
- SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
- DAG.getTargetConstant(16, MVT::i32)),
- 0);
- DTPOff =
- SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
- DAG.getTargetConstant(0, MVT::i32)),
- 0);
-
- TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, MVT::i32)),
+ 0);
} else if (Model == TLSModel::GeneralDynamic) {
- // Accesses used in this sequence go via the TLS descriptor which lives in
- // the GOT. Prepare an address we can use to handle this.
- SDValue HiDesc = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
- SDValue LoDesc = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0,
- AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- // First argument to the descriptor call is the address of the descriptor
- // itself.
- SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
- DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
@@ -3214,7 +3177,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SD
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
- TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+ TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
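
The local-exec and local-dynamic sequences above now build the thread-pointer offset out of two 12-bit ADD immediates, one of them shifted left by 12 (the :tprel_hi12:/:dtprel_hi12: piece plus the matching _lo12_nc piece), which is exactly why the comment speaks of a 16MiB default TLS size: two 12-bit fields cover 24 bits of offset. A minimal standalone sketch of that split, using a hypothetical offset value that is not taken from the diff:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical offset of a TLS variable inside a <=16MiB TLS block.
  uint64_t Offset = 0x12345;

  // Hi12 is consumed by "add xN, xTP, #:tprel_hi12:var" (immediate shifted
  // left by 12); Lo12 by the following "add x0, xN, #:tprel_lo12_nc:var".
  uint64_t Hi12 = (Offset >> 12) & 0xfff;
  uint64_t Lo12 = Offset & 0xfff;

  assert(((Hi12 << 12) | Lo12) == Offset && "offset must fit in 24 bits");
  printf("hi12=%#llx lo12=%#llx\n", (unsigned long long)Hi12,
         (unsigned long long)Lo12);
  return 0;
}

As the FIXME notes, a future -mtls-size option would choose between this 16MiB sequence and a wider one for the 4GiB maximum.
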
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -29,9 +29,9 @@ enum {
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
CALL, // Function call.
- // Almost the same as a normal call node, except that a TLSDesc relocation is
- // needed so the linker can relax it correctly if possible.
- TLSDESC_CALL,
+ // Produces the full sequence of instructions for getting the thread pointer
+ // offset of a variable into X0, using the TLSDesc model.
+ TLSDESC_CALLSEQ,
ADRP, // Page address of a TargetGlobalAddress operand.
ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand.
LOADgot, // Load from automatically generated descriptor (e.g. Global
@@ -399,8 +399,8 @@ class AArch64TargetLowering : public TargetLowerin
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
- SelectionDAG &DAG) const;
+ SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
+ SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -96,6 +96,19 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCis
def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>]>;
+
+// Generates the general dynamic sequences, i.e.
+// adrp x0, :tlsdesc:var
+// ldr x1, [x0, #:tlsdesc_lo12:var]
+// add x0, x0, #:tlsdesc_lo12:var
+// .tlsdesccall var
+// blr x1
+
+// (the TPIDR_EL0 offset is put directly in X0, hence no "result" here)
+// number of operands (the variable)
+def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
+ [SDTCisPtrTy<0>]>;
+
def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
[SDTCisVT<0, i64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
@@ -229,11 +242,12 @@ def AArch64Prefetch : SDNode<"AArch64ISD::P
def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
-def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
- SDT_AArch64TLSDescCall,
- [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
- SDNPVariadic]>;
+def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
+ SDT_AArch64TLSDescCallSeq,
+ [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+ SDNPVariadic]>;
+
def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
SDT_AArch64WrapperLarge>;
@@ -1049,15 +1063,16 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym)
let AsmString = ".tlsdesccall $sym";
}
-// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
-// gets expanded to two MCInsts during lowering.
-let isCall = 1, Defs = [LR] in
-def TLSDESC_BLR
- : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
- [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+// FIXME: maybe the scratch register used shouldn't be fixed to X1?
+// FIXME: can "hasSideEffects be dropped?
+let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
+ isCodeGenOnly = 1 in
+def TLSDESC_CALLSEQ
+ : Pseudo<(outs), (ins i64imm:$sym),
+ [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>;
+def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
+ (TLSDESC_CALLSEQ texternalsym:$sym)>;
-def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
- (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Conditional branch (immediate) instruction.
//===----------------------------------------------------------------------===//
Index: lib/Target/AArch64/AArch64MCInstLower.cpp
===================================================================
--- lib/Target/AArch64/AArch64MCInstLower.cpp
+++ lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -22,9 +22,12 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
+extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration;
+
AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
: Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
@@ -84,10 +87,16 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandEL
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
Model = Printer.TM.getTLSModel(GV);
+ if (!EnableAArch64ELFLocalDynamicTLSGeneration &&
+ Model == TLSModel::LocalDynamic)
+ Model = TLSModel::GeneralDynamic;
+
} else {
assert(MO.isSymbol() &&
StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
"unexpected external TLS symbol");
+ // The general dynamic access sequence is used to get the
+ // address of _TLS_MODULE_BASE_.
Model = TLSModel::GeneralDynamic;
}
switch (Model) {
@@ -123,6 +132,8 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandEL
RefFlags |= AArch64MCExpr::VK_G1;
else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0)
RefFlags |= AArch64MCExpr::VK_G0;
+ else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_HI12)
+ RefFlags |= AArch64MCExpr::VK_HI12;
if (MO.getTargetFlags() & AArch64II::MO_NC)
RefFlags |= AArch64MCExpr::VK_NC;
Index: lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -1229,7 +1229,7 @@ namespace AArch64II {
MO_NO_FLAG,
- MO_FRAGMENT = 0x7,
+ MO_FRAGMENT = 0xf,
/// MO_PAGE - A symbol operand with this flag represents the pc-relative
/// offset of the 4K page containing the symbol. This is used with the
@@ -1257,26 +1257,31 @@ namespace AArch64II {
/// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
MO_G0 = 6,
+ /// MO_HI12 - This flag indicates that a symbol operand represents the bits
+ /// 13-24 of a 64-bit address, used in a arithmetic immediate-shifted-left-
+ /// by-12-bits instruction.
+ MO_HI12 = 7,
+
/// MO_GOT - This flag indicates that a symbol operand represents the
/// address of the GOT entry for the symbol, rather than the address of
/// the symbol itself.
- MO_GOT = 8,
+ MO_GOT = 0x10,
/// MO_NC - Indicates whether the linker is expected to check the symbol
/// reference for overflow. For example in an ADRP/ADD pair of relocations
/// the ADRP usually does check, but not the ADD.
- MO_NC = 0x10,
+ MO_NC = 0x20,
/// MO_TLS - Indicates that the operand being accessed is some kind of
/// thread-local symbol. On Darwin, only one type of thread-local access
/// exists (pre linker-relaxation), but on ELF the TLSModel used for the
/// referee will affect interpretation.
- MO_TLS = 0x20,
+ MO_TLS = 0x40,
/// MO_CONSTPOOL - This flag indicates that a symbol operand represents
/// the address of a constant pool entry for the symbol, rather than the
/// address of the symbol itself.
- MO_CONSTPOOL = 0x40
+ MO_CONSTPOOL = 0x80
};
} // end namespace AArch64II
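
With MO_FRAGMENT widened to 0xf, the low four bits of a symbol operand's target flags select one mutually exclusive "fragment" kind (MO_PAGE through MO_G0, and now MO_HI12), while MO_GOT, MO_NC, MO_TLS and MO_CONSTPOOL move up to stay independent bits. A small standalone illustration of that layout, using only the constants visible in the hunk above (the other fragment kinds are omitted):

#include <cassert>

// Values as defined in AArch64II after this change (subset).
enum {
  MO_FRAGMENT = 0xf,   // mask for the mutually exclusive fragment kinds
  MO_G0 = 6,
  MO_HI12 = 7,         // new: selects the add-immediate-shifted-by-12 form
  MO_GOT = 0x10,
  MO_NC = 0x20,
  MO_TLS = 0x40,
  MO_CONSTPOOL = 0x80
};

int main() {
  // The lowering code combines one fragment with flag bits, e.g. the new
  // local-exec hi part uses MO_TLS | MO_HI12.
  unsigned Flags = MO_TLS | MO_HI12;

  // MCInstLower-style dispatch: mask out the fragment, test the flag bits.
  assert((Flags & MO_FRAGMENT) == MO_HI12);
  assert(Flags & MO_TLS);
  assert(!(Flags & MO_NC));
  return 0;
}
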
Index: test/CodeGen/AArch64/arm64-tls-dynamics.ll
===================================================================
--- test/CodeGen/AArch64/arm64-tls-dynamics.ll
+++ test/CodeGen/AArch64/arm64-tls-dynamics.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -aarch64-elf-ldtls-generation=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -aarch64-elf-ldtls-generation=1 -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOLD %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-NOLD-RELOC %s
@general_dynamic_var = external thread_local global i32
@@ -9,22 +11,34 @@ define i32 @test_generaldynamic() {
%val = load i32* @general_dynamic_var
ret i32 %val
- ; FIXME: the adrp instructions are redundant (if harmless).
-; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
-; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK-NEXT: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
+; CHECK-NOLD: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK-NOLD-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK-NOLD-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK-NOLD-NEXT: .tlsdesccall general_dynamic_var
+; CHECK-NOLD-NEXT: blr [[CALLEE]]
+
+
; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
; CHECK: ldr w0, [x[[TP]], x0]
+; CHECK-NOLD: mrs x[[TP:[0-9]+]], TPIDR_EL0
+; CHECK-NOLD: ldr w0, [x[[TP]], x0]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_CALL
+
}
define i32* @test_generaldynamic_addr() {
@@ -32,12 +46,10 @@ define i32* @test_generaldynamic_addr() {
ret i32* @general_dynamic_var
- ; FIXME: the adrp instructions are redundant (if harmless).
-; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
-; CHECK: .tlsdesccall general_dynamic_var
+; CHECK-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
+; CHECK-NEXT: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
@@ -44,9 +56,15 @@ define i32* @test_generaldynamic_addr() {
; CHECK: add x0, [[TP]], x0
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_CALL
+
}
@local_dynamic_var = external thread_local(localdynamic) global i32
@@ -58,54 +76,71 @@ define i32 @test_localdynamic() {
ret i32 %val
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-NEXT: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
+; CHECK-NEXT: add x[[TPOFF:[0-9]+]], x0, :dtprel_hi12:local_dynamic_var
+; CHECK-NEXT: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
+; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK: ldr w0, [x[[TPIDR]], x[[TPOFF]]]
-; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
-; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+; CHECK-NOLD: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:local_dynamic_var
+; CHECK-NOLD-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:local_dynamic_var]
+; CHECK-NOLD-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:local_dynamic_var
+; CHECK-NOLD-NEXT: .tlsdesccall local_dynamic_var
+; CHECK-NOLD-NEXT: blr [[CALLEE]]
+; CHECK-NOLD: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NOLD: ldr w0, [x[[TPIDR]], x0]
-; CHECK: add x[[TPREL:[0-9]+]], x0, [[DTP_OFFSET]]
-; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
-
-; CHECK: ldr w0, [x[[TPIDR]], x[[TPREL]]]
-
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+; CHECK-RELOC: R_AARCH64_TLSLD_ADD_DTPREL_HI12
+; CHECK-RELOC: R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_CALL
+
}
define i32* @test_localdynamic_addr() {
; CHECK-LABEL: test_localdynamic_addr:
- ret i32* @local_dynamic_var
-
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-NEXT: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
+; CHECK-NEXT: add x[[TPOFF:[0-9]+]], x0, :dtprel_hi12:local_dynamic_var
+; CHECK-NEXT: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
+; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK: add x0, x[[TPIDR]], x[[TPOFF]]
-; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
-; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
+; CHECK-NOLD: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:local_dynamic_var
+; CHECK-NOLD-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:local_dynamic_var]
+; CHECK-NOLD-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:local_dynamic_var
+; CHECK-NOLD-NEXT: .tlsdesccall local_dynamic_var
+; CHECK-NOLD-NEXT: blr [[CALLEE]]
+; CHECK-NOLD: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-NOLD: add x0, x[[TPIDR]], x0
+ ret i32* @local_dynamic_var
-; CHECK: add [[TPREL:x[0-9]+]], x0, [[DTP_OFFSET]]
-
-; CHECK: mrs [[TPIDR:x[0-9]+]], TPIDR_EL0
-
-; CHECK: add x0, [[TPIDR]], [[TPREL]]
-
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
+; CHECK-RELOC: R_AARCH64_TLSLD_ADD_DTPREL_HI12
+; CHECK-RELOC: R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADR_PAGE21
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-NOLD-RELOC: R_AARCH64_TLSDESC_CALL
}
; The entire point of the local-dynamic access model is to have a single call to
@@ -122,11 +157,10 @@ define i32 @test_localdynamic_deduplicate() {
%sum = add i32 %val, %val2
ret i32 %sum
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
-; CHECK: .tlsdesccall _TLS_MODULE_BASE_
+; CHECK: adrp x[[DTPREL_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK-NEXT: ldr [[CALLEE:x[0-9]+]], [x[[DTPREL_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-NEXT: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE
+; CHECK-NEXT: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
; CHECK-NOT: _TLS_MODULE_BASE_
Index: test/CodeGen/AArch64/arm64-tls-execs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-tls-execs.ll
+++ test/CodeGen/AArch64/arm64-tls-execs.ll
@@ -38,14 +38,13 @@ define i32 @test_local_exec() {
; CHECK-LABEL: test_local_exec:
%val = load i32* @local_exec_var
-; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [0bAAA{{[01]+}},A,0b101AAAAA,0x92]
-; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
-; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]]
+; CHECK: mrs x[[R1:[0-9]+]], TPIDR_EL0
+; CHECK: add x[[R2:[0-9]+]], x[[R1]], :tprel_hi12:local_exec_var
+; CHECK: add x[[R3:[0-9]+]], x[[R2]], :tprel_lo12_nc:local_exec_var
+; CHECK: ldr w0, [x[[R3]]]
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
-
+; CHECK-RELOC: R_AARCH64_TLSLE_ADD_TPREL_HI12
+; CHECK-RELOC: R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
ret i32 %val
}
@@ -53,11 +52,11 @@ define i32* @test_local_exec_addr() {
; CHECK-LABEL: test_local_exec_addr:
ret i32* @local_exec_var
-; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var
-; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
-; CHECK: add x0, [[TP]], [[TP_OFFSET]]
+; CHECK: mrs x[[R1:[0-9]+]], TPIDR_EL0
+; CHECK: add x[[R2:[0-9]+]], x[[R1]], :tprel_hi12:local_exec_var
+; CHECK: add x0, x[[R2]], :tprel_lo12_nc:local_exec_var
+; CHECK: ret
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
-; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
+; CHECK-RELOC: R_AARCH64_TLSLE_ADD_TPREL_HI12
+; CHECK-RELOC: R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
}

View File

@ -1,77 +0,0 @@
Pull in r229911 from upstream llvm trunk (by Benjamin Kramer):
MC: Allow multiple comma-separated expressions on the .uleb128 directive.
For compatiblity with GNU as. Binutils documents this as
'.uleb128 expressions'. Subtle, isn't it?
Introduced here: http://svnweb.freebsd.org/changeset/base/281775
Index: lib/MC/MCParser/AsmParser.cpp
===================================================================
--- lib/MC/MCParser/AsmParser.cpp
+++ lib/MC/MCParser/AsmParser.cpp
@@ -3636,22 +3636,28 @@ bool AsmParser::parseDirectiveSpace(StringRef IDVa
}
/// parseDirectiveLEB128
-/// ::= (.sleb128 | .uleb128) expression
+/// ::= (.sleb128 | .uleb128) [ expression (, expression)* ]
bool AsmParser::parseDirectiveLEB128(bool Signed) {
checkForValidSection();
const MCExpr *Value;
- if (parseExpression(Value))
- return true;
+ for (;;) {
+ if (parseExpression(Value))
+ return true;
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in directive");
+ if (Signed)
+ getStreamer().EmitSLEB128Value(Value);
+ else
+ getStreamer().EmitULEB128Value(Value);
- if (Signed)
- getStreamer().EmitSLEB128Value(Value);
- else
- getStreamer().EmitULEB128Value(Value);
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+ }
+
return false;
}
Index: test/MC/ELF/uleb.s
===================================================================
--- test/MC/ELF/uleb.s
+++ test/MC/ELF/uleb.s
@@ -11,16 +11,17 @@ foo:
.uleb128 128
.uleb128 16383
.uleb128 16384
+ .uleb128 23, 42
// ELF_32: Name: .text
// ELF_32: SectionData (
-// ELF_32: 0000: 00017F80 01FF7F80 8001
+// ELF_32: 0000: 00017F80 01FF7F80 8001172A
// ELF_32: )
// ELF_64: Name: .text
// ELF_64: SectionData (
-// ELF_64: 0000: 00017F80 01FF7F80 8001
+// ELF_64: 0000: 00017F80 01FF7F80 8001172A
// ELF_64: )
// MACHO_32: ('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// MACHO_32: ('_section_data', '00017f80 01ff7f80 8001')
+// MACHO_32: ('_section_data', '00017f80 01ff7f80 8001172a')
// MACHO_64: ('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// MACHO_64: ('_section_data', '00017f80 01ff7f80 8001')
+// MACHO_64: ('_section_data', '00017f80 01ff7f80 8001172a')
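
The extra "172a" in the expected section data is simply the ULEB128 encoding of the two new comma-separated operands, 23 (0x17) and 42 (0x2a), each of which fits in one byte. For reference, a minimal standalone encoder (not the MCStreamer implementation) that reproduces the bytes checked by this test:

#include <cstdint>
#include <cstdio>
#include <vector>

// Encode an unsigned value as ULEB128: 7 data bits per byte, high bit set on
// every byte except the last.
static std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Bytes;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;
    Bytes.push_back(Byte);
  } while (Value != 0);
  return Bytes;
}

int main() {
  // The values emitted by the test: 0 1 127 128 16383 16384 23 42.
  for (uint64_t V : {0ull, 1ull, 127ull, 128ull, 16383ull, 16384ull, 23ull, 42ull}) {
    for (uint8_t B : encodeULEB128(V))
      printf("%02x", B);
    printf(" ");
  }
  printf("\n");  // prints: 00 01 7f 8001 ff7f 808001 17 2a
  return 0;
}
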

View File

@ -6972,6 +6972,11 @@ def note_neon_vector_initializer_non_portable_q : Note<
"vcombine_%0%1(vcreate_%0%1(), vcreate_%0%1()) to initialize from integer "
"constants">;
def err_builtin_longjmp_unsupported : Error<
"__builtin_longjmp is not supported for the current target">;
def err_builtin_setjmp_unsupported : Error<
"__builtin_setjmp is not supported for the current target">;
def err_builtin_longjmp_invalid_val : Error<
"argument to __builtin_longjmp must be a constant 1">;
def err_builtin_requires_language : Error<"'%0' is only available in %1">;

View File

@ -79,6 +79,12 @@ class TargetCXXABI {
/// - guard variables are smaller.
GenericAArch64,
/// The generic Mips ABI is a modified version of the Itanium ABI.
///
/// At the moment, only change from the generic ABI in this case is:
/// - representation of member function pointers adjusted as in ARM.
GenericMIPS,
/// The Microsoft ABI is the ABI used by Microsoft Visual Studio (and
/// compatible compilers).
///
@ -114,6 +120,7 @@ class TargetCXXABI {
case GenericARM:
case iOS:
case iOS64:
case GenericMIPS:
return true;
case Microsoft:
@ -130,6 +137,7 @@ class TargetCXXABI {
case GenericARM:
case iOS:
case iOS64:
case GenericMIPS:
return false;
case Microsoft:
@ -212,6 +220,7 @@ class TargetCXXABI {
case GenericItanium:
case iOS: // old iOS compilers did not follow this rule
case Microsoft:
case GenericMIPS:
return true;
}
llvm_unreachable("bad ABI kind");
@ -257,6 +266,7 @@ class TargetCXXABI {
case GenericAArch64:
case GenericARM:
case iOS:
case GenericMIPS:
return UseTailPaddingUnlessPOD03;
// iOS on ARM64 uses the C++11 POD rules. It does not honor the

View File

@ -852,6 +852,12 @@ class TargetInfo : public RefCountedBase<TargetInfo> {
}
}
/// Controls if __builtin_longjmp / __builtin_setjmp can be lowered to
/// llvm.eh.sjlj.longjmp / llvm.eh.sjlj.setjmp.
virtual bool hasSjLjLowering() const {
return false;
}
protected:
virtual uint64_t getPointerWidthV(unsigned AddrSpace) const {
return PointerWidth;

View File

@ -8550,6 +8550,7 @@ class Sema {
bool SemaBuiltinAssume(CallExpr *TheCall);
bool SemaBuiltinAssumeAligned(CallExpr *TheCall);
bool SemaBuiltinLongjmp(CallExpr *TheCall);
bool SemaBuiltinSetjmp(CallExpr *TheCall);
ExprResult SemaBuiltinAtomicOverloaded(ExprResult TheCallResult);
ExprResult SemaAtomicOpsOverloaded(ExprResult TheCallResult,
AtomicExpr::AtomicOp Op);
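
Taken together, the pieces above (the new err_builtin_setjmp_unsupported / err_builtin_longjmp_unsupported diagnostics, the TargetInfo::hasSjLjLowering() hook that only a few targets override, and the new SemaBuiltinSetjmp / SemaBuiltinLongjmp checks) let the frontend reject __builtin_setjmp and __builtin_longjmp on targets that cannot lower llvm.eh.sjlj.*. A toy model of that flow, not the actual Sema code:

#include <cstdio>

// Stand-in for TargetInfo: the default says "no SjLj lowering", targets opt in.
struct TargetInfoMock { virtual bool hasSjLjLowering() const { return false; } };
struct X86TargetMock : TargetInfoMock {
  bool hasSjLjLowering() const override { return true; }
};
struct OtherTargetMock : TargetInfoMock {};  // keeps the default

// Rough shape of the new check: diagnose when the target lacks support.
static bool checkBuiltinSetjmp(const TargetInfoMock &TI) {
  if (!TI.hasSjLjLowering()) {
    fprintf(stderr,
            "error: __builtin_setjmp is not supported for the current target\n");
    return true;  // Sema-style convention: true means an error was emitted
  }
  return false;
}

int main() {
  X86TargetMock X86;
  OtherTargetMock Other;
  checkBuiltinSetjmp(X86);    // accepted
  checkBuiltinSetjmp(Other);  // emits the diagnostic
  return 0;
}
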

View File

@ -273,6 +273,11 @@ namespace clang {
/// outermost scope.
LocalInstantiationScope *cloneScopes(LocalInstantiationScope *Outermost) {
if (this == Outermost) return this;
// Save the current scope from SemaRef since the LocalInstantiationScope
// will overwrite it on construction
LocalInstantiationScope *oldScope = SemaRef.CurrentInstantiationScope;
LocalInstantiationScope *newScope =
new LocalInstantiationScope(SemaRef, CombineWithOuterScope);
@ -299,6 +304,8 @@ namespace clang {
newScope->ArgumentPacks.push_back(NewPack);
}
}
// Restore the saved scope to SemaRef
SemaRef.CurrentInstantiationScope = oldScope;
return newScope;
}
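
The fix is a plain save/restore around the scope construction: constructing a LocalInstantiationScope installs itself as SemaRef.CurrentInstantiationScope, and cloning must not leave the clone installed. The same idea in isolation, as a hand-rolled sketch (an RAII guard such as llvm::SaveAndRestore would work equally well):

#include <cassert>

struct ScopeMock;
static ScopeMock *CurrentScope = nullptr;  // stand-in for CurrentInstantiationScope

struct ScopeMock {
  ScopeMock() { CurrentScope = this; }  // self-registers, like the real ctor
};

// Clone-style helper: build a new scope but leave the global pointer untouched.
static ScopeMock *cloneScope() {
  ScopeMock *Saved = CurrentScope;  // save, as the patch does
  ScopeMock *New = new ScopeMock(); // construction overwrites CurrentScope
  CurrentScope = Saved;             // restore before handing back the clone
  return New;
}

int main() {
  ScopeMock Outer;
  ScopeMock *Clone = cloneScope();
  assert(CurrentScope == &Outer && Clone != &Outer);
  delete Clone;
  return 0;
}
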

View File

@ -682,6 +682,7 @@ CXXABI *ASTContext::createCXXABI(const TargetInfo &T) {
case TargetCXXABI::iOS:
case TargetCXXABI::iOS64:
case TargetCXXABI::GenericAArch64:
case TargetCXXABI::GenericMIPS:
case TargetCXXABI::GenericItanium:
return CreateItaniumCXXABI(*this);
case TargetCXXABI::Microsoft:
@ -7873,7 +7874,7 @@ static GVALinkage basicGVALinkageForFunction(const ASTContext &Context,
// Functions specified with extern and inline in -fms-compatibility mode
// forcibly get emitted. While the body of the function cannot be later
// replaced, the function definition cannot be discarded.
if (FD->getMostRecentDecl()->isMSExternInline())
if (FD->isMSExternInline())
return GVA_StrongODR;
return GVA_DiscardableODR;
@ -8064,6 +8065,7 @@ MangleContext *ASTContext::createMangleContext() {
case TargetCXXABI::GenericAArch64:
case TargetCXXABI::GenericItanium:
case TargetCXXABI::GenericARM:
case TargetCXXABI::GenericMIPS:
case TargetCXXABI::iOS:
case TargetCXXABI::iOS64:
return ItaniumMangleContext::create(*this, getDiagnostics());

View File

@ -998,29 +998,27 @@ class TemplateDiff {
(!HasFromValueDecl && !HasToValueDecl)) &&
"Template argument cannot be both integer and declaration");
unsigned ParamWidth = 128; // Safe default
if (FromDefaultNonTypeDecl->getType()->isIntegralOrEnumerationType())
ParamWidth = Context.getIntWidth(FromDefaultNonTypeDecl->getType());
if (!HasFromInt && !HasToInt && !HasFromValueDecl && !HasToValueDecl) {
Tree.SetNode(FromExpr, ToExpr);
Tree.SetDefault(FromIter.isEnd() && FromExpr, ToIter.isEnd() && ToExpr);
if (FromDefaultNonTypeDecl->getType()->isIntegralOrEnumerationType()) {
if (FromExpr)
HasFromInt = GetInt(Context, FromIter, FromExpr, FromInt);
HasFromInt = GetInt(Context, FromIter, FromExpr, FromInt,
FromDefaultNonTypeDecl->getType());
if (ToExpr)
HasToInt = GetInt(Context, ToIter, ToExpr, ToInt);
HasToInt = GetInt(Context, ToIter, ToExpr, ToInt,
ToDefaultNonTypeDecl->getType());
}
if (HasFromInt && HasToInt) {
Tree.SetNode(FromInt, ToInt, HasFromInt, HasToInt);
Tree.SetSame(IsSameConvertedInt(ParamWidth, FromInt, ToInt));
Tree.SetSame(FromInt == ToInt);
Tree.SetKind(DiffTree::Integer);
} else if (HasFromInt || HasToInt) {
Tree.SetNode(FromInt, ToInt, HasFromInt, HasToInt);
Tree.SetSame(false);
Tree.SetKind(DiffTree::Integer);
} else {
Tree.SetSame(IsEqualExpr(Context, ParamWidth, FromExpr, ToExpr) ||
Tree.SetSame(IsEqualExpr(Context, FromExpr, ToExpr) ||
(FromNullPtr && ToNullPtr));
Tree.SetNullPtr(FromNullPtr, ToNullPtr);
Tree.SetKind(DiffTree::Expression);
@ -1030,11 +1028,17 @@ class TemplateDiff {
if (HasFromInt || HasToInt) {
if (!HasFromInt && FromExpr)
HasFromInt = GetInt(Context, FromIter, FromExpr, FromInt);
HasFromInt = GetInt(Context, FromIter, FromExpr, FromInt,
FromDefaultNonTypeDecl->getType());
if (!HasToInt && ToExpr)
HasToInt = GetInt(Context, ToIter, ToExpr, ToInt);
HasToInt = GetInt(Context, ToIter, ToExpr, ToInt,
ToDefaultNonTypeDecl->getType());
Tree.SetNode(FromInt, ToInt, HasFromInt, HasToInt);
Tree.SetSame(IsSameConvertedInt(ParamWidth, FromInt, ToInt));
if (HasFromInt && HasToInt) {
Tree.SetSame(FromInt == ToInt);
} else {
Tree.SetSame(false);
}
Tree.SetDefault(FromIter.isEnd() && HasFromInt,
ToIter.isEnd() && HasToInt);
Tree.SetKind(DiffTree::Integer);
@ -1210,9 +1214,11 @@ class TemplateDiff {
}
/// GetInt - Retrieves the template integer argument, including evaluating
/// default arguments.
/// default arguments. If the value comes from an expression, extend the
/// APSInt to size of IntegerType to match the behavior in
/// Sema::CheckTemplateArgument
static bool GetInt(ASTContext &Context, const TSTiterator &Iter,
Expr *ArgExpr, llvm::APInt &Int) {
Expr *ArgExpr, llvm::APSInt &Int, QualType IntegerType) {
// Default, value-depenedent expressions require fetching
// from the desugared TemplateArgument, otherwise expression needs to
// be evaluatable.
@ -1224,12 +1230,14 @@ class TemplateDiff {
case TemplateArgument::Expression:
ArgExpr = Iter.getDesugar().getAsExpr();
Int = ArgExpr->EvaluateKnownConstInt(Context);
Int = Int.extOrTrunc(Context.getTypeSize(IntegerType));
return true;
default:
llvm_unreachable("Unexpected template argument kind");
}
} else if (ArgExpr->isEvaluatable(Context)) {
Int = ArgExpr->EvaluateKnownConstInt(Context);
Int = Int.extOrTrunc(Context.getTypeSize(IntegerType));
return true;
}
@ -1302,18 +1310,8 @@ class TemplateDiff {
return nullptr;
}
/// IsSameConvertedInt - Returns true if both integers are equal when
/// converted to an integer type with the given width.
static bool IsSameConvertedInt(unsigned Width, const llvm::APSInt &X,
const llvm::APSInt &Y) {
llvm::APInt ConvertedX = X.extOrTrunc(Width);
llvm::APInt ConvertedY = Y.extOrTrunc(Width);
return ConvertedX == ConvertedY;
}
/// IsEqualExpr - Returns true if the expressions evaluate to the same value.
static bool IsEqualExpr(ASTContext &Context, unsigned ParamWidth,
Expr *FromExpr, Expr *ToExpr) {
static bool IsEqualExpr(ASTContext &Context, Expr *FromExpr, Expr *ToExpr) {
if (FromExpr == ToExpr)
return true;
@ -1345,7 +1343,7 @@ class TemplateDiff {
switch (FromVal.getKind()) {
case APValue::Int:
return IsSameConvertedInt(ParamWidth, FromVal.getInt(), ToVal.getInt());
return FromVal.getInt() == ToVal.getInt();
case APValue::LValue: {
APValue::LValueBase FromBase = FromVal.getLValueBase();
APValue::LValueBase ToBase = ToVal.getLValueBase();
@ -1655,11 +1653,14 @@ class TemplateDiff {
}
Unbold();
}
/// HasExtraInfo - Returns true if E is not an integer literal or the
/// negation of an integer literal
bool HasExtraInfo(Expr *E) {
if (!E) return false;
E = E->IgnoreImpCasts();
if (isa<IntegerLiteral>(E)) return false;
if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E))

View File

@ -2674,7 +2674,8 @@ bool FunctionDecl::isMSExternInline() const {
if (!Context.getLangOpts().MSVCCompat && !hasAttr<DLLExportAttr>())
return false;
for (const FunctionDecl *FD = this; FD; FD = FD->getPreviousDecl())
for (const FunctionDecl *FD = getMostRecentDecl(); FD;
FD = FD->getPreviousDecl())
if (FD->getStorageClass() == SC_Extern)
return true;
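
Starting the walk at getMostRecentDecl() matters because an 'extern' storage class can be added by a later redeclaration; the old loop, starting at the declaration it was called on, could miss it. A toy model of the corrected chain walk (mock types, not the clang AST):

#include <cassert>

// Toy redeclaration chain: each decl points at the previous one, and every
// decl knows the newest redeclaration, as in clang's redeclarable chain.
struct FunctionDeclMock {
  bool IsExtern;
  FunctionDeclMock *Previous;
  FunctionDeclMock *MostRecent;
};

static bool isMSExternInlineMock(const FunctionDeclMock *FD) {
  // Fixed behavior: start at the most recent decl so that an 'extern' added
  // by a later redeclaration is still seen.
  for (const FunctionDeclMock *D = FD->MostRecent; D; D = D->Previous)
    if (D->IsExtern)
      return true;
  return false;
}

int main() {
  FunctionDeclMock First{/*IsExtern=*/false, nullptr, nullptr};
  FunctionDeclMock Second{/*IsExtern=*/true, &First, nullptr};
  First.MostRecent = Second.MostRecent = &Second;
  // Querying the first decl still finds the extern on the later redecl.
  assert(isMSExternInlineMock(&First));
  return 0;
}
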

View File

@ -2108,8 +2108,8 @@ void ThreadSafetyAnalyzer::runAnalysis(AnalysisDeclContext &AC) {
// Create a dummy expression,
VarDecl *VD = const_cast<VarDecl*>(AD.getVarDecl());
DeclRefExpr DRE(VD, false, VD->getType(), VK_LValue,
AD.getTriggerStmt()->getLocEnd());
DeclRefExpr DRE(VD, false, VD->getType().getNonReferenceType(),
VK_LValue, AD.getTriggerStmt()->getLocEnd());
LocksetBuilder.handleCall(&DRE, DD);
break;
}

View File

@ -2076,22 +2076,33 @@ bool SourceManager::isBeforeInTranslationUnit(SourceLocation LHS,
return IsBeforeInTUCache.getCachedResult(LOffs.second, ROffs.second);
}
// This can happen if a location is in a built-ins buffer.
// But see PR5662.
// If we arrived here, the location is either in a built-ins buffer or
// associated with global inline asm. PR5662 and PR22576 are examples.
// Clear the lookup cache, it depends on a common location.
IsBeforeInTUCache.clear();
bool LIsBuiltins = strcmp("<built-in>",
getBuffer(LOffs.first)->getBufferIdentifier()) == 0;
bool RIsBuiltins = strcmp("<built-in>",
getBuffer(ROffs.first)->getBufferIdentifier()) == 0;
// built-in is before non-built-in
if (LIsBuiltins != RIsBuiltins)
return LIsBuiltins;
assert(LIsBuiltins && RIsBuiltins &&
"Non-built-in locations must be rooted in the main file");
// Both are in built-in buffers, but from different files. We just claim that
// lower IDs come first.
return LOffs.first < ROffs.first;
llvm::MemoryBuffer *LBuf = getBuffer(LOffs.first);
llvm::MemoryBuffer *RBuf = getBuffer(ROffs.first);
bool LIsBuiltins = strcmp("<built-in>", LBuf->getBufferIdentifier()) == 0;
bool RIsBuiltins = strcmp("<built-in>", RBuf->getBufferIdentifier()) == 0;
// Sort built-in before non-built-in.
if (LIsBuiltins || RIsBuiltins) {
if (LIsBuiltins != RIsBuiltins)
return LIsBuiltins;
// Both are in built-in buffers, but from different files. We just claim that
// lower IDs come first.
return LOffs.first < ROffs.first;
}
bool LIsAsm = strcmp("<inline asm>", LBuf->getBufferIdentifier()) == 0;
bool RIsAsm = strcmp("<inline asm>", RBuf->getBufferIdentifier()) == 0;
// Sort assembler after built-ins, but before the rest.
if (LIsAsm || RIsAsm) {
if (LIsAsm != RIsAsm)
return RIsAsm;
assert(LOffs.first == ROffs.first);
return false;
}
llvm_unreachable("Unsortable locations found");
}
void SourceManager::PrintStats() const {
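
Per the comments in the hunk, locations with no common ancestry are now given a fixed ranking: <built-in> buffers sort first, <inline asm> buffers after built-ins, and anything else last, with ties between built-in buffers broken by FileID. A compact standalone restatement of that ranking as described by those comments (names only, not the real SourceManager types):

#include <cassert>
#include <string>

// Rank for buffers that cannot be ordered through a common parent:
// built-ins first, inline asm next, ordinary files last.
static int specialBufferRank(const std::string &BufferName) {
  if (BufferName == "<built-in>") return 0;
  if (BufferName == "<inline asm>") return 1;
  return 2;
}

static bool isBeforeAcrossBuffers(const std::string &L, const std::string &R) {
  return specialBufferRank(L) < specialBufferRank(R);
}

int main() {
  assert(isBeforeAcrossBuffers("<built-in>", "<inline asm>"));
  assert(!isBeforeAcrossBuffers("<inline asm>", "<built-in>"));
  assert(!isBeforeAcrossBuffers("t.c", "<built-in>"));
  return 0;
}
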

View File

@ -655,6 +655,7 @@ bool TargetCXXABI::tryParse(llvm::StringRef name) {
.Case("ios", iOS)
.Case("itanium", GenericItanium)
.Case("microsoft", Microsoft)
.Case("mips", GenericMIPS)
.Default(unknown);
if (kind == unknown) return false;

View File

@ -419,6 +419,7 @@ class NetBSDTargetInfo : public OSTargetInfo<Target> {
public:
NetBSDTargetInfo(const llvm::Triple &Triple) : OSTargetInfo<Target>(Triple) {
this->UserLabelPrefix = "";
this->MCountName = "_mcount";
}
};
@ -919,6 +920,10 @@ class PPCTargetInfo : public TargetInfo {
if (RegNo == 1) return 4;
return -1;
}
bool hasSjLjLowering() const override {
return true;
}
};
const Builtin::Info PPCTargetInfo::BuiltinInfo[] = {
@ -2181,6 +2186,10 @@ class X86TargetInfo : public TargetInfo {
CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
return MT == CCMT_Member ? CC_X86ThisCall : CC_C;
}
bool hasSjLjLowering() const override {
return true;
}
};
bool X86TargetInfo::setFPMath(StringRef Name) {
@ -3354,7 +3363,10 @@ class WindowsX86_32TargetInfo : public WindowsTargetInfo<X86_32TargetInfo> {
: WindowsTargetInfo<X86_32TargetInfo>(Triple) {
WCharType = UnsignedShort;
DoubleAlign = LongLongAlign = 64;
DescriptionString = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32";
bool IsWinCOFF =
getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
DescriptionString = IsWinCOFF ? "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
: "e-m:e-p:32:32-i64:64-f80:32-n8:16:32-S32";
}
void getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const override {
@ -5628,7 +5640,9 @@ class MipsTargetInfoBase : public TargetInfo {
const std::string &CPUStr)
: TargetInfo(Triple), CPU(CPUStr), IsMips16(false), IsMicromips(false),
IsNan2008(false), IsSingleFloat(false), FloatABI(HardFloat),
DspRev(NoDSP), HasMSA(false), HasFP64(false), ABI(ABIStr) {}
DspRev(NoDSP), HasMSA(false), HasFP64(false), ABI(ABIStr) {
TheCXXABI.set(TargetCXXABI::GenericMIPS);
}
bool isNaN2008Default() const {
return CPU == "mips32r6" || CPU == "mips64r6";
@ -6662,6 +6676,8 @@ static TargetInfo *AllocateTarget(const llvm::Triple &Triple) {
switch (os) {
case llvm::Triple::Linux:
return new LinuxTargetInfo<PPC64TargetInfo>(Triple);
case llvm::Triple::NetBSD:
return new NetBSDTargetInfo<PPC64TargetInfo>(Triple);
default:
return new PPC64TargetInfo(Triple);
}

View File

@ -36,7 +36,7 @@ std::string getClangRepositoryPath() {
// If the SVN_REPOSITORY is empty, try to use the SVN keyword. This helps us
// pick up a tag in an SVN export, for example.
StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_360/final/lib/Basic/Version.cpp $");
StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_361/final/lib/Basic/Version.cpp $");
if (URL.empty()) {
URL = SVNRepository.slice(SVNRepository.find(':'),
SVNRepository.find("/lib/Basic"));

View File

@ -64,6 +64,7 @@ static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
case TargetCXXABI::GenericARM:
case TargetCXXABI::iOS:
case TargetCXXABI::iOS64:
case TargetCXXABI::GenericMIPS:
case TargetCXXABI::GenericItanium:
return CreateItaniumCXXABI(CGM);
case TargetCXXABI::Microsoft:

View File

@ -339,6 +339,9 @@ CodeGen::CGCXXABI *CodeGen::CreateItaniumCXXABI(CodeGenModule &CGM) {
return new ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true,
/* UseARMGuardVarABI = */ true);
case TargetCXXABI::GenericMIPS:
return new ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true);
case TargetCXXABI::GenericItanium:
if (CGM.getContext().getTargetInfo().getTriple().getArch()
== llvm::Triple::le32) {

View File

@ -664,7 +664,6 @@ class X86_32TargetCodeGenInfo : public TargetCodeGenInfo {
('T' << 24);
return llvm::ConstantInt::get(CGM.Int32Ty, Sig);
}
};
}
@ -4455,7 +4454,6 @@ class ARMTargetCodeGenInfo : public TargetCodeGenInfo {
llvm::AttributeSet::FunctionIndex,
B));
}
};
}

Some files were not shown because too many files have changed in this diff.