Vendor import of llvm release_90 branch r369369:

https://llvm.org/svn/llvm-project/llvm/branches/release_90@369369
2019-08-20 21:35:15 +00:00 · 2019-08-20 21:35:15 +00:00 · 464f838b7b
commit 464f838b7b
parent e6d1592492
49 changed files with 833 additions and 337 deletions
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@ -949,7 +949,7 @@ template <typename DerivedT> class AAResultBase {

  /// A pointer to the AAResults object that this AAResult is
  /// aggregated within. May be null if not aggregated.
-  AAResults *AAR;
+  AAResults *AAR = nullptr;

  /// Helper to dispatch calls back through the derived type.
  DerivedT &derived() { return static_cast<DerivedT &>(*this); }
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@ -269,7 +269,13 @@ class SelectionDAG {

  using CallSiteInfo = MachineFunction::CallSiteInfo;
  using CallSiteInfoImpl = MachineFunction::CallSiteInfoImpl;
-  DenseMap<const SDNode *, CallSiteInfo> SDCallSiteInfo;
+
+  struct CallSiteDbgInfo {
+    CallSiteInfo CSInfo;
+    MDNode *HeapAllocSite = nullptr;
+  };
+
+  DenseMap<const SDNode *, CallSiteDbgInfo> SDCallSiteDbgInfo;

  uint16_t NextPersistentId = 0;

@ -1664,16 +1670,28 @@ public:
  }

  void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo) {
-    SDCallSiteInfo[CallNode] = std::move(CallInfo);
+    SDCallSiteDbgInfo[CallNode].CSInfo = std::move(CallInfo);
  }

  CallSiteInfo getSDCallSiteInfo(const SDNode *CallNode) {
-    auto I = SDCallSiteInfo.find(CallNode);
-    if (I != SDCallSiteInfo.end())
-      return std::move(I->second);
+    auto I = SDCallSiteDbgInfo.find(CallNode);
+    if (I != SDCallSiteDbgInfo.end())
+      return std::move(I->second).CSInfo;
    return CallSiteInfo();
  }

+  void addHeapAllocSite(const SDNode *Node, MDNode *MD) {
+    SDCallSiteDbgInfo[Node].HeapAllocSite = MD;
+  }
+
+  /// Return the HeapAllocSite type associated with the SDNode, if it exists.
+  MDNode *getHeapAllocSite(const SDNode *Node) {
+    auto It = SDCallSiteDbgInfo.find(Node);
+    if (It == SDCallSiteDbgInfo.end())
+      return nullptr;
+    return It->second.HeapAllocSite;
+  }
+
 private:
  void InsertNode(SDNode *N);
  bool RemoveNodeFromCSEMaps(SDNode *N);
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@ -3665,6 +3665,7 @@ public:
    C_Register,            // Constraint represents specific register(s).
    C_RegisterClass,       // Constraint represents any of register(s) in class.
    C_Memory,              // Memory constraint.
+    C_Immediate,           // Requires an immediate.
    C_Other,               // Something else.
    C_Unknown              // Unsupported constraint.
  };
--- a/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
+++ b/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
@ -16,6 +16,7 @@

 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/OrcV1Deprecation.h"
 #include <memory>

 namespace llvm {
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@ -112,6 +112,9 @@ namespace llvm {
    /// number of section symbols with the same name).
    StringMap<bool, BumpPtrAllocator &> UsedNames;

+    /// Keeps track of labels that are used in inline assembly.
+    SymbolTable InlineAsmUsedLabelNames;
+
    /// The next ID to dole out to an unnamed assembler temporary symbol with
    /// a given prefix.
    StringMap<unsigned> NextID;
@ -377,6 +380,16 @@ namespace llvm {
    /// APIs.
    const SymbolTable &getSymbols() const { return Symbols; }

+    /// isInlineAsmLabel - Return true if the name is a label referenced in
+    /// inline assembly.
+    MCSymbol *getInlineAsmLabel(StringRef Name) const {
+      return InlineAsmUsedLabelNames.lookup(Name);
+    }
+
+    /// registerInlineAsmLabel - Records that the name is a label referenced in
+    /// inline assembly.
+    void registerInlineAsmLabel(MCSymbol *Sym);
+
    /// @}

    /// \name Section Management
--- a/include/llvm/Support/AArch64TargetParser.def
+++ b/include/llvm/Support/AArch64TargetParser.def
@ -50,35 +50,35 @@ AARCH64_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a",
 #define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)
 #endif
 // FIXME: This would be nicer were it tablegen
-AARCH64_ARCH_EXT_NAME("invalid",   AArch64::AEK_INVALID,  nullptr,  nullptr)
-AARCH64_ARCH_EXT_NAME("none",      AArch64::AEK_NONE,     nullptr,  nullptr)
-AARCH64_ARCH_EXT_NAME("crc",       AArch64::AEK_CRC,      "+crc",   "-crc")
-AARCH64_ARCH_EXT_NAME("lse",       AArch64::AEK_LSE,      "+lse",   "-lse")
-AARCH64_ARCH_EXT_NAME("rdm",       AArch64::AEK_RDM,      "+rdm",   "-rdm")
-AARCH64_ARCH_EXT_NAME("crypto",    AArch64::AEK_CRYPTO,   "+crypto","-crypto")
-AARCH64_ARCH_EXT_NAME("sm4",       AArch64::AEK_SM4,      "+sm4",   "-sm4")
-AARCH64_ARCH_EXT_NAME("sha3",      AArch64::AEK_SHA3,     "+sha3",  "-sha3")
-AARCH64_ARCH_EXT_NAME("sha2",      AArch64::AEK_SHA2,     "+sha2",  "-sha2")
-AARCH64_ARCH_EXT_NAME("aes",       AArch64::AEK_AES,      "+aes",   "-aes")
-AARCH64_ARCH_EXT_NAME("dotprod",   AArch64::AEK_DOTPROD,  "+dotprod","-dotprod")
-AARCH64_ARCH_EXT_NAME("fp",        AArch64::AEK_FP,       "+fp-armv8",  "-fp-armv8")
-AARCH64_ARCH_EXT_NAME("simd",      AArch64::AEK_SIMD,     "+neon",  "-neon")
-AARCH64_ARCH_EXT_NAME("fp16",      AArch64::AEK_FP16,     "+fullfp16",  "-fullfp16")
-AARCH64_ARCH_EXT_NAME("fp16fml",   AArch64::AEK_FP16FML,  "+fp16fml", "-fp16fml")
-AARCH64_ARCH_EXT_NAME("profile",   AArch64::AEK_PROFILE,  "+spe",  "-spe")
-AARCH64_ARCH_EXT_NAME("ras",       AArch64::AEK_RAS,      "+ras",  "-ras")
-AARCH64_ARCH_EXT_NAME("sve",       AArch64::AEK_SVE,      "+sve",  "-sve")
-AARCH64_ARCH_EXT_NAME("sve2",      AArch64::AEK_SVE2,     "+sve2", "-sve2")
-AARCH64_ARCH_EXT_NAME("sve2-aes",  AArch64::AEK_SVE2AES,  "+sve2-aes", "-sve2-aes")
-AARCH64_ARCH_EXT_NAME("sve2-sm4",  AArch64::AEK_SVE2SM4,  "+sve2-sm4", "-sve2-sm4")
-AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3")
-AARCH64_ARCH_EXT_NAME("bitperm",   AArch64::AEK_BITPERM,  "+bitperm", "-bitperm")
-AARCH64_ARCH_EXT_NAME("rcpc",      AArch64::AEK_RCPC,     "+rcpc", "-rcpc")
-AARCH64_ARCH_EXT_NAME("rng",       AArch64::AEK_RAND,     "+rand",  "-rand")
-AARCH64_ARCH_EXT_NAME("memtag",    AArch64::AEK_MTE,      "+mte",   "-mte")
-AARCH64_ARCH_EXT_NAME("ssbs",      AArch64::AEK_SSBS,     "+ssbs",  "-ssbs")
-AARCH64_ARCH_EXT_NAME("sb",        AArch64::AEK_SB,       "+sb",    "-sb")
-AARCH64_ARCH_EXT_NAME("predres",   AArch64::AEK_PREDRES,  "+predres", "-predres")
+AARCH64_ARCH_EXT_NAME("invalid",      AArch64::AEK_INVALID,     nullptr,  nullptr)
+AARCH64_ARCH_EXT_NAME("none",         AArch64::AEK_NONE,        nullptr,  nullptr)
+AARCH64_ARCH_EXT_NAME("crc",          AArch64::AEK_CRC,         "+crc",   "-crc")
+AARCH64_ARCH_EXT_NAME("lse",          AArch64::AEK_LSE,         "+lse",   "-lse")
+AARCH64_ARCH_EXT_NAME("rdm",          AArch64::AEK_RDM,         "+rdm",   "-rdm")
+AARCH64_ARCH_EXT_NAME("crypto",       AArch64::AEK_CRYPTO,      "+crypto","-crypto")
+AARCH64_ARCH_EXT_NAME("sm4",          AArch64::AEK_SM4,         "+sm4",   "-sm4")
+AARCH64_ARCH_EXT_NAME("sha3",         AArch64::AEK_SHA3,        "+sha3",  "-sha3")
+AARCH64_ARCH_EXT_NAME("sha2",         AArch64::AEK_SHA2,        "+sha2",  "-sha2")
+AARCH64_ARCH_EXT_NAME("aes",          AArch64::AEK_AES,         "+aes",   "-aes")
+AARCH64_ARCH_EXT_NAME("dotprod",      AArch64::AEK_DOTPROD,     "+dotprod","-dotprod")
+AARCH64_ARCH_EXT_NAME("fp",           AArch64::AEK_FP,          "+fp-armv8",  "-fp-armv8")
+AARCH64_ARCH_EXT_NAME("simd",         AArch64::AEK_SIMD,        "+neon",  "-neon")
+AARCH64_ARCH_EXT_NAME("fp16",         AArch64::AEK_FP16,        "+fullfp16",  "-fullfp16")
+AARCH64_ARCH_EXT_NAME("fp16fml",      AArch64::AEK_FP16FML,     "+fp16fml", "-fp16fml")
+AARCH64_ARCH_EXT_NAME("profile",      AArch64::AEK_PROFILE,     "+spe",  "-spe")
+AARCH64_ARCH_EXT_NAME("ras",          AArch64::AEK_RAS,         "+ras",  "-ras")
+AARCH64_ARCH_EXT_NAME("sve",          AArch64::AEK_SVE,         "+sve",  "-sve")
+AARCH64_ARCH_EXT_NAME("sve2",         AArch64::AEK_SVE2,        "+sve2", "-sve2")
+AARCH64_ARCH_EXT_NAME("sve2-aes",     AArch64::AEK_SVE2AES,     "+sve2-aes", "-sve2-aes")
+AARCH64_ARCH_EXT_NAME("sve2-sm4",     AArch64::AEK_SVE2SM4,     "+sve2-sm4", "-sve2-sm4")
+AARCH64_ARCH_EXT_NAME("sve2-sha3",    AArch64::AEK_SVE2SHA3,    "+sve2-sha3", "-sve2-sha3")
+AARCH64_ARCH_EXT_NAME("sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm")
+AARCH64_ARCH_EXT_NAME("rcpc",         AArch64::AEK_RCPC,        "+rcpc", "-rcpc")
+AARCH64_ARCH_EXT_NAME("rng",          AArch64::AEK_RAND,        "+rand",  "-rand")
+AARCH64_ARCH_EXT_NAME("memtag",       AArch64::AEK_MTE,         "+mte",   "-mte")
+AARCH64_ARCH_EXT_NAME("ssbs",         AArch64::AEK_SSBS,        "+ssbs",  "-ssbs")
+AARCH64_ARCH_EXT_NAME("sb",           AArch64::AEK_SB,          "+sb",    "-sb")
+AARCH64_ARCH_EXT_NAME("predres",      AArch64::AEK_PREDRES,     "+predres", "-predres")
 #undef AARCH64_ARCH_EXT_NAME

 #ifndef AARCH64_CPU_NAME
--- a/include/llvm/Support/AArch64TargetParser.h
+++ b/include/llvm/Support/AArch64TargetParser.h
@ -53,7 +53,7 @@ enum ArchExtKind : unsigned {
  AEK_SVE2AES =     1 << 24,
  AEK_SVE2SM4 =     1 << 25,
  AEK_SVE2SHA3 =    1 << 26,
-  AEK_BITPERM =     1 << 27,
+  AEK_SVE2BITPERM = 1 << 27,
 };

 enum class ArchKind {
--- a/include/llvm/Support/ARMTargetParser.h
+++ b/include/llvm/Support/ARMTargetParser.h
@ -39,19 +39,13 @@ enum ArchExtKind : unsigned {
  AEK_DSP =         1 << 10,
  AEK_FP16 =        1 << 11,
  AEK_RAS =         1 << 12,
-  AEK_SVE =         1 << 13,
-  AEK_DOTPROD =     1 << 14,
-  AEK_SHA2    =     1 << 15,
-  AEK_AES     =     1 << 16,
-  AEK_FP16FML =     1 << 17,
-  AEK_SB      =     1 << 18,
-  AEK_SVE2 =        1 << 19,
-  AEK_SVE2AES =     1 << 20,
-  AEK_SVE2SM4 =     1 << 21,
-  AEK_SVE2SHA3 =    1 << 22,
-  AEK_BITPERM =     1 << 23,
-  AEK_FP_DP   =     1 << 24,
-  AEK_LOB     =     1 << 25,
+  AEK_DOTPROD =     1 << 13,
+  AEK_SHA2    =     1 << 14,
+  AEK_AES     =     1 << 15,
+  AEK_FP16FML =     1 << 16,
+  AEK_SB      =     1 << 17,
+  AEK_FP_DP   =     1 << 18,
+  AEK_LOB     =     1 << 19,
  // Unsupported extensions.
  AEK_OS = 0x8000000,
  AEK_IWMMXT = 0x10000000,
--- a/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@ -19,6 +19,7 @@

 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/IR/ValueHandle.h"
 #include <cstdint>

 namespace llvm {
@ -28,8 +29,8 @@ class Value;

 struct DivRemMapKey {
  bool SignedOp;
-  Value *Dividend;
-  Value *Divisor;
+  AssertingVH<Value> Dividend;
+  AssertingVH<Value> Divisor;

  DivRemMapKey(bool InSignedOp, Value *InDividend, Value *InDivisor)
      : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
@ -50,8 +51,10 @@ template <> struct DenseMapInfo<DivRemMapKey> {
  }

  static unsigned getHashValue(const DivRemMapKey &Val) {
-    return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
-                      reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+    return (unsigned)(reinterpret_cast<uintptr_t>(
+                          static_cast<Value *>(Val.Dividend)) ^
+                      reinterpret_cast<uintptr_t>(
+                          static_cast<Value *>(Val.Divisor))) ^
           (unsigned)Val.SignedOp;
  }
 };
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@ -432,6 +432,7 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
              const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
              MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
              Sym->print(OS, AP->MAI);
+              MMI->getContext().registerInlineAsmLabel(Sym);
            } else if (MI->getOperand(OpNo).isMBB()) {
              const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
              Sym->print(OS, AP->MAI);
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@ -1682,10 +1682,11 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
    TheUse = InsertedShift;
  }

-  // If we removed all uses, nuke the shift.
+  // If we removed all uses, or there are none, nuke the shift.
  if (ShiftI->use_empty()) {
    salvageDebugInfo(*ShiftI);
    ShiftI->eraseFromParent();
+    MadeChange = true;
  }

  return MadeChange;
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@ -691,9 +691,17 @@ void LiveDebugValues::insertTransferDebugPair(
           "No register supplied when handling a restore of a debug value");
    MachineFunction *MF = MI.getMF();
    DIBuilder DIB(*const_cast<Function &>(MF->getFunction()).getParent());
+
+    const DIExpression *NewExpr;
+    if (auto Fragment = DebugInstr->getDebugExpression()->getFragmentInfo())
+      NewExpr = *DIExpression::createFragmentExpression(DIB.createExpression(),
+        Fragment->OffsetInBits, Fragment->SizeInBits);
+    else
+      NewExpr = DIB.createExpression();
+
    NewDebugInstr =
        BuildMI(*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false,
-                NewReg, DebugInstr->getDebugVariable(), DIB.createExpression());
+                NewReg, DebugInstr->getDebugVariable(), NewExpr);
    VarLoc VL(*NewDebugInstr, LS);
    ProcessVarLoc(VL, NewDebugInstr);
    LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: ";
@ -848,9 +856,14 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
                      << "\n");
  }
  // Check if the register or spill location is the location of a debug value.
+  // FIXME: Don't create a spill transfer if there is a complex expression,
+  // because we currently cannot recover the original expression on restore.
  for (unsigned ID : OpenRanges.getVarLocs()) {
+    const MachineInstr *DebugInstr = &VarLocIDs[ID].MI;
+
    if (TKind == TransferKind::TransferSpill &&
-        VarLocIDs[ID].isDescribedByReg() == Reg) {
+        VarLocIDs[ID].isDescribedByReg() == Reg &&
+        !DebugInstr->getDebugExpression()->isComplex()) {
      LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
                        << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
    } else if (TKind == TransferKind::TransferRestore &&
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@ -21,6 +21,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@ -66,6 +67,7 @@ namespace {
    AliasAnalysis *AA;
    MachineDominatorTree *DT;
    MachineRegisterInfo *MRI;
+    MachineBlockFrequencyInfo *MBFI;

  public:
    static char ID; // Pass identification
@ -83,6 +85,8 @@ namespace {
      AU.addPreservedID(MachineLoopInfoID);
      AU.addRequired<MachineDominatorTree>();
      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineBlockFrequencyInfo>();
+      AU.addPreserved<MachineBlockFrequencyInfo>();
    }

    void releaseMemory() override {
@ -133,6 +137,11 @@ namespace {
    bool isPRECandidate(MachineInstr *MI);
    bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
    bool PerformSimplePRE(MachineDominatorTree *DT);
+    /// Heuristics to see if it's beneficial to move common computations of MBB
+    /// and MBB1 to CandidateBB.
+    bool isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
+                                MachineBasicBlock *MBB,
+                                MachineBasicBlock *MBB1);
  };

 } // end anonymous namespace
@ -802,6 +811,9 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
    if (!CMBB->isLegalToHoistInto())
      continue;

+    if (!isBeneficalToHoistInto(CMBB, MBB, MBB1))
+      continue;
+
    // Two instrs are partial redundant if their basic blocks are reachable
    // from one to another but one doesn't dominate another.
    if (CMBB != MBB1) {
@ -854,6 +866,18 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
  return Changed;
 }

+bool MachineCSE::isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
+                                        MachineBasicBlock *MBB,
+                                        MachineBasicBlock *MBB1) {
+  if (CandidateBB->getParent()->getFunction().hasMinSize())
+    return true;
+  assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB");
+  assert(DT->dominates(CandidateBB, MBB1) &&
+         "CandidateBB should dominate MBB1");
+  return MBFI->getBlockFreq(CandidateBB) <=
+         MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1);
+}
+
 bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
@ -863,6 +887,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  DT = &getAnalysis<MachineDominatorTree>();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
  LookAheadLimit = TII->getMachineCSELookAheadLimit();
  bool ChangedPRE, ChangedCSE;
  ChangedPRE = PerformSimplePRE(DT);
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@ -121,7 +121,7 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
  BBCallbacks.back().setMap(this);
  Entry.Index = BBCallbacks.size() - 1;
  Entry.Fn = BB->getParent();
-  Entry.Symbols.push_back(Context.createTempSymbol());
+  Entry.Symbols.push_back(Context.createTempSymbol(!BB->hasAddressTaken()));
  return Entry.Symbols;
 }

--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@ -909,6 +909,12 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
      // Remember the source order of the inserted instruction.
      if (HasDbg)
        ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn);
+
+      if (MDNode *MD = DAG->getHeapAllocSite(N)) {
+        if (NewInsn && NewInsn->isCall())
+          MF.addCodeViewHeapAllocSite(NewInsn, MD);
+      }
+
      GluedNodes.pop_back();
    }
    auto NewInsn =
@ -917,6 +923,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
    if (HasDbg)
      ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen,
                        NewInsn);
+    if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) {
+      if (NewInsn && NewInsn->isCall())
+        MF.addCodeViewHeapAllocSite(NewInsn, MD);
+    }
  }

  // Insert all the dbg_values which have not already been inserted in source
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -1084,6 +1084,7 @@ void SelectionDAG::clear() {
  ExternalSymbols.clear();
  TargetExternalSymbols.clear();
  MCSymbols.clear();
+  SDCallSiteDbgInfo.clear();
  std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
            static_cast<CondCodeSDNode*>(nullptr));
  std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -8021,6 +8021,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
    // Compute the constraint code and ConstraintType to use.
    TLI.ComputeConstraintToUse(T, SDValue());

+    if (T.ConstraintType == TargetLowering::C_Immediate &&
+        OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
+      // We've delayed emitting a diagnostic like the "n" constraint because
+      // inlining could cause an integer showing up.
+      return emitInlineAsmError(
+          CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an "
+                  "integer constant expression");
+
    ExtraInfo.update(T);
  }

@ -8105,7 +8113,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
    switch (OpInfo.Type) {
    case InlineAsm::isOutput:
      if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
-          (OpInfo.ConstraintType == TargetLowering::C_Other &&
+          ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+            OpInfo.ConstraintType == TargetLowering::C_Other) &&
           OpInfo.isIndirect)) {
        unsigned ConstraintID =
            TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
@ -8119,13 +8128,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
                                                        MVT::i32));
        AsmNodeOperands.push_back(OpInfo.CallOperand);
        break;
-      } else if ((OpInfo.ConstraintType == TargetLowering::C_Other &&
+      } else if (((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+                   OpInfo.ConstraintType == TargetLowering::C_Other) &&
                  !OpInfo.isIndirect) ||
                 OpInfo.ConstraintType == TargetLowering::C_Register ||
                 OpInfo.ConstraintType == TargetLowering::C_RegisterClass) {
        // Otherwise, this outputs to a register (directly for C_Register /
-        // C_RegisterClass, and a target-defined fashion for C_Other). Find a
-        // register that we can use.
+        // C_RegisterClass, and a target-defined fashion for
+        // C_Immediate/C_Other). Find a register that we can use.
        if (OpInfo.AssignedRegs.Regs.empty()) {
          emitInlineAsmError(
              CS, "couldn't allocate output register for constraint '" +
@ -8205,15 +8215,24 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
      }

      // Treat indirect 'X' constraint as memory.
-      if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+      if ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+           OpInfo.ConstraintType == TargetLowering::C_Other) &&
          OpInfo.isIndirect)
        OpInfo.ConstraintType = TargetLowering::C_Memory;

-      if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+      if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+          OpInfo.ConstraintType == TargetLowering::C_Other) {
        std::vector<SDValue> Ops;
        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
                                          Ops, DAG);
        if (Ops.empty()) {
+          if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
+            if (isa<ConstantSDNode>(InOperandVal)) {
+              emitInlineAsmError(CS, "value out of range for constraint '" +
+                                 Twine(OpInfo.ConstraintCode) + "'");
+              return;
+            }
+
          emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
                                     Twine(OpInfo.ConstraintCode) + "'");
          return;
@ -8250,7 +8269,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
      }

      assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
-              OpInfo.ConstraintType == TargetLowering::C_Register) &&
+              OpInfo.ConstraintType == TargetLowering::C_Register ||
+              OpInfo.ConstraintType == TargetLowering::C_Immediate) &&
             "Unknown constraint type!");

      // TODO: Support this.
@ -8356,6 +8376,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
        Val = OpInfo.AssignedRegs.getCopyFromRegs(
            DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction());
        break;
+      case TargetLowering::C_Immediate:
      case TargetLowering::C_Other:
        Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(),
                                              OpInfo, DAG);
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -3567,15 +3567,17 @@ TargetLowering::getConstraintType(StringRef Constraint) const {
  if (S == 1) {
    switch (Constraint[0]) {
    default: break;
-    case 'r': return C_RegisterClass;
+    case 'r':
+      return C_RegisterClass;
    case 'm': // memory
    case 'o': // offsetable
    case 'V': // not offsetable
      return C_Memory;
-    case 'i': // Simple Integer or Relocatable Constant
    case 'n': // Simple Integer
    case 'E': // Floating Point Constant
    case 'F': // Floating Point Constant
+      return C_Immediate;
+    case 'i': // Simple Integer or Relocatable Constant
    case 's': // Relocatable Constant
    case 'p': // Address.
    case 'X': // Allow ANY value.
@ -3950,6 +3952,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
 /// Return an integer indicating how general CT is.
 static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
  switch (CT) {
+  case TargetLowering::C_Immediate:
  case TargetLowering::C_Other:
  case TargetLowering::C_Unknown:
    return 0;
@ -4069,11 +4072,12 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
    TargetLowering::ConstraintType CType =
      TLI.getConstraintType(OpInfo.Codes[i]);

-    // If this is an 'other' constraint, see if the operand is valid for it.
-    // For example, on X86 we might have an 'rI' constraint.  If the operand
-    // is an integer in the range [0..31] we want to use I (saving a load
-    // of a register), otherwise we must use 'r'.
-    if (CType == TargetLowering::C_Other && Op.getNode()) {
+    // If this is an 'other' or 'immediate' constraint, see if the operand is
+    // valid for it. For example, on X86 we might have an 'rI' constraint. If
+    // the operand is an integer in the range [0..31] we want to use I (saving a
+    // load of a register), otherwise we must use 'r'.
+    if ((CType == TargetLowering::C_Other ||
+         CType == TargetLowering::C_Immediate) && Op.getNode()) {
      assert(OpInfo.Codes[i].size() == 1 &&
             "Unhandled multi-letter 'other' constraint");
      std::vector<SDValue> ResultOps;
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@ -61,6 +61,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
                     bool DoAutoReset)
    : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
      Symbols(Allocator), UsedNames(Allocator),
+      InlineAsmUsedLabelNames(Allocator),
      CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
      AutoReset(DoAutoReset) {
  SecureLogFile = AsSecureLogFileName;
@ -90,6 +91,7 @@ void MCContext::reset() {
  XCOFFAllocator.DestroyAll();

  MCSubtargetAllocator.DestroyAll();
+  InlineAsmUsedLabelNames.clear();
  UsedNames.clear();
  Symbols.clear();
  Allocator.Reset();
@ -272,6 +274,10 @@ void MCContext::setSymbolValue(MCStreamer &Streamer,
  Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this));
 }

+void MCContext::registerInlineAsmLabel(MCSymbol *Sym) {
+  InlineAsmUsedLabelNames[Sym->getName()] = Sym;
+}
+
 //===----------------------------------------------------------------------===//
 // Section Management
 //===----------------------------------------------------------------------===//
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@ -1142,7 +1142,9 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
      }
    }

-    MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
+    MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName);
+    if (!Sym)
+      Sym = getContext().getOrCreateSymbol(SymbolName);

    // If this is an absolute variable reference, substitute it now to preserve
    // semantics in the face of reassignment.
--- a/lib/Object/RelocationResolver.cpp
+++ b/lib/Object/RelocationResolver.cpp
@ -90,9 +90,9 @@ static bool supportsBPF(uint64_t Type) {
 static uint64_t resolveBPF(RelocationRef R, uint64_t S, uint64_t A) {
  switch (R.getType()) {
  case ELF::R_BPF_64_32:
-    return S & 0xFFFFFFFF;
+    return (S + A) & 0xFFFFFFFF;
  case ELF::R_BPF_64_64:
-    return S;
+    return S + A;
  default:
    llvm_unreachable("Invalid relocation type");
  }
--- a/lib/Support/AArch64TargetParser.cpp
+++ b/lib/Support/AArch64TargetParser.cpp
@ -96,8 +96,8 @@ bool AArch64::getExtensionFeatures(unsigned Extensions,
    Features.push_back("+sve2-sm4");
  if (Extensions & AEK_SVE2SHA3)
    Features.push_back("+sve2-sha3");
-  if (Extensions & AEK_BITPERM)
-    Features.push_back("+bitperm");
+  if (Extensions & AEK_SVE2BITPERM)
+    Features.push_back("+sve2-bitperm");
  if (Extensions & AEK_RCPC)
    Features.push_back("+rcpc");

--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@ -1200,7 +1200,7 @@ namespace fs {
 /// implementation.
 std::error_code copy_file(const Twine &From, const Twine &To) {
  uint32_t Flag = COPYFILE_DATA;
-#if __has_builtin(__builtin_available)
+#if __has_builtin(__builtin_available) && defined(COPYFILE_CLONE)
  if (__builtin_available(macos 10.12, *)) {
    bool IsSymlink;
    if (std::error_code Error = is_symlink_file(From, IsSymlink))
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@ -115,7 +115,7 @@ def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
 def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
  "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;

-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
  "Enable bit permutation SVE2 instructions", [FeatureSVE2]>;

 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -606,6 +606,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,

  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;

+  MaxLoadsPerMemcmpOptSize = 4;
+  MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
+                      ? MaxLoadsPerMemcmpOptSize : 8;
+
  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);
@ -5661,8 +5665,6 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
    switch (Constraint[0]) {
    default:
      break;
-    case 'z':
-      return C_Other;
    case 'x':
    case 'w':
      return C_RegisterClass;
@ -5670,6 +5672,16 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
    // currently handle addresses it is the same as 'r'.
    case 'Q':
      return C_Memory;
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'Y':
+    case 'Z':
+      return C_Immediate;
+    case 'z':
    case 'S': // A symbolic address
      return C_Other;
    }
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@ -116,7 +116,7 @@ def HasSVE2SM4       : Predicate<"Subtarget->hasSVE2SM4()">,
 def HasSVE2SHA3      : Predicate<"Subtarget->hasSVE2SHA3()">,
                                 AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
 def HasSVE2BitPerm   : Predicate<"Subtarget->hasSVE2BitPerm()">,
-                                 AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
+                                 AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
 def HasRCPC          : Predicate<"Subtarget->hasRCPC()">,
                                 AssemblerPredicate<"FeatureRCPC", "rcpc">;
 def HasAltNZCV       : Predicate<"Subtarget->hasAlternativeNZCV()">,
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@ -1164,6 +1164,13 @@ let Predicates = [HasSVE2] in {
  defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
  defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;

+  // SVE2 predicated shifts
+  defm SQSHL_ZPmI  : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
+  defm UQSHL_ZPmI  : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
+  defm SRSHR_ZPmI  : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
+  defm URSHR_ZPmI  : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
+  defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+
  // SVE2 integer add/subtract long
  defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
  defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
@ -1199,14 +1206,14 @@ let Predicates = [HasSVE2] in {
  defm PMULLT_ZZZ   : sve2_pmul_long<0b1, "pmullt">;

  // SVE2 bitwise shift and insert
-  defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
-  defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
+  defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
+  defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;

  // SVE2 bitwise shift right and accumulate
-  defm SSRA_ZZI  : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
-  defm USRA_ZZI  : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
-  defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
-  defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
+  defm SSRA_ZZI  : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
+  defm USRA_ZZI  : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
+  defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
+  defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;

  // SVE2 complex integer add
  defm CADD_ZZI   : sve2_int_cadd<0b0, "cadd">;
@ -1228,41 +1235,47 @@ let Predicates = [HasSVE2] in {
  defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
  defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;

-  // SVE2 bitwise shift right narrow
-  defm SQSHRUNB_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
-  defm SQSHRUNT_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
-  defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
-  defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
-  defm SHRNB_ZZI     : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
-  defm SHRNT_ZZI     : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
-  defm RSHRNB_ZZI    : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
-  defm RSHRNT_ZZI    : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
-  defm SQSHRNB_ZZI   : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
-  defm SQSHRNT_ZZI   : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
-  defm SQRSHRNB_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
-  defm SQRSHRNT_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
-  defm UQSHRNB_ZZI   : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
-  defm UQSHRNT_ZZI   : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
-  defm UQRSHRNB_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
-  defm UQRSHRNT_ZZI  : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
+  // SVE2 bitwise shift right narrow (bottom)
+  defm SQSHRUNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
+  defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
+  defm SHRNB_ZZI     : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
+  defm RSHRNB_ZZI    : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
+  defm SQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
+  defm SQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
+  defm UQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
+  defm UQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;

-  // SVE2 integer add/subtract narrow high part
-  defm ADDHNB_ZZZ  : sve2_int_addsub_narrow_high<0b000, "addhnb">;
-  defm ADDHNT_ZZZ  : sve2_int_addsub_narrow_high<0b001, "addhnt">;
-  defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
-  defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
-  defm SUBHNB_ZZZ  : sve2_int_addsub_narrow_high<0b100, "subhnb">;
-  defm SUBHNT_ZZZ  : sve2_int_addsub_narrow_high<0b101, "subhnt">;
-  defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
-  defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
+  // SVE2 bitwise shift right narrow (top)
+  defm SQSHRUNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
+  defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
+  defm SHRNT_ZZI     : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
+  defm RSHRNT_ZZI    : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
+  defm SQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
+  defm SQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
+  defm UQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
+  defm UQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;

-  // SVE2 saturating extract narrow
-  defm SQXTNB_ZZ  : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
-  defm SQXTNT_ZZ  : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
-  defm UQXTNB_ZZ  : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
-  defm UQXTNT_ZZ  : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
-  defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
-  defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
+  // SVE2 integer add/subtract narrow high part (bottom)
+  defm ADDHNB_ZZZ  : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
+  defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
+  defm SUBHNB_ZZZ  : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
+  defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;
+
+  // SVE2 integer add/subtract narrow high part (top)
+  defm ADDHNT_ZZZ  : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
+  defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
+  defm SUBHNT_ZZZ  : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
+  defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
+
+  // SVE2 saturating extract narrow (bottom)
+  defm SQXTNB_ZZ  : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
+  defm UQXTNB_ZZ  : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
+  defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
+
+  // SVE2 saturating extract narrow (top)
+  defm SQXTNT_ZZ  : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
+  defm UQXTNT_ZZ  : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
+  defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;

  // SVE2 character match
  defm MATCH_PPzZZ  : sve2_char_match<0b0, "match">;
@ -1289,10 +1302,14 @@ let Predicates = [HasSVE2] in {
  // SVE2 histogram generation (vector)
  defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;

+  // SVE2 floating-point base 2 logarithm as integer
+  defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+
  // SVE2 floating-point convert precision
  defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
  defm FCVTNT_ZPmZ  : sve2_fp_convert_down_narrow<"fcvtnt">;
  defm FCVTLT_ZPmZ  : sve2_fp_convert_up_long<"fcvtlt">;
+  def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;

  // SVE2 floating-point pairwise operations
  defm FADDP_ZPmZZ   : sve2_fp_pairwise_pred<0b000, "faddp">;
@ -1321,58 +1338,45 @@ let Predicates = [HasSVE2] in {
  def BSL2N_ZZZZ_D  : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
  def NBSL_ZZZZ_D   : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;

-  // sve_int_rotate_imm
+  // SVE2 bitwise xor and rotate right by immediate
  defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;

  // SVE2 extract vector (immediate offset, constructive)
  def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;

-  // SVE floating-point convert precision
-  def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
+  // SVE2 non-temporal gather loads
+  defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
+  defm LDNT1B_ZZR_S  : sve2_mem_gldnt_vs<0b00001, "ldnt1b",  Z_s, ZPR32>;
+  defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
+  defm LDNT1H_ZZR_S  : sve2_mem_gldnt_vs<0b00101, "ldnt1h",  Z_s, ZPR32>;
+  defm LDNT1W_ZZR_S  : sve2_mem_gldnt_vs<0b01001, "ldnt1w",  Z_s, ZPR32>;

-  // SVE floating-point convert to integer
-  defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
-
-  // Non-temporal contiguous loads (vector + register)
-  defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
-  defm LDNT1B_ZZR_S  : sve2_mem_cldnt_vs<0b00001, "ldnt1b",  Z_s, ZPR32>;
-  defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
-  defm LDNT1H_ZZR_S  : sve2_mem_cldnt_vs<0b00101, "ldnt1h",  Z_s, ZPR32>;
-  defm LDNT1W_ZZR_S  : sve2_mem_cldnt_vs<0b01001, "ldnt1w",  Z_s, ZPR32>;
-
-  defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
-  defm LDNT1B_ZZR_D  : sve2_mem_cldnt_vs<0b10010, "ldnt1b",  Z_d, ZPR64>;
-  defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
-  defm LDNT1H_ZZR_D  : sve2_mem_cldnt_vs<0b10110, "ldnt1h",  Z_d, ZPR64>;
-  defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
-  defm LDNT1W_ZZR_D  : sve2_mem_cldnt_vs<0b11010, "ldnt1w",  Z_d, ZPR64>;
-  defm LDNT1D_ZZR_D  : sve2_mem_cldnt_vs<0b11110, "ldnt1d",  Z_d, ZPR64>;
+  defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
+  defm LDNT1B_ZZR_D  : sve2_mem_gldnt_vs<0b10010, "ldnt1b",  Z_d, ZPR64>;
+  defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
+  defm LDNT1H_ZZR_D  : sve2_mem_gldnt_vs<0b10110, "ldnt1h",  Z_d, ZPR64>;
+  defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
+  defm LDNT1W_ZZR_D  : sve2_mem_gldnt_vs<0b11010, "ldnt1w",  Z_d, ZPR64>;
+  defm LDNT1D_ZZR_D  : sve2_mem_gldnt_vs<0b11110, "ldnt1d",  Z_d, ZPR64>;

  // SVE2 vector splice (constructive)
  defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;

-  // Predicated shifts
-  defm SQSHL_ZPmI  : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
-  defm UQSHL_ZPmI  : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
-  defm SRSHR_ZPmI  : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
-  defm URSHR_ZPmI  : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
-  defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+  // SVE2 non-temporal scatter stores
+  defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
+  defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
+  defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;

-  // Non-temporal contiguous stores (vector + register)
-  defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
-  defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
-  defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+  defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
+  defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
+  defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
+  defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;

-  defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
-  defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
-  defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
-  defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
-
-  // SVE table lookup (three sources)
+  // SVE2 table lookup (three sources)
  defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
  defm TBX_ZZZ  : sve2_int_perm_tbx<"tbx">;

-  // SVE integer compare scalar count and limit
+  // SVE2 integer compare scalar count and limit
  defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
  defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
  defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
@ -1383,7 +1387,7 @@ let Predicates = [HasSVE2] in {
  defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
  defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;

-  // SVE pointer conflict compare
+  // SVE2 pointer conflict compare
  defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
  defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
 }
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }

+AArch64TTIImpl::TTI::MemCmpExpansionOptions
+AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  Options.NumLoadsPerBlock = Options.MaxNumLoads;
+  // TODO: Though vector loads usually perform well on AArch64, in some targets
+  // they may wake up the FP unit, which raises the power consumption.  Perhaps
+  // they could be used with no holds barred (-O3).
+  Options.LoadSizes = {8, 4, 2, 1};
+  return Options;
+}
+
 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                    unsigned Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@ -130,6 +130,9 @@ public:
  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                         const Instruction *I = nullptr);

+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const;
+
  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                      unsigned AddressSpace, const Instruction *I = nullptr);

--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@ -2840,7 +2840,7 @@ static const struct Extension {
    {"sve2-aes", {AArch64::FeatureSVE2AES}},
    {"sve2-sm4", {AArch64::FeatureSVE2SM4}},
    {"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
-    {"bitperm", {AArch64::FeatureSVE2BitPerm}},
+    {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
    // FIXME: Unsupported extensions
    {"pan", {}},
    {"lor", {}},
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@ -403,12 +403,12 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
 }

 class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
-                      ZPRRegOp zprty>
-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
-  asm, "\t$Zdn, $Pg",
+                      ZPRRegOp zprty, PPRRegOp pprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
+  asm, "\t$Zdn, $Pm",
  "",
  []>, Sched<[]> {
-  bits<4> Pg;
+  bits<4> Pm;
  bits<5> Zdn;
  let Inst{31-24} = 0b00100101;
  let Inst{23-22} = sz8_64;
@ -416,7 +416,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
  let Inst{18-16} = opc{4-2};
  let Inst{15-11} = 0b10000;
  let Inst{10-9}  = opc{1-0};
-  let Inst{8-5}   = Pg;
+  let Inst{8-5}   = Pm;
  let Inst{4-0}   = Zdn;

  let Constraints = "$Zdn = $_Zdn";
@ -425,9 +425,16 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
 }

 multiclass sve_int_count_v<bits<5> opc, string asm> {
-  def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
-  def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
-  def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
+  def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
+  def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
+  def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
+
+  def : InstAlias<asm # "\t$Zdn, $Pm",
+                 (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
+  def : InstAlias<asm # "\t$Zdn, $Pm",
+                 (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
+  def : InstAlias<asm # "\t$Zdn, $Pm",
+                  (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
 }

 class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
@ -744,7 +751,7 @@ multiclass sve2_int_perm_tbl<string asm> {
 }

 class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
  asm, "\t$Zd, $Zn, $Zm",
  "",
  []>, Sched<[]> {
@ -758,6 +765,8 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
  let Inst{15-10} = 0b001011;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
 }

 multiclass sve2_int_perm_tbx<string asm> {
@ -1489,7 +1498,7 @@ multiclass sve_fp_fcadd<string asm> {

 class sve2_fp_convert_precision<bits<4> opc, string asm,
                                ZPRRegOp zprty1, ZPRRegOp zprty2>
-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
  asm, "\t$Zd, $Pg/m, $Zn",
  "",
  []>, Sched<[]> {
@ -1504,6 +1513,8 @@ class sve2_fp_convert_precision<bits<4> opc, string asm,
  let Inst{12-10} = Pg;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
 }

 multiclass sve2_fp_convert_down_narrow<string asm> {
@ -2399,21 +2410,40 @@ multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
  def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
 }

-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
-  let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
-    def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8,  ZPR8>;
-    def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
-    def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
-    def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
-  }
-}
-
 multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
  def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
  def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
  def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
 }

+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
+                                   ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-24} = 0b01000101;
+  let Inst{23-22} = sz;
+  let Inst{21}    = 0b0;
+  let Inst{20-16} = Zm;
+  let Inst{15-11} = 0b10010;
+  let Inst{10}    = opc;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+  let DestructiveInstType = Destructive;
+  let ElementSize = ElementSizeNone;
+}
+
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+  def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8,  ZPR8>;
+  def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
+  def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
+  def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+}
+
 class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
                                   ZPRRegOp zprty1, ZPRRegOp zprty2,
                                   Operand immtype>
@ -2451,9 +2481,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
 // SVE2 Accumulate Group
 //===----------------------------------------------------------------------===//

-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
-                                  ZPRRegOp zprty, Operand immtype>
-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
+                             ZPRRegOp zprty, Operand immtype>
+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
  asm, "\t$Zd, $Zn, $imm",
  "", []>, Sched<[]> {
  bits<5> Zd;
@ -2468,38 +2498,40 @@ class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
  let Inst{10}    = opc;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
 }

-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
-  def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+  def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+  def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+  def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+  def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
    let Inst{22}    = imm{5};
    let Inst{20-19} = imm{4-3};
  }
 }

-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+  def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
    let Inst{22}    = imm{5};
    let Inst{20-19} = imm{4-3};
  }
 }

-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
-                                        ZPRRegOp zprty, Operand immtype>
+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
+                                   ZPRRegOp zprty, Operand immtype>
 : I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
  asm, "\t$Zda, $Zn, $imm",
  "", []>, Sched<[]> {
@ -2521,15 +2553,15 @@ class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm
  let ElementSize = ElementSizeNone;
 }

-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
-  def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+  def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+  def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
    let Inst{22}    = imm{5};
    let Inst{20-19} = imm{4-3};
  }
@ -2607,9 +2639,9 @@ multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
 // SVE2 Narrowing Group
 //===----------------------------------------------------------------------===//

-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
-                                         string asm, ZPRRegOp zprty1,
-                                         ZPRRegOp zprty2, Operand immtype>
+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
+                                           string asm, ZPRRegOp zprty1,
+                                           ZPRRegOp zprty2, Operand immtype>
 : I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
  asm, "\t$Zd, $Zn, $imm",
  "", []>, Sched<[]> {
@ -2622,26 +2654,63 @@ class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
  let Inst{20-19} = tsz8_64{1-0};
  let Inst{18-16} = imm{2-0}; // imm3
  let Inst{15-14} = 0b00;
-  let Inst{13-10} = opc;
+  let Inst{13-11} = opc;
+  let Inst{10}    = 0b0;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
 }

-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
-  def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
-                                              vecshiftR8>;
-  def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
-                                              vecshiftR16> {
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+  def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
+                                                vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
+                                                vecshiftR16> {
    let Inst{19} = imm{3};
  }
-  def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
-                                              vecshiftR32> {
+  def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
+                                                vecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
 }

-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
-                                  ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
+                                        string asm, ZPRRegOp zprty1,
+                                        ZPRRegOp zprty2, Operand immtype>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
+  asm, "\t$Zd, $Zn, $imm",
+  "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> imm;
+  let Inst{31-23} = 0b010001010;
+  let Inst{22}    = tsz8_64{2};
+  let Inst{21}    = 0b1;
+  let Inst{20-19} = tsz8_64{1-0};
+  let Inst{18-16} = imm{2-0}; // imm3
+  let Inst{15-14} = 0b00;
+  let Inst{13-11} = opc;
+  let Inst{10}    = 0b1;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+  def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
+                                             vecshiftR8>;
+  def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
+                                             vecshiftR16> {
+    let Inst{19} = imm{3};
+  }
+  def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
+                                             vecshiftR32> {
+    let Inst{20-19} = imm{4-3};
+  }
+}
+
+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
+                                         ZPRRegOp zprty1, ZPRRegOp zprty2>
 : I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
  bits<5> Zd;
@ -2652,19 +2721,46 @@ class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
  let Inst{21}    = 0b1;
  let Inst{20-16} = Zm;
  let Inst{15-13} = 0b011;
-  let Inst{12-10} = opc; // S, R, T
+  let Inst{12-11} = opc; // S, R
+  let Inst{10}    = 0b0; // Top
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
 }

-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
-  def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
-  def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
-  def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
+  def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
 }

-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
-                                  ZPRRegOp zprty1, ZPRRegOp zprty2>
+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
+                                      ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
+  asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-24} = 0b01000101;
+  let Inst{23-22} = sz;
+  let Inst{21}    = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{15-13} = 0b011;
+  let Inst{12-11} = opc; // S, R
+  let Inst{10}    = 0b1; // Top
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
+  def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
+                                         ZPRRegOp zprty1, ZPRRegOp zprty2>
 : I<(outs zprty1:$Zd), (ins zprty2:$Zn),
  asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
  bits<5> Zd;
@ -2674,15 +2770,41 @@ class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
  let Inst{21}    = 0b1;
  let Inst{20-19} = tsz8_64{1-0};
  let Inst{18-13} = 0b000010;
-  let Inst{12-10} = opc;
+  let Inst{12-11} = opc;
+  let Inst{10}    = 0b0;
  let Inst{9-5}   = Zn;
  let Inst{4-0}   = Zd;
 }

-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
-  def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
-  def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
-  def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
+  def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
+}
+
+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
+                                      ZPRRegOp zprty1, ZPRRegOp zprty2>
+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
+  asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  let Inst{31-23} = 0b010001010;
+  let Inst{22}    = tsz8_64{2};
+  let Inst{21}    = 0b1;
+  let Inst{20-19} = tsz8_64{1-0};
+  let Inst{18-13} = 0b000010;
+  let Inst{12-11} = opc;
+  let Inst{10}    = 0b1;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+}
+
+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
+  def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
+  def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
+  def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
 }

 //===----------------------------------------------------------------------===//
@ -3886,9 +4008,9 @@ multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
                 (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
 }

-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
-                             RegisterOperand VecList>
-: I<(outs VecList:$Zt), iops,
+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
+                             RegisterOperand listty, ZPRRegOp zprty>
+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
  asm, "\t$Zt, $Pg, [$Zn, $Rm]",
  "",
  []>, Sched<[]> {
@ -3908,17 +4030,14 @@ class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
  let mayStore = 1;
 }

-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
                             RegisterOperand listty, ZPRRegOp zprty> {
-  def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
-                                     asm, listty>;
+  def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;

  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
                 (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
                 (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
-  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
-                 (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
                 (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
 }
@ -5094,7 +5213,7 @@ multiclass sve_mem_p_fill<string asm> {
                  (!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
 }

-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
                             RegisterOperand VecList>
 : I<(outs VecList:$Zt), iops,
  asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
@ -5119,17 +5238,15 @@ class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
  let mayLoad = 1;
 }

-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
                             RegisterOperand listty, ZPRRegOp zprty> {
-  def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
+  def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
                                     asm, listty>;

  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
                 (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
                 (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
-  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
-                 (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
                 (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
 }
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@ -14369,7 +14369,8 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
 /// constraint it is for this target.
 ARMTargetLowering::ConstraintType
 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
-  if (Constraint.size() == 1) {
+  unsigned S = Constraint.size();
+  if (S == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
@ -14377,12 +14378,12 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const {
    case 'h': return C_RegisterClass;
    case 'x': return C_RegisterClass;
    case 't': return C_RegisterClass;
-    case 'j': return C_Other; // Constant for movw.
-      // An address with a single base register. Due to the way we
-      // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'j': return C_Immediate; // Constant for movw.
+    // An address with a single base register. Due to the way we
+    // currently handle addresses it is the same as an 'r' memory constraint.
    case 'Q': return C_Memory;
    }
-  } else if (Constraint.size() == 2) {
+  } else if (S == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'T': return C_RegisterClass;
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@ -592,6 +592,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
                      [(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
                      Sched<[WriteBrTbl]> {
    let Size = 2;
+    let isNotDuplicable = 1;
    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
  }
 }
@ -1465,7 +1466,7 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 // Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
 // and make use of the same compressed jump table format as Thumb-2.
 let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
-    isIndirectBranch = 1 in {
+    isIndirectBranch = 1, isNotDuplicable = 1 in {
 def tTBB_JT : tPseudoInst<(outs),
        (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
         IIC_Br, []>, Sched<[WriteBr]>;
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@ -1689,6 +1689,8 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    // See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
    switch (Constraint[0]) {
+    default:
+      break;
    case 'a': // Simple upper registers
    case 'b': // Base pointer registers pairs
    case 'd': // Upper register
@ -1715,9 +1717,7 @@ AVRTargetLowering::getConstraintType(StringRef Constraint) const {
    case 'O': // Integer constant (Range: 8, 16, 24)
    case 'P': // Integer constant (Range: 1)
    case 'R': // Integer constant (Range: -6 to 5)x
-      return C_Other;
-    default:
-      break;
+      return C_Immediate;
    }
  }

--- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@ -116,9 +116,8 @@ private:
  void replaceWithGEP(std::vector<CallInst *> &CallList,
                      uint32_t NumOfZerosIndex, uint32_t DIIndex);

-  Value *computeBaseAndAccessStr(CallInst *Call, std::string &AccessStr,
-                                 std::string &AccessKey, uint32_t Kind,
-                                 MDNode *&TypeMeta);
+  Value *computeBaseAndAccessKey(CallInst *Call, std::string &AccessKey,
+                                 uint32_t Kind, MDNode *&TypeMeta);
  bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
  bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
 };
@ -340,8 +339,7 @@ bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
 /// Compute the base of the whole preserve_*_access_index chains, i.e., the base
 /// pointer of the first preserve_*_access_index call, and construct the access
 /// string, which will be the name of a global variable.
-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
-                                                        std::string &AccessStr,
+Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
                                                        std::string &AccessKey,
                                                        uint32_t Kind,
                                                        MDNode *&TypeMeta) {
@ -392,16 +390,16 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
  if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
    return nullptr;

-  // Construct the type string AccessStr.
+  // Construct the type string AccessKey.
  for (unsigned I = 0; I < AccessIndices.size(); ++I)
-    AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
+    AccessKey = std::to_string(AccessIndices[I]) + ":" + AccessKey;

  if (TypeNameIndex == AccessIndices.size() - 1)
-    AccessStr = "0:" + AccessStr;
+    AccessKey = "0:" + AccessKey;

  // Access key is the type name + access string, uniquely identifying
  // one kernel memory access.
-  AccessKey = LastTypeName + ":" + AccessStr;
+  AccessKey = LastTypeName + ":" + AccessKey;

  return Base;
 }
@ -410,10 +408,10 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
 /// transformation to a chain of relocable GEPs.
 bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
                                                uint32_t Kind) {
-  std::string AccessStr, AccessKey;
+  std::string AccessKey;
  MDNode *TypeMeta = nullptr;
  Value *Base =
-      computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
+      computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta);
  if (!Base)
    return false;

@ -432,7 +430,7 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,

  if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
    GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
-                            GlobalVariable::ExternalLinkage, NULL, AccessStr);
+                            GlobalVariable::ExternalLinkage, NULL, AccessKey);
    GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
    // Set the metadata (debuginfo types) for the global.
    if (TypeMeta)
--- a/lib/Target/BPF/BTFDebug.cpp
+++ b/lib/Target/BPF/BTFDebug.cpp
@ -30,6 +30,18 @@ static const char *BTFKindStr[] = {
 #include "BTF.def"
 };

+static const DIType * stripQualifiers(const DIType *Ty) {
+  while (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+    unsigned Tag = DTy->getTag();
+    if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
+        Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_restrict_type)
+      break;
+    Ty = DTy->getBaseType();
+  }
+
+  return Ty;
+}
+
 /// Emit a BTF common type.
 void BTFTypeBase::emitType(MCStreamer &OS) {
  OS.AddComment(std::string(BTFKindStr[Kind]) + "(id = " + std::to_string(Id) +
@ -184,9 +196,9 @@ void BTFTypeEnum::emitType(MCStreamer &OS) {
  }
 }

-BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize,
-                           uint32_t NumElems)
-    : ElemSize(ElemSize) {
+BTFTypeArray::BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
+                           uint32_t ElemSize, uint32_t NumElems)
+    : ElemTyNoQual(Ty), ElemSize(ElemSize) {
  Kind = BTF::BTF_KIND_ARRAY;
  BTFType.NameOff = 0;
  BTFType.Info = Kind << 24;
@ -207,6 +219,9 @@ void BTFTypeArray::completeType(BTFDebug &BDebug) {
  // created during initial type traversal. Just
  // retrieve that type id.
  ArrayInfo.IndexType = BDebug.getArrayIndexTypeId();
+
+  ElemTypeNoQual = ElemTyNoQual ? BDebug.getTypeId(ElemTyNoQual)
+                                : ArrayInfo.ElemType;
 }

 void BTFTypeArray::emitType(MCStreamer &OS) {
@ -218,7 +233,7 @@ void BTFTypeArray::emitType(MCStreamer &OS) {

 void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset,
                              uint32_t &ElementTypeId) {
-  ElementTypeId = ArrayInfo.ElemType;
+  ElementTypeId = ElemTypeNoQual;
  LocOffset = Loc * ElemSize;
 }

@ -251,7 +266,9 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
    } else {
      BTFMember.Offset = DDTy->getOffsetInBits();
    }
-    BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType());
+    const auto *BaseTy = DDTy->getBaseType();
+    BTFMember.Type = BDebug.getTypeId(BaseTy);
+    MemberTypeNoQual.push_back(BDebug.getTypeId(stripQualifiers(BaseTy)));
    Members.push_back(BTFMember);
  }
 }
@ -270,7 +287,7 @@ std::string BTFTypeStruct::getName() { return STy->getName(); }

 void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset,
                                  uint32_t &MemberType) {
-  MemberType = Members[Loc].Type;
+  MemberType = MemberTypeNoQual[Loc];
  MemberOffset =
      HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset;
 }
@ -492,10 +509,13 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
  uint32_t ElemTypeId, ElemSize;
  const DIType *ElemType = CTy->getBaseType();
  visitTypeEntry(ElemType, ElemTypeId, false, false);
+
+  // Strip qualifiers from element type to get accurate element size.
+  ElemType = stripQualifiers(ElemType);
  ElemSize = ElemType->getSizeInBits() >> 3;

  if (!CTy->getSizeInBits()) {
-    auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0);
+    auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemType, ElemTypeId, 0, 0);
    ArrayTypes.push_back(TypeEntry.get());
    ElemTypeId = addType(std::move(TypeEntry), CTy);
  } else {
@ -507,9 +527,11 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
          const DISubrange *SR = cast<DISubrange>(Element);
          auto *CI = SR->getCount().dyn_cast<ConstantInt *>();
          int64_t Count = CI->getSExtValue();
+          const DIType *ArrayElemTy = (I == 0) ? ElemType : nullptr;

          auto TypeEntry =
-              llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count);
+              llvm::make_unique<BTFTypeArray>(ArrayElemTy, ElemTypeId,
+                                              ElemSize, Count);
          ArrayTypes.push_back(TypeEntry.get());
          if (I == 0)
            ElemTypeId = addType(std::move(TypeEntry), CTy);
@ -1006,19 +1028,20 @@ void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
  unsigned RootId = populateStructType(RootTy);
  setTypeFromId(RootId, &PrevStructType, &PrevArrayType);
  unsigned RootTySize = PrevStructType->getStructSize();
+  StringRef IndexPattern = AccessPattern.substr(AccessPattern.find_first_of(':') + 1);

  BTFOffsetReloc OffsetReloc;
  OffsetReloc.Label = ORSym;
-  OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back());
+  OffsetReloc.OffsetNameOff = addString(IndexPattern.drop_back());
  OffsetReloc.TypeID = RootId;

  uint32_t Start = 0, End = 0, Offset = 0;
  bool FirstAccess = true;
-  for (auto C : AccessPattern) {
+  for (auto C : IndexPattern) {
    if (C != ':') {
      End++;
    } else {
-      std::string SubStr = AccessPattern.substr(Start, End - Start);
+      std::string SubStr = IndexPattern.substr(Start, End - Start);
      int Loc = std::stoi(SubStr);

      if (FirstAccess) {
@ -1038,12 +1061,15 @@ void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
        Offset += LocOffset;
        PrevArrayType = nullptr;
        setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType);
+      } else {
+        llvm_unreachable("Internal Error: BTF offset relocation type traversal error");
      }
+
      Start = End + 1;
      End = Start;
    }
  }
-  AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset;
+  AccessOffsets[AccessPattern.str()] = Offset;
  OffsetRelocTable[SecNameOff].push_back(OffsetReloc);
 }

@ -1227,7 +1253,7 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
        MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
        DIType *Ty = dyn_cast<DIType>(MDN);
        std::string TypeName = Ty->getName();
-        int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()];
+        int64_t Imm = AccessOffsets[GVar->getName().str()];

        // Emit "mov ri, <imm>" for abstract member accesses.
        OutMI.setOpcode(BPF::MOV_ri);
--- a/lib/Target/BPF/BTFDebug.h
+++ b/lib/Target/BPF/BTFDebug.h
@ -104,11 +104,14 @@ public:

 /// Handle array type.
 class BTFTypeArray : public BTFTypeBase {
+  const DIType *ElemTyNoQual;
  uint32_t ElemSize;
  struct BTF::BTFArray ArrayInfo;
+  uint32_t ElemTypeNoQual;

 public:
-  BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems);
+  BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
+               uint32_t ElemSize, uint32_t NumElems);
  uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
  void completeType(BTFDebug &BDebug);
  void emitType(MCStreamer &OS);
@ -120,6 +123,7 @@ class BTFTypeStruct : public BTFTypeBase {
  const DICompositeType *STy;
  bool HasBitField;
  std::vector<struct BTF::BTFMember> Members;
+  std::vector<uint32_t> MemberTypeNoQual;

 public:
  BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField,
--- a/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@ -1208,6 +1208,24 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
    Res = V;
  } else
    Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+  MCBinaryExpr::Opcode Opcode;
+  switch (getLexer().getKind()) {
+  default:
+    Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+    return MatchOperand_Success;
+  case AsmToken::Plus:
+    Opcode = MCBinaryExpr::Add;
+    break;
+  case AsmToken::Minus:
+    Opcode = MCBinaryExpr::Sub;
+    break;
+  }
+
+  const MCExpr *Expr;
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
+  Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
  Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
  return MatchOperand_Success;
 }
--- a/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/lib/Target/RISCV/RISCVFrameLowering.cpp
@ -40,8 +40,16 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
  uint64_t FrameSize = MFI.getStackSize();

  // Get the alignment.
-  uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
-                                                      : getStackAlignment();
+  unsigned StackAlign = getStackAlignment();
+  if (RI->needsStackRealignment(MF)) {
+    unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment());
+    FrameSize += (MaxStackAlign - StackAlign);
+    StackAlign = MaxStackAlign;
+  }
+
+  // Set Max Call Frame Size
+  uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
+  MFI.setMaxCallFrameSize(MaxCallSize);

  // Make sure the frame is aligned.
  FrameSize = alignTo(FrameSize, StackAlign);
@ -101,6 +109,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
  const RISCVInstrInfo *TII = STI.getInstrInfo();
  MachineBasicBlock::iterator MBBI = MBB.begin();

+  if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) {
+    report_fatal_error(
+        "RISC-V backend can't currently handle functions that need stack "
+        "realignment and have variable sized objects");
+  }
+
  unsigned FPReg = getFPReg(STI);
  unsigned SPReg = getSPReg(STI);

@ -158,6 +172,29 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
        nullptr, RI->getDwarfRegNum(FPReg, true), 0));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
+
+    // Realign Stack
+    const RISCVRegisterInfo *RI = STI.getRegisterInfo();
+    if (RI->needsStackRealignment(MF)) {
+      unsigned MaxAlignment = MFI.getMaxAlignment();
+
+      const RISCVInstrInfo *TII = STI.getInstrInfo();
+      if (isInt<12>(-(int)MaxAlignment)) {
+        BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
+            .addReg(SPReg)
+            .addImm(-(int)MaxAlignment);
+      } else {
+        unsigned ShiftAmount = countTrailingZeros(MaxAlignment);
+        unsigned VR =
+            MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+        BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
+            .addReg(SPReg)
+            .addImm(ShiftAmount);
+        BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg)
+            .addReg(VR)
+            .addImm(ShiftAmount);
+      }
+    }
  }
 }

@ -257,6 +294,13 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
  if (FI >= MinCSFI && FI <= MaxCSFI) {
    FrameReg = RISCV::X2;
    Offset += MF.getFrameInfo().getStackSize();
+  } else if (RI->needsStackRealignment(MF)) {
+    assert(!MFI.hasVarSizedObjects() &&
+           "Unexpected combination of stack realignment and varsized objects");
+    // If the stack was realigned, the frame pointer is set in order to allow
+    // SP to be restored, but we still access stack objects using SP.
+    FrameReg = RISCV::X2;
+    Offset += MF.getFrameInfo().getStackSize();
  } else {
    FrameReg = RI->getFrameRegister(MF);
    if (hasFP(MF))
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@ -1007,12 +1007,14 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift(
      // We can materialise `c1 << c2` into an add immediate, so it's "free",
      // and the combine should happen, to potentially allow further combines
      // later.
-      if (isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
+      if (ShiftedC1Int.getMinSignedBits() <= 64 &&
+          isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
        return true;

      // We can materialise `c1` in an add immediate, so it's "free", and the
      // combine should be prevented.
-      if (isLegalAddImmediate(C1Int.getSExtValue()))
+      if (C1Int.getMinSignedBits() <= 64 &&
+          isLegalAddImmediate(C1Int.getSExtValue()))
        return false;

      // Neither constant will fit into an immediate, so find materialisation
@ -2397,6 +2399,25 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
  return nullptr;
 }

+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+RISCVTargetLowering::ConstraintType
+RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'f':
+      return C_RegisterClass;
+    case 'I':
+    case 'J':
+    case 'K':
+      return C_Immediate;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
@ -2407,6 +2428,12 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
    switch (Constraint[0]) {
    case 'r':
      return std::make_pair(0U, &RISCV::GPRRegClass);
+    case 'f':
+      if (Subtarget.hasStdExtF() && VT == MVT::f32)
+        return std::make_pair(0U, &RISCV::FPR32RegClass);
+      if (Subtarget.hasStdExtD() && VT == MVT::f64)
+        return std::make_pair(0U, &RISCV::FPR64RegClass);
+      break;
    default:
      break;
    }
--- a/lib/Target/RISCV/RISCVISelLowering.h
+++ b/lib/Target/RISCV/RISCVISelLowering.h
@ -92,6 +92,7 @@ public:
  // This method returns the name of a target specific DAG node.
  const char *getTargetNodeName(unsigned Opcode) const override;

+  ConstraintType getConstraintType(StringRef Constraint) const override;
  std::pair<unsigned, const TargetRegisterClass *>
  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                               StringRef Constraint, MVT VT) const override;
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@ -3183,7 +3183,7 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
    case 'e':
      return C_RegisterClass;
    case 'I': // SIMM13
-      return C_Other;
+      return C_Immediate;
    }
  }

--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@ -956,7 +956,7 @@ SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
    case 'K': // Signed 16-bit constant
    case 'L': // Signed 20-bit displacement (on all targets we support)
    case 'M': // 0x7fffffff
-      return C_Other;
+      return C_Immediate;

    default:
      break;
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@ -95,7 +95,8 @@ def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
 def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
                                      "Support 64-bit instructions">;
 def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
-                                      "64-bit with cmpxchg16b">;
+                                      "64-bit with cmpxchg16b",
+                                      [FeatureCMPXCHG8B]>;
 def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                       "SHLD instruction is slow">;
 def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -2464,6 +2464,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
      Complexity += 2;
  }

+  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+  // duplicating flag-producing instructions later in the pipeline.
+  if (N.getOpcode() == ISD::ADD) {
+    auto isMathWithFlags = [](SDValue V) {
+      switch (V.getOpcode()) {
+      case X86ISD::ADD:
+      case X86ISD::SUB:
+      case X86ISD::ADC:
+      case X86ISD::SBB:
+      /* TODO: These opcodes can be added safely, but we may want to justify
+               their inclusion for different reasons (better for reg-alloc).
+      case X86ISD::SMUL:
+      case X86ISD::UMUL:
+      case X86ISD::OR:
+      case X86ISD::XOR:
+      case X86ISD::AND:
+      */
+        // Value 1 is the flag output of the node - verify it's not dead.
+        return !SDValue(V.getNode(), 1).use_empty();
+      default:
+        return false;
+      }
+    };
+    // TODO: This could be an 'or' rather than 'and' to make the transform more
+    //       likely to happen. We might want to factor in whether there's a
+    //       load folding opportunity for the math op that disappears with LEA.
+    if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+      Complexity++;
+  }
+
  if (AM.Disp)
    Complexity++;

@ -3302,8 +3333,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
-  NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
-                                        NBits);
+
+  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
+  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
+  NBits = SDValue(
+      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
+                             NBits, SRIdxVal), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  if (Subtarget->hasBMI2()) {
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -4069,6 +4069,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  InFlag = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

+  // Save heapallocsite metadata.
+  if (CLI.CS)
+    if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+      DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
@ -5500,6 +5505,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
        Idx == (VT.getVectorNumElements() / 2) &&
        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        Src.getOperand(1).getValueType() == SubVT &&
        isNullConstant(Src.getOperand(2))) {
      Ops.push_back(Src.getOperand(1));
      Ops.push_back(Sub);
@ -34062,25 +34068,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
      return true;
    break;
  }
-  case X86ISD::SUBV_BROADCAST: {
-    // Reduce size of broadcast if we don't need the upper half.
-    unsigned HalfElts = NumElts / 2;
-    if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
-      SDValue Src = Op.getOperand(0);
-      MVT SrcVT = Src.getSimpleValueType();
-
-      SDValue Half = Src;
-      if (SrcVT.getVectorNumElements() != HalfElts) {
-        MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
-        Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
-      }
-
-      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
-                                               TLO.DAG, SDLoc(Op),
-                                               Half.getValueSizeInBits()));
-    }
-    break;
-  }
  case X86ISD::VPERMV: {
    SDValue Mask = Op.getOperand(0);
    APInt MaskUndef, MaskZero;
@ -34134,6 +34121,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
      SDValue Insert =
          insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
      return TLO.CombineTo(Op, Insert);
+    }
+      // Subvector broadcast.
+    case X86ISD::SUBV_BROADCAST: {
+      SDLoc DL(Op);
+      SDValue Src = Op.getOperand(0);
+      if (Src.getValueSizeInBits() > ExtSizeInBits)
+        Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
+      else if (Src.getValueSizeInBits() < ExtSizeInBits) {
+        MVT SrcSVT = Src.getSimpleValueType().getScalarType();
+        MVT SrcVT =
+            MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
+        Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
+      }
+      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
+                                               TLO.DAG, DL, ExtSizeInBits));
    }
      // Byte shifts by immediate.
    case X86ISD::VSHLDQ:
@ -43839,6 +43841,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
      isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
+      Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() &&
      Vec.hasOneUse()) {
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
                      Vec.getOperand(1), Vec.getOperand(2));
@ -44660,10 +44663,11 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
    case 'I':
    case 'J':
    case 'K':
-    case 'L':
-    case 'M':
    case 'N':
    case 'G':
+    case 'L':
+    case 'M':
+      return C_Immediate;
    case 'C':
    case 'e':
    case 'Z':
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@ -3288,26 +3288,35 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,

  // Look for an 'and' of two (opposite) logical shifts.
  // Pick the single-use shift as XShift.
-  Value *XShift, *YShift;
+  Instruction *XShift, *YShift;
  if (!match(I.getOperand(0),
-             m_c_And(m_OneUse(m_CombineAnd(m_AnyLogicalShift, m_Value(XShift))),
-                     m_CombineAnd(m_AnyLogicalShift, m_Value(YShift)))))
+             m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
+                     m_CombineAnd(m_AnyLogicalShift, m_Instruction(YShift)))))
    return nullptr;

-  // If YShift is a single-use 'lshr', swap the shifts around.
-  if (match(YShift, m_OneUse(m_AnyLShr)))
+  // If YShift is a 'lshr', swap the shifts around.
+  if (match(YShift, m_AnyLShr))
    std::swap(XShift, YShift);

  // The shifts must be in opposite directions.
-  Instruction::BinaryOps XShiftOpcode =
-      cast<BinaryOperator>(XShift)->getOpcode();
-  if (XShiftOpcode == cast<BinaryOperator>(YShift)->getOpcode())
+  auto XShiftOpcode = XShift->getOpcode();
+  if (XShiftOpcode == YShift->getOpcode())
    return nullptr; // Do not care about same-direction shifts here.

  Value *X, *XShAmt, *Y, *YShAmt;
  match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt)));
  match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt)));

+  // If one of the values being shifted is a constant, then we will end with
+  // and+icmp, and shift instr will be constant-folded. If they are not,
+  // however, we will need to ensure that we won't increase instruction count.
+  if (!isa<Constant>(X) && !isa<Constant>(Y)) {
+    // At least one of the hands of the 'and' should be one-use shift.
+    if (!match(I.getOperand(0),
+               m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
+      return nullptr;
+  }
+
  // Can we fold (XShAmt+YShAmt) ?
  Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt,
                                  SQ.getWithInstruction(&I));
--- a/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/lib/Transforms/Scalar/DivRemPairs.cpp
@ -23,6 +23,7 @@
 #include "llvm/Support/DebugCounter.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
 using namespace llvm;

 #define DEBUG_TYPE "div-rem-pairs"
@ -32,24 +33,44 @@ STATISTIC(NumDecomposed, "Number of instructions decomposed");
 DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
              "Controls transformations in div-rem-pairs pass");

-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). If they exist in different basic blocks, bring
-/// them together by hoisting or replace the common division operation that is
-/// implicit in the remainder:
-/// X % Y <--> X - ((X / Y) * Y).
-///
-/// We can largely ignore the normal safety and cost constraints on speculation
-/// of these ops when we find a matching pair. This is because we are already
-/// guaranteed that any exceptions and most cost are already incurred by the
-/// first member of the pair.
-///
-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
-/// SimplifyCFG, but it's split off on its own because it's different enough
-/// that it doesn't quite match the stated objectives of those passes.
-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
-                           const DominatorTree &DT) {
-  bool Changed = false;
+/// A thin wrapper to store two values that we matched as div-rem pair.
+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
+struct DivRemPairWorklistEntry {
+  /// The actual udiv/sdiv instruction. Source of truth.
+  AssertingVH<Instruction> DivInst;

+  /// The instruction that we have matched as a remainder instruction.
+  /// Should only be used as Value, don't introspect it.
+  AssertingVH<Instruction> RemInst;
+
+  DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
+      : DivInst(DivInst_), RemInst(RemInst_) {
+    assert((DivInst->getOpcode() == Instruction::UDiv ||
+            DivInst->getOpcode() == Instruction::SDiv) &&
+           "Not a division.");
+    assert(DivInst->getType() == RemInst->getType() && "Types should match.");
+    // We can't check anything else about remainder instruction,
+    // it's not strictly required to be a urem/srem.
+  }
+
+  /// The type for this pair, identical for both the div and rem.
+  Type *getType() const { return DivInst->getType(); }
+
+  /// Is this pair signed or unsigned?
+  bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
+
+  /// In this pair, what are the divident and divisor?
+  Value *getDividend() const { return DivInst->getOperand(0); }
+  Value *getDivisor() const { return DivInst->getOperand(1); }
+};
+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). Place those pairs into a worklist for further
+/// processing. This indirection is needed because we have to use TrackingVH<>
+/// because we will be doing RAUW, and if one of the rem instructions we change
+/// happens to be an input to another div/rem in the maps, we'd have problems.
+static DivRemWorklistTy getWorklist(Function &F) {
  // Insert all divide and remainder instructions into maps keyed by their
  // operands and opcode (signed or unsigned).
  DenseMap<DivRemMapKey, Instruction *> DivMap;
@ -69,6 +90,9 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
    }
  }

+  // We'll accumulate the matching pairs of div-rem instructions here.
+  DivRemWorklistTy Worklist;
+
  // We can iterate over either map because we are only looking for matched
  // pairs. Choose remainders for efficiency because they are usually even more
  // rare than division.
@ -78,12 +102,45 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
    if (!DivInst)
      continue;

-    // We have a matching pair of div/rem instructions. If one dominates the
-    // other, hoist and/or replace one.
+    // We have a matching pair of div/rem instructions.
    NumPairs++;
    Instruction *RemInst = RemPair.second;
-    bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
-    bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);
+
+    // Place it in the worklist.
+    Worklist.emplace_back(DivInst, RemInst);
+  }
+
+  return Worklist;
+}
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting or replace the common division operation that is
+/// implicit in the remainder:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+                           const DominatorTree &DT) {
+  bool Changed = false;
+
+  // Get the matching pairs of div-rem instructions. We want this extra
+  // indirection to avoid dealing with having to RAUW the keys of the maps.
+  DivRemWorklistTy Worklist = getWorklist(F);
+
+  // Process each entry in the worklist.
+  for (DivRemPairWorklistEntry &E : Worklist) {
+    bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
+
+    auto &DivInst = E.DivInst;
+    auto &RemInst = E.RemInst;

    // If the target supports div+rem and the instructions are in the same block
    // already, there's nothing to do. The backend should handle this. If the
@ -110,8 +167,8 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
      // The target does not have a single div/rem operation. Decompose the
      // remainder calculation as:
      // X % Y --> X - ((X / Y) * Y).
-      Value *X = RemInst->getOperand(0);
-      Value *Y = RemInst->getOperand(1);
+      Value *X = E.getDividend();
+      Value *Y = E.getDivisor();
      Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
      Instruction *Sub = BinaryOperator::CreateSub(X, Mul);

@ -152,8 +209,13 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,

      // Now kill the explicit remainder. We have replaced it with:
      // (sub X, (mul (div X, Y), Y)
-      RemInst->replaceAllUsesWith(Sub);
-      RemInst->eraseFromParent();
+      Sub->setName(RemInst->getName() + ".decomposed");
+      Instruction *OrigRemInst = RemInst;
+      // Update AssertingVH<> with new instruction so it doesn't assert.
+      RemInst = Sub;
+      // And replace the original instruction with the new one.
+      OrigRemInst->replaceAllUsesWith(Sub);
+      OrigRemInst->eraseFromParent();
      NumDecomposed++;
    }
    Changed = true;
@ -188,7 +250,7 @@ struct DivRemPairsLegacyPass : public FunctionPass {
    return optimizeDivRem(F, TTI, DT);
  }
 };
-}
+} // namespace

 char DivRemPairsLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
--- a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@ -777,8 +777,10 @@ static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
    // speculation if the predecessor is an invoke. This doesn't seem
    // fundamental and we should probably be splitting critical edges
    // differently.
-    if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
-        isa<InvokeInst>(PredBB->getTerminator())) {
+    const auto *TermInst = PredBB->getTerminator();
+    if (isa<IndirectBrInst>(TermInst) ||
+        isa<InvokeInst>(TermInst) ||
+        isa<CallBrInst>(TermInst)) {
      LLVM_DEBUG(dbgs() << "  Invalid: predecessor terminator: "
                        << PredBB->getName() << "\n");
      return false;