From fa40418fea35c68de2a358bce3539cdc5cbcd21a Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Thu, 29 Jul 2021 22:31:35 +0200 Subject: [PATCH 1/3] Vendor import of llvm-project branch release/13.x llvmorg-13-init-16854-g6b2e4c5a58d7. --- clang/lib/Format/WhitespaceManager.cpp | 2 +- .../Utils/ScalarEvolutionExpander.h | 5 - .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +- .../Target/AArch64/AArch64ISelLowering.cpp | 6 + .../Transforms/Scalar/LoopStrengthReduce.cpp | 482 +++--------------- .../Utils/ScalarEvolutionExpander.cpp | 5 +- 6 files changed, 87 insertions(+), 415 deletions(-) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index ca2222d1feff..a822e0aaf1f9 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -347,7 +347,7 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, if (ScopeStart > Start + 1 && Changes[ScopeStart - 2].Tok->is(tok::identifier) && Changes[ScopeStart - 1].Tok->is(tok::l_paren)) - return true; + return Style.BinPackArguments; // Ternary operator if (Changes[i].Tok->is(TT_ConditionalExpr)) diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 59bf3a342caa..8662dbf385dc 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -83,9 +83,6 @@ class SCEVExpander : public SCEVVisitor { /// InsertedValues/InsertedPostIncValues. SmallPtrSet ReusedValues; - // The induction variables generated. - SmallVector InsertedIVs; - /// A memoization of the "relevant" loop for a given SCEV. DenseMap RelevantLoops; @@ -202,11 +199,9 @@ public: InsertedPostIncValues.clear(); ReusedValues.clear(); ChainedPhis.clear(); - InsertedIVs.clear(); } ScalarEvolution *getSE() { return &SE; } - const SmallVectorImpl &getInsertedIVs() const { return InsertedIVs; } /// Return a vector containing all instructions inserted during expansion. SmallVector getAllInsertedInstructions() const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 1415cce3b1df..09627ee6a164 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1660,7 +1660,7 @@ static bool MIIsInTerminatorSequence(const MachineInstr &MI) { // physical registers if there is debug info associated with the terminator // of our mbb. We want to include said debug info in our terminator // sequence, so we return true in that case. 
-    return MI.isDebugValue();
+    return MI.isDebugInstr();
 
   // We have left the terminator sequence if we are not doing one of the
   // following:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e7282aad05e2..ae702eedcd66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4348,6 +4348,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
     IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
     MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
     InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
+    Mask = DAG.getNode(
+        ISD::ZERO_EXTEND, DL,
+        VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
   }
 
   if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
@@ -4452,6 +4455,9 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
         ISD::ANY_EXTEND, DL,
         VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
     StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
+    Mask = DAG.getNode(
+        ISD::ZERO_EXTEND, DL,
+        VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
   } else if (VT.isFloatingPoint()) {
     // Handle FP data by casting the data so an integer scatter can be used.
     EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 5f210380ae5a..b585818af595 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1981,9 +1981,6 @@ class LSRInstance {
   /// IV users that belong to profitable IVChains.
   SmallPtrSet<Use*, MaxChains> IVIncSet;
 
-  /// Induction variables that were generated and inserted by the SCEV Expander.
-  SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
-
   void OptimizeShadowIV();
   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
@@ -2088,9 +2085,6 @@ public:
               TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
 
   bool getChanged() const { return Changed; }
-  const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
-    return ScalarEvolutionIVs;
-  }
 
   void print_factors_and_types(raw_ostream &OS) const;
   void print_fixups(raw_ostream &OS) const;
@@ -5595,11 +5589,6 @@ void LSRInstance::ImplementSolution(
       GenerateIVChain(Chain, Rewriter, DeadInsts);
     Changed = true;
   }
-
-  for (const WeakVH &IV : Rewriter.getInsertedIVs())
-    if (IV && dyn_cast<Instruction>(&*IV)->getParent())
-      ScalarEvolutionIVs.push_back(IV);
-
   // Clean up after ourselves. This must be done before deleting any
   // instructions.
   Rewriter.clear();
@@ -5870,389 +5859,87 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MemorySSAWrapperPass>();
 }
 
-struct SCEVDbgValueBuilder {
-  SCEVDbgValueBuilder() = default;
-  SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) {
-    Values = Base.Values;
-    Expr = Base.Expr;
-  }
+using EqualValues = SmallVector<std::tuple<WeakVH, int64_t>, 4>;
+using EqualValuesMap =
+    DenseMap<DbgValueInst *, SmallVector<std::pair<unsigned, EqualValues>>>;
+using LocationMap =
+    DenseMap<DbgValueInst *, std::pair<DIExpression *, Metadata *>>;
 
-  /// The DIExpression as we translate the SCEV.
-  SmallVector<uint64_t, 6> Expr;
-  /// The location ops of the DIExpression.
-  SmallVector<llvm::ValueAsMetadata *, 2> Values;
-
-  void pushOperator(uint64_t Op) { Expr.push_back(Op); }
-  void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
-
-  /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
-  /// in the set of values referenced by the expression.
-  void pushValue(llvm::Value *V) {
-    Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
-    auto *It =
-        std::find(Values.begin(), Values.end(), llvm::ValueAsMetadata::get(V));
-    unsigned ArgIndex = 0;
-    if (It != Values.end()) {
-      ArgIndex = std::distance(Values.begin(), It);
-    } else {
-      ArgIndex = Values.size();
-      Values.push_back(llvm::ValueAsMetadata::get(V));
-    }
-    Expr.push_back(ArgIndex);
-  }
-
-  void pushValue(const SCEVUnknown *U) {
-    llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
-    pushValue(V);
-  }
-
-  void pushConst(const SCEVConstant *C) {
-    Expr.push_back(llvm::dwarf::DW_OP_consts);
-    Expr.push_back(C->getAPInt().getSExtValue());
-  }
-
-  /// Several SCEV types are sequences of the same arithmetic operator applied
-  /// to constants and values that may be extended or truncated.
-  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
-                          uint64_t DwarfOp) {
-    assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
-           "Expected arithmetic SCEV type");
-    bool Success = true;
-    unsigned EmitOperator = 0;
-    for (auto &Op : CommExpr->operands()) {
-      Success &= pushSCEV(Op);
-
-      if (EmitOperator >= 1)
-        pushOperator(DwarfOp);
-      ++EmitOperator;
-    }
-    return Success;
-  }
-
-  // TODO: Identify and omit noop casts.
-  bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
-    const llvm::SCEV *Inner = C->getOperand(0);
-    const llvm::Type *Type = C->getType();
-    uint64_t ToWidth = Type->getIntegerBitWidth();
-    bool Success = pushSCEV(Inner);
-    uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
-                          IsSigned ? llvm::dwarf::DW_ATE_signed
-                                   : llvm::dwarf::DW_ATE_unsigned};
-    for (const auto &Op : CastOps)
-      pushOperator(Op);
-    return Success;
-  }
-
-  // TODO: MinMax - although these haven't been encountered in the test suite.
-  bool pushSCEV(const llvm::SCEV *S) {
-    bool Success = true;
-    if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
-      pushConst(StartInt);
-
-    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
-      if(!U->getValue())
-        return false;
-      pushValue(U->getValue());
-
-    } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
-      Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
-
-    } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
-      Success &= pushSCEV(UDiv->getLHS());
-      Success &= pushSCEV(UDiv->getRHS());
-      pushOperator(llvm::dwarf::DW_OP_div);
-
-    } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
-      // Assert if a new and unknown SCEVCastEXpr type is encountered.
-      assert((isa<SCEVTruncateExpr>(Cast) || isa<SCEVZeroExtendExpr>(Cast) ||
-              isa<SCEVSignExtendExpr>(Cast) || isa<SCEVPtrToIntExpr>(Cast)) &&
-             "Unexpected cast type in SCEV.");
-      Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
-
-    } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
-      Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
-
-    } else if (isa<SCEVAddRecExpr>(S)) {
-      // Nested SCEVAddRecExpr are generated by nested loops and are currently
-      // unsupported.
-      return false;
-
-    } else {
-      return false;
-    }
-    return Success;
-  }
-
-  void setFinalExpression(llvm::DbgValueInst &DI, const DIExpression *OldExpr) {
-    // Re-state assumption that this dbg.value is not variadic. Any remaining
-    // opcodes in its expression operate on a single value already on the
-    // expression stack. Prepend our operations, which will re-compute and
-    // place that value on the expression stack.
-    assert(!DI.hasArgList());
-    auto *NewExpr =
-        DIExpression::prependOpcodes(OldExpr, Expr, /*StackValue*/ true);
-    DI.setExpression(NewExpr);
-
-    auto ValArrayRef = llvm::ArrayRef(Values);
-    DI.setRawLocation(llvm::DIArgList::get(DI.getContext(), ValArrayRef));
-  }
-
-  /// If a DVI can be emitted without a DIArgList, omit DW_OP_llvm_arg and the
-  /// location op index 0.
-  void setShortFinalExpression(llvm::DbgValueInst &DI,
-                               const DIExpression *OldExpr) {
-    assert((Expr[0] == llvm::dwarf::DW_OP_LLVM_arg && Expr[1] == 0) &&
-           "Expected DW_OP_llvm_arg and 0.");
-    DI.replaceVariableLocationOp(
-        0u, llvm::MetadataAsValue::get(DI.getContext(), Values[0]));
-
-    // See setFinalExpression: prepend our opcodes on the start of any old
-    // expression opcodes.
-    assert(!DI.hasArgList());
-    llvm::SmallVector<uint64_t, 6> FinalExpr(Expr.begin() + 2, Expr.end());
-    auto *NewExpr =
-        DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true);
-    DI.setExpression(NewExpr);
-  }
-
-  /// Once the IV and variable SCEV translation is complete, write it to the
-  /// source DVI.
-  void applyExprToDbgValue(llvm::DbgValueInst &DI,
-                           const DIExpression *OldExpr) {
-    assert(!Expr.empty() && "Unexpected empty expression.");
-    // Emit a simpler form if only a single location is referenced.
-    if (Values.size() == 1 && Expr[0] == llvm::dwarf::DW_OP_LLVM_arg &&
-        Expr[1] == 0) {
-      setShortFinalExpression(DI, OldExpr);
-    } else {
-      setFinalExpression(DI, OldExpr);
-    }
-  }
-
-  /// Return true if the combination of arithmetic operator and underlying
-  /// SCEV constant value is an identity function.
-  bool isIdentityFunction(uint64_t Op, const SCEV *S) {
-    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
-      int64_t I = C->getAPInt().getSExtValue();
-      switch (Op) {
-      case llvm::dwarf::DW_OP_plus:
-      case llvm::dwarf::DW_OP_minus:
-        return I == 0;
-      case llvm::dwarf::DW_OP_mul:
-      case llvm::dwarf::DW_OP_div:
-        return I == 1;
-      }
-    }
-    return false;
-  }
-
-  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
-  /// builder's expression stack. The stack should already contain an
-  /// expression for the iteration count, so that it can be multiplied by
-  /// the stride and added to the start.
-  /// Components of the expression are omitted if they are an identity function.
-  /// Chain (non-affine) SCEVs are not supported.
-  bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
-    assert(SAR.isAffine() && "Expected affine SCEV");
-    // TODO: Is this check needed?
-    if (isa<SCEVAddRecExpr>(SAR.getStart()))
-      return false;
-
-    const SCEV *Start = SAR.getStart();
-    const SCEV *Stride = SAR.getStepRecurrence(SE);
-
-    // Skip pushing arithmetic noops.
-    if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
-      if (!pushSCEV(Stride))
-        return false;
-      pushOperator(llvm::dwarf::DW_OP_mul);
-    }
-    if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
-      if (!pushSCEV(Start))
-        return false;
-      pushOperator(llvm::dwarf::DW_OP_plus);
-    }
-    return true;
-  }
-
-  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
-  /// builder's expression stack. The stack should already contain an
-  /// expression for the iteration count, so that it can be multiplied by
-  /// the stride and added to the start.
-  /// Components of the expression are omitted if they are an identity function.
-  bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
-                           ScalarEvolution &SE) {
-    assert(SAR.isAffine() && "Expected affine SCEV");
-    if (isa<SCEVAddRecExpr>(SAR.getStart())) {
-      LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
-                        << SAR << '\n');
-      return false;
-    }
-    const SCEV *Start = SAR.getStart();
-    const SCEV *Stride = SAR.getStepRecurrence(SE);
-
-    // Skip pushing arithmetic noops.
-    if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
-      if (!pushSCEV(Start))
-        return false;
-      pushOperator(llvm::dwarf::DW_OP_minus);
-    }
-    if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
-      if (!pushSCEV(Stride))
-        return false;
-      pushOperator(llvm::dwarf::DW_OP_div);
-    }
-    return true;
-  }
-};
-
-struct DVIRecoveryRec {
-  DbgValueInst *DVI;
-  DIExpression *Expr;
-  Metadata *LocationOp;
-  const llvm::SCEV *SCEV;
-};
-
-static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI,
-                                     const SCEVDbgValueBuilder &IterationCount,
-                                     ScalarEvolution &SE) {
-  // LSR may add locations to previously single location-op DVIs which
-  // are currently not supported.
-  if (CachedDVI.DVI->getNumVariableLocationOps() != 1)
-    return false;
-
-  // SCEVs for SSA values are most frquently of the form
-  // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
-  // This is because %a is a PHI node that is not the IV. However, these
-  // SCEVs have not been observed to result in debuginfo-lossy optimisations,
-  // so its not expected this point will be reached.
-  if (!isa<SCEVAddRecExpr>(CachedDVI.SCEV))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: "
-                    << *CachedDVI.SCEV << '\n');
-
-  const auto *Rec = cast<SCEVAddRecExpr>(CachedDVI.SCEV);
-  if (!Rec->isAffine())
-    return false;
-
-  // Initialise a new builder with the iteration count expression. In
-  // combination with the value's SCEV this enables recovery.
-  SCEVDbgValueBuilder RecoverValue(IterationCount);
-  if (!RecoverValue.SCEVToValueExpr(*Rec, SE))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n');
-  RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr);
-  LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n');
-  return true;
-}
-
-static bool
-DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE,
-                          llvm::PHINode *LSRInductionVar,
-                          SmallVector<DVIRecoveryRec, 2> &DVIToUpdate) {
-  if (DVIToUpdate.empty())
-    return false;
-
-  const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
-  assert(SCEVInductionVar &&
-         "Anticipated a SCEV for the post-LSR induction variable");
-
-  bool Changed = false;
-  if (const SCEVAddRecExpr *IVAddRec =
-          dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
-    SCEVDbgValueBuilder IterCountExpr;
-    IterCountExpr.pushValue(LSRInductionVar);
-    if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
-      return false;
-
-    LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
-                      << '\n');
-
-    // Needn't salvage if the location op hasn't been undef'd by LSR.
-    for (auto &DVIRec : DVIToUpdate) {
-      if (!DVIRec.DVI->isUndef())
-        continue;
-
-      // Some DVIs that were single location-op when cached are now multi-op,
-      // due to LSR optimisations. However, multi-op salvaging is not yet
-      // supported by SCEV salvaging. But, we can attempt a salvage by restoring
-      // the pre-LSR single-op expression.
-      if (DVIRec.DVI->hasArgList()) {
-        llvm::Type *Ty = DVIRec.DVI->getVariableLocationOp(0)->getType();
-        DVIRec.DVI->setRawLocation(
-            llvm::ValueAsMetadata::get(UndefValue::get(Ty)));
-        DVIRec.DVI->setExpression(DVIRec.Expr);
-      }
-
-      Changed |= RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE);
-    }
-  }
-  return Changed;
-}
-
-/// Identify and cache salvageable DVI locations and expressions along with the
-/// corresponding SCEV(s). Also ensure that the DVI is not deleted before
-static void
-DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
-                       SmallVector<DVIRecoveryRec, 2> &SalvageableDVISCEVs,
-                       SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
+static void DbgGatherEqualValues(Loop *L, ScalarEvolution &SE,
+                                 EqualValuesMap &DbgValueToEqualSet,
+                                 LocationMap &DbgValueToLocation) {
   for (auto &B : L->getBlocks()) {
     for (auto &I : *B) {
       auto DVI = dyn_cast<DbgValueInst>(&I);
       if (!DVI)
         continue;
-
-      if (DVI->hasArgList())
-        continue;
-
-      if (!SE.isSCEVable(DVI->getVariableLocationOp(0)->getType()))
-        continue;
-
-      SalvageableDVISCEVs.push_back(
-          {DVI, DVI->getExpression(), DVI->getRawLocation(),
-           SE.getSCEV(DVI->getVariableLocationOp(0))});
-      DVIHandles.insert(DVI);
+      for (unsigned Idx = 0; Idx < DVI->getNumVariableLocationOps(); ++Idx) {
+        // TODO: We can duplicate results if the same arg appears more than
+        // once.
+        Value *V = DVI->getVariableLocationOp(Idx);
+        if (!V || !SE.isSCEVable(V->getType()))
+          continue;
+        auto DbgValueSCEV = SE.getSCEV(V);
+        EqualValues EqSet;
+        for (PHINode &Phi : L->getHeader()->phis()) {
+          if (V->getType() != Phi.getType())
+            continue;
+          if (!SE.isSCEVable(Phi.getType()))
+            continue;
+          auto PhiSCEV = SE.getSCEV(&Phi);
+          Optional<APInt> Offset =
+              SE.computeConstantDifference(DbgValueSCEV, PhiSCEV);
+          if (Offset && Offset->getMinSignedBits() <= 64)
+            EqSet.emplace_back(
+                std::make_tuple(&Phi, Offset.getValue().getSExtValue()));
+        }
+        DbgValueToEqualSet[DVI].push_back({Idx, std::move(EqSet)});
+        // If we fall back to using this raw location, at least one location op
+        // must be dead. A DIArgList will automatically undef arguments when
+        // they become unavailable, but a ValueAsMetadata will not; since we
+        // know the value should be undef, we use the undef value directly here.
+        Metadata *RawLocation =
+            DVI->hasArgList() ? DVI->getRawLocation()
+                              : ValueAsMetadata::get(UndefValue::get(
+                                    DVI->getVariableLocationOp(0)->getType()));
+        DbgValueToLocation[DVI] = {DVI->getExpression(), RawLocation};
+      }
     }
   }
 }
 
-/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
-/// any PHi from the loop header is usable, but may have less chance of
-/// surviving subsequent transforms.
-static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
-                                           const LSRInstance &LSR) {
-  // For now, just pick the first IV generated and inserted. Ideally pick an IV
-  // that is unlikely to be optimised away by subsequent transforms.
-  for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
-    if (!IV)
+static void DbgApplyEqualValues(EqualValuesMap &DbgValueToEqualSet,
+                                LocationMap &DbgValueToLocation) {
+  for (auto A : DbgValueToEqualSet) {
+    auto *DVI = A.first;
+    // Only update those that are now undef.
+    if (!DVI->isUndef())
       continue;
-
-    assert(isa<PHINode>(&*IV) && "Expected PhI node.");
-    if (SE.isSCEVable((*IV).getType())) {
-      PHINode *Phi = dyn_cast<PHINode>(&*IV);
-      LLVM_DEBUG(const llvm::SCEV *S = SE.getSCEV(Phi);
-                 dbgs() << "scev-salvage: IV : " << *IV << "with SCEV: " << *S
-                        << "\n");
-      return Phi;
+    // The dbg.value may have had its value or expression changed during LSR by
+    // a failed salvage attempt; refresh them from the map.
+    auto *DbgDIExpr = DbgValueToLocation[DVI].first;
+    DVI->setRawLocation(DbgValueToLocation[DVI].second);
+    DVI->setExpression(DbgDIExpr);
+    assert(DVI->isUndef() && "dbg.value with non-undef location should not "
+                             "have been modified by LSR.");
+    for (auto IdxEV : A.second) {
+      unsigned Idx = IdxEV.first;
+      for (auto EV : IdxEV.second) {
+        auto EVHandle = std::get<WeakVH>(EV);
+        if (!EVHandle)
+          continue;
+        int64_t Offset = std::get<int64_t>(EV);
+        DVI->replaceVariableLocationOp(Idx, EVHandle);
+        if (Offset) {
+          SmallVector<uint64_t, 8> Ops;
+          DIExpression::appendOffset(Ops, Offset);
+          DbgDIExpr = DIExpression::appendOpsToArg(DbgDIExpr, Ops, Idx, true);
+        }
+        DVI->setExpression(DbgDIExpr);
+        break;
+      }
     }
   }
-
-  for (PHINode &Phi : L.getHeader()->phis()) {
-    if (!SE.isSCEVable(Phi.getType()))
-      continue;
-
-    const llvm::SCEV *PhiSCEV = SE.getSCEV(&Phi);
-    if (const llvm::SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(PhiSCEV))
-      if (!Rec->isAffine())
-        continue;
-
-    LLVM_DEBUG(dbgs() << "scev-salvage: Selected IV from loop header: " << Phi
-                      << " with SCEV: " << *PhiSCEV << "\n");
-    return &Phi;
-  }
-  return nullptr;
 }
 
 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
@@ -6261,21 +5948,20 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo &TLI,
                                MemorySSA *MSSA) {
 
-  // Debug preservation - before we start removing anything identify which DVI
-  // meet the salvageable criteria and store their DIExpression and SCEVs.
-  SmallVector<DVIRecoveryRec, 2> SalvageableDVI;
-  SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
-  DbgGatherSalvagableDVI(L, SE, SalvageableDVI, DVIHandles);
-
   bool Changed = false;
   std::unique_ptr<MemorySSAUpdater> MSSAU;
   if (MSSA)
     MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
 
   // Run the main LSR transformation.
-  const LSRInstance &Reducer =
-      LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
-  Changed |= Reducer.getChanged();
+  Changed |=
+      LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
+
+  // Debug preservation - before we start removing anything create equivalence
+  // sets for the llvm.dbg.value intrinsics.
+  EqualValuesMap DbgValueToEqualSet;
+  LocationMap DbgValueToLocation;
+  DbgGatherEqualValues(L, SE, DbgValueToEqualSet, DbgValueToLocation);
 
   // Remove any extra phis created by processing inner loops.
   Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
@@ -6295,22 +5981,8 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
     }
   }
 
-  if (SalvageableDVI.empty())
-    return Changed;
+  DbgApplyEqualValues(DbgValueToEqualSet, DbgValueToLocation);
 
-  // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
-  // expressions composed using the derived iteration count.
-  // TODO: Allow for multiple IV references for nested AddRecSCEVs
-  for (auto &L : LI) {
-    if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
-      DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVI);
-    else {
-      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
-                           "could not be identified.\n");
-    }
-  }
-
-  DVIHandles.clear();
   return Changed;
 }
 
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3978e1e29825..5af1c37e6197 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1393,10 +1393,9 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
   // can ensure that IVIncrement dominates the current uses.
PostIncLoops = SavedPostIncLoops; - // Remember this PHI, even in post-inc mode. LSR SCEV-based salvaging is most - // effective when we are able to use an IV inserted here, so record it. + // Remember this PHI, even in post-inc mode. InsertedValues.insert(PN); - InsertedIVs.push_back(PN); + return PN; } From 9cb5bdb8b26e2207293f0fb56701c4a0ff64a47d Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 21 Aug 2021 23:25:07 +0200 Subject: [PATCH 2/3] Vendor import of llvm-project branch release/13.x llvmorg-13.0.0-rc1-0-gd6974c010878. --- .../clang/Basic/DiagnosticDriverKinds.td | 2 - .../include/clang/Basic/DiagnosticLexKinds.td | 7 + .../clang/Basic/DiagnosticSemaKinds.td | 2 - clang/include/clang/Basic/LangOptions.def | 2 +- clang/include/clang/Driver/Options.td | 3 - clang/include/clang/Driver/Types.h | 8 - .../Frontend/PreprocessorOutputOptions.h | 2 - clang/include/clang/Lex/HeaderSearch.h | 17 +- clang/include/clang/Lex/Preprocessor.h | 5 +- clang/include/clang/Lex/PreprocessorLexer.h | 20 +- clang/lib/Basic/OpenCLOptions.cpp | 7 +- clang/lib/Basic/TargetInfo.cpp | 10 +- clang/lib/Basic/Targets/AArch64.cpp | 6 +- clang/lib/Basic/Targets/AMDGPU.h | 5 +- clang/lib/CodeGen/CGStmt.cpp | 43 ++- clang/lib/CodeGen/TargetInfo.cpp | 14 + clang/lib/CodeGen/TargetInfo.h | 7 + clang/lib/Driver/ToolChains/Clang.cpp | 15 +- clang/lib/Driver/Types.cpp | 39 -- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- .../lib/Frontend/PrintPreprocessedOutput.cpp | 341 +++++++----------- clang/lib/Lex/Lexer.cpp | 4 +- clang/lib/Lex/PPDirectives.cpp | 4 + clang/lib/Lex/PPLexerChange.cpp | 45 ++- clang/lib/Lex/Pragma.cpp | 92 ++++- clang/lib/Lex/Preprocessor.cpp | 6 - clang/lib/Parse/ParseDecl.cpp | 16 +- clang/lib/Sema/Sema.cpp | 3 +- clang/lib/Sema/SemaType.cpp | 49 ++- libcxx/include/format | 12 +- libcxx/include/ranges | 12 +- lld/ELF/Config.h | 7 +- lld/ELF/Driver.cpp | 16 +- lld/ELF/Options.td | 3 + lld/ELF/Symbols.cpp | 8 +- lld/ELF/SyntheticSections.cpp | 2 +- lld/docs/ReleaseNotes.rst | 2 + lld/docs/ld.lld.1 | 3 + llvm/include/llvm/CodeGen/TargetLowering.h | 5 + llvm/include/llvm/CodeGen/ValueTypes.td | 1 + llvm/include/llvm/Support/MachineValueType.h | 4 +- llvm/lib/Analysis/TargetLibraryInfo.cpp | 7 +- .../CodeGen/GlobalISel/InlineAsmLowering.cpp | 9 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 6 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 3 +- llvm/lib/CodeGen/ValueTypes.cpp | 2 + llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 7 + .../Target/AArch64/AArch64ISelLowering.cpp | 55 ++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 + .../lib/Target/AArch64/AArch64RegisterInfo.td | 4 +- .../Target/AArch64/Utils/AArch64BaseInfo.h | 19 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 + llvm/lib/Transforms/Scalar/DivRemPairs.cpp | 7 +- llvm/utils/TableGen/CodeGenTarget.cpp | 1 + 56 files changed, 612 insertions(+), 391 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 3b4daa59f66b..fc3704303a95 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -129,8 +129,6 @@ def err_drv_invalid_Xopenmp_target_with_args : Error< "invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">; def err_drv_argument_only_allowed_with : Error< "invalid argument '%0' only allowed with '%1'">; -def err_drv_minws_unsupported_input_type 
: Error< - "'-fminimize-whitespace' invalid for input of type %0">; def err_drv_amdgpu_ieee_without_no_honor_nans : Error< "invalid argument '-mno-amdgpu-ieee' only allowed with relaxed NaN handling">; def err_drv_argument_not_allowed_with : Error< diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index ce6d0d0394b4..bdf5d263fa92 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -300,6 +300,13 @@ def pp_pragma_once_in_main_file : Warning<"#pragma once in main file">, def pp_pragma_sysheader_in_main_file : Warning< "#pragma system_header ignored in main file">, InGroup>; + +def err_pragma_include_instead_not_sysheader : Error< + "'#pragma clang include_instead' cannot be used outside of system headers">; +def err_pragma_include_instead_system_reserved : Error< + "header '%0' is an implementation detail; #include %select{'%2'|either '%2' " + "or '%3'|one of %2}1 instead">; + def pp_poisoning_existing_macro : Warning<"poisoning existing macro">; def pp_out_of_date_dependency : Warning< "current file is older than dependency %0">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 108f1796415c..c57b8eca7deb 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10100,8 +10100,6 @@ def err_opencl_requires_extension : Error< def ext_opencl_double_without_pragma : Extension< "Clang permits use of type 'double' regardless pragma if 'cl_khr_fp64' is" " supported">; -def err_opencl_double_requires_extension : Error< - "use of type 'double' requires %select{cl_khr_fp64|cl_khr_fp64 and __opencl_c_fp64}0 support">; def warn_opencl_generic_address_space_arg : Warning< "passing non-generic address space pointer to %0" " may cause dynamic conversion affecting performance">, diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 08b8d8851afa..74deba6ef7fb 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -224,7 +224,7 @@ LANGOPT(OpenCLVersion , 32, 0, "OpenCL C version") LANGOPT(OpenCLCPlusPlus , 1, 0, "C++ for OpenCL") LANGOPT(OpenCLCPlusPlusVersion , 32, 0, "C++ for OpenCL version") LANGOPT(OpenCLGenericAddressSpace, 1, 0, "OpenCL generic keyword") -LANGOPT(OpenCLPipe , 1, 0, "OpenCL pipe keyword") +LANGOPT(OpenCLPipes , 1, 0, "OpenCL pipes language constructs and built-ins") LANGOPT(NativeHalfType , 1, 0, "Native half type support") LANGOPT(NativeHalfArgsAndReturns, 1, 0, "Native half args and returns") LANGOPT(HalfArgsAndReturns, 1, 0, "half args and returns") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5a9fd078390e..ab1a5487d9c0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1799,9 +1799,6 @@ def frewrite_map_file_EQ : Joined<["-"], "frewrite-map-file=">, defm use_line_directives : BoolFOption<"use-line-directives", PreprocessorOutputOpts<"UseLineDirectives">, DefaultFalse, PosFlag, NegFlag>; -defm minimize_whitespace : BoolFOption<"minimize-whitespace", - PreprocessorOutputOpts<"MinimizeWhitespace">, DefaultFalse, - PosFlag, NegFlag>; def ffreestanding : Flag<["-"], "ffreestanding">, Group, Flags<[CC1Option]>, HelpText<"Assert that the compilation takes place in a freestanding environment">, diff --git 
a/clang/include/clang/Driver/Types.h b/clang/include/clang/Driver/Types.h index c9d63551090c..6a1f57416ae5 100644 --- a/clang/include/clang/Driver/Types.h +++ b/clang/include/clang/Driver/Types.h @@ -66,14 +66,6 @@ namespace types { /// isAcceptedByClang - Can clang handle this input type. bool isAcceptedByClang(ID Id); - /// isDerivedFromC - Is the input derived from C. - /// - /// That is, does the lexer follow the rules of - /// TokenConcatenation::AvoidConcat. If this is the case, the preprocessor may - /// add and remove whitespace between tokens. Used to determine whether the - /// input can be processed by -fminimize-whitespace. - bool isDerivedFromC(ID Id); - /// isCXX - Is this a "C++" input (C++ and Obj-C++ sources and headers). bool isCXX(ID Id); diff --git a/clang/include/clang/Frontend/PreprocessorOutputOptions.h b/clang/include/clang/Frontend/PreprocessorOutputOptions.h index 257538ee0606..72e5ad1137fb 100644 --- a/clang/include/clang/Frontend/PreprocessorOutputOptions.h +++ b/clang/include/clang/Frontend/PreprocessorOutputOptions.h @@ -24,7 +24,6 @@ public: unsigned ShowIncludeDirectives : 1; ///< Print includes, imports etc. within preprocessed output. unsigned RewriteIncludes : 1; ///< Preprocess include directives only. unsigned RewriteImports : 1; ///< Include contents of transitively-imported modules. - unsigned MinimizeWhitespace : 1; ///< Ignore whitespace from input. public: PreprocessorOutputOptions() { @@ -37,7 +36,6 @@ public: ShowIncludeDirectives = 0; RewriteIncludes = 0; RewriteImports = 0; - MinimizeWhitespace = 0; } }; diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h index 93d6ea72270a..a35a394f719b 100644 --- a/clang/include/clang/Lex/HeaderSearch.h +++ b/clang/include/clang/Lex/HeaderSearch.h @@ -20,9 +20,12 @@ #include "clang/Lex/ModuleMap.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/Allocator.h" #include #include @@ -110,6 +113,14 @@ struct HeaderFileInfo { /// of the framework. StringRef Framework; + /// List of aliases that this header is known as. + /// Most headers should only have at most one alias, but a handful + /// have two. + llvm::SetVector, + llvm::SmallVector, 2>, + llvm::SmallSet, 2>> + Aliases; + HeaderFileInfo() : isImport(false), isPragmaOnce(false), DirInfo(SrcMgr::C_User), External(false), isModuleHeader(false), isCompilingModuleHeader(false), @@ -453,6 +464,10 @@ public: getFileInfo(File).DirInfo = SrcMgr::C_System; } + void AddFileAlias(const FileEntry *File, StringRef Alias) { + getFileInfo(File).Aliases.insert(Alias); + } + /// Mark the specified file as part of a module. void MarkFileModuleHeader(const FileEntry *FE, ModuleMap::ModuleHeaderRole Role, diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 7ab13640ce2c..fe2327f0a480 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -1953,7 +1953,8 @@ public: /// This either returns the EOF token and returns true, or /// pops a level off the include stack and returns false, at which point the /// client should call lex again. 
- bool HandleEndOfFile(Token &Result, bool isEndOfMacro = false); + bool HandleEndOfFile(Token &Result, SourceLocation Loc, + bool isEndOfMacro = false); /// Callback invoked when the current TokenLexer hits the end of its /// token stream. @@ -2363,12 +2364,14 @@ private: // Pragmas. void HandlePragmaDirective(PragmaIntroducer Introducer); + void ResolvePragmaIncludeInstead(SourceLocation Location) const; public: void HandlePragmaOnce(Token &OnceTok); void HandlePragmaMark(Token &MarkTok); void HandlePragmaPoison(); void HandlePragmaSystemHeader(Token &SysHeaderTok); + void HandlePragmaIncludeInstead(Token &Tok); void HandlePragmaDependency(Token &DependencyTok); void HandlePragmaPushMacro(Token &Tok); void HandlePragmaPopMacro(Token &Tok); diff --git a/clang/include/clang/Lex/PreprocessorLexer.h b/clang/include/clang/Lex/PreprocessorLexer.h index 03b1cc2c10e2..b43197a6031c 100644 --- a/clang/include/clang/Lex/PreprocessorLexer.h +++ b/clang/include/clang/Lex/PreprocessorLexer.h @@ -14,11 +14,13 @@ #ifndef LLVM_CLANG_LEX_PREPROCESSORLEXER_H #define LLVM_CLANG_LEX_PREPROCESSORLEXER_H +#include "clang/Basic/SourceLocation.h" +#include "clang/Lex/HeaderSearch.h" #include "clang/Lex/MultipleIncludeOpt.h" #include "clang/Lex/Token.h" -#include "clang/Basic/SourceLocation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include namespace clang { @@ -74,6 +76,13 @@ protected: /// we are currently in. SmallVector ConditionalStack; + struct IncludeInfo { + const FileEntry *File; + SourceLocation Location; + }; + // A complete history of all the files included by the current file. + llvm::StringMap IncludeHistory; + PreprocessorLexer() : FID() {} PreprocessorLexer(Preprocessor *pp, FileID fid); virtual ~PreprocessorLexer() = default; @@ -175,6 +184,15 @@ public: ConditionalStack.clear(); ConditionalStack.append(CL.begin(), CL.end()); } + + void addInclude(StringRef Filename, const FileEntry &File, + SourceLocation Location) { + IncludeHistory.insert({Filename, {&File, Location}}); + } + + const llvm::StringMap &getIncludeHistory() const { + return IncludeHistory; + } }; } // namespace clang diff --git a/clang/lib/Basic/OpenCLOptions.cpp b/clang/lib/Basic/OpenCLOptions.cpp index 2e215b185f66..b7408f39bdab 100644 --- a/clang/lib/Basic/OpenCLOptions.cpp +++ b/clang/lib/Basic/OpenCLOptions.cpp @@ -111,7 +111,9 @@ bool OpenCLOptions::diagnoseUnsupportedFeatureDependencies( // Feature pairs. First feature in a pair requires the second one to be // supported. static const llvm::StringMap DependentFeaturesMap = { - {"__opencl_c_read_write_images", "__opencl_c_images"}}; + {"__opencl_c_read_write_images", "__opencl_c_images"}, + {"__opencl_c_3d_image_writes", "__opencl_c_images"}, + {"__opencl_c_pipes", "__opencl_c_generic_address_space"}}; auto OpenCLFeaturesMap = TI.getSupportedOpenCLOpts(); @@ -130,7 +132,8 @@ bool OpenCLOptions::diagnoseFeatureExtensionDifferences( const TargetInfo &TI, DiagnosticsEngine &Diags) { // Extensions and equivalent feature pairs. 
static const llvm::StringMap FeatureExtensionMap = { - {"cl_khr_fp64", "__opencl_c_fp64"}}; + {"cl_khr_fp64", "__opencl_c_fp64"}, + {"cl_khr_3d_image_writes", "__opencl_c_3d_image_writes"}}; auto OpenCLFeaturesMap = TI.getSupportedOpenCLOpts(); diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index b647a2fb8a67..5f8e04c2bd6c 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -400,14 +400,18 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) { // OpenCL C v3.0 s6.7.5 - The generic address space requires support for // OpenCL C 2.0 or OpenCL C 3.0 with the __opencl_c_generic_address_space // feature - // FIXME: OpenCLGenericAddressSpace is also defined in setLangDefaults() + // OpenCL C v3.0 s6.2.1 - OpenCL pipes require support of OpenCL C 2.0 + // or later and __opencl_c_pipes feature + // FIXME: These language options are also defined in setLangDefaults() // for OpenCL C 2.0 but with no access to target capabilities. Target - // should be immutable once created and thus this language option needs + // should be immutable once created and thus these language options need // to be defined only once. - if (Opts.OpenCLVersion >= 300) { + if (Opts.OpenCLVersion == 300) { const auto &OpenCLFeaturesMap = getSupportedOpenCLOpts(); Opts.OpenCLGenericAddressSpace = hasFeatureEnabled( OpenCLFeaturesMap, "__opencl_c_generic_address_space"); + Opts.OpenCLPipes = + hasFeatureEnabled(OpenCLFeaturesMap, "__opencl_c_pipes"); } } diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 4070ac727d16..e163ebfa2348 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -431,7 +431,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { Feature == "sve2-aes" || Feature == "sve2-sha3" || Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" || Feature == "i8mm" || Feature == "bf16") && - (FPU & SveMode)); + (FPU & SveMode)) || + (Feature == "ls64" && HasLS64); } bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, @@ -752,6 +753,9 @@ bool AArch64TargetInfo::validateConstraintModifier( if (Size == 64) return true; + if (Size == 512) + return HasLS64; + SuggestedModifier = "w"; return false; } diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h index 244a6e044690..2e580ecf2425 100644 --- a/clang/lib/Basic/Targets/AMDGPU.h +++ b/clang/lib/Basic/Targets/AMDGPU.h @@ -310,9 +310,12 @@ public: Opts["cl_khr_mipmap_image"] = true; Opts["cl_khr_mipmap_image_writes"] = true; Opts["cl_khr_subgroups"] = true; - Opts["cl_khr_3d_image_writes"] = true; Opts["cl_amd_media_ops"] = true; Opts["cl_amd_media_ops2"] = true; + + Opts["__opencl_c_images"] = true; + Opts["__opencl_c_3d_image_writes"] = true; + Opts["cl_khr_3d_image_writes"] = true; } } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index aeb319ca1581..0a3a722fa653 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2097,7 +2097,8 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info, } else { llvm::Type *Ty = ConvertType(InputType); uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty); - if (Size <= 64 && llvm::isPowerOf2_64(Size)) { + if ((Size <= 64 && llvm::isPowerOf2_64(Size)) || + getTargetHooks().isScalarizableAsmOperand(*this, Ty)) { Ty = llvm::IntegerType::get(getLLVMContext(), Size); Ty = llvm::PointerType::getUnqual(Ty); @@ -2320,23 +2321,28 @@ void 
CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // If this is a register output, then make the inline asm return it // by-value. If this is a memory result, return the value by-reference. - bool isScalarizableAggregate = - hasAggregateEvaluationKind(OutExpr->getType()); - if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) || - isScalarizableAggregate)) { + QualType QTy = OutExpr->getType(); + const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) || + hasAggregateEvaluationKind(QTy); + if (!Info.allowsMemory() && IsScalarOrAggregate) { + Constraints += "=" + OutputConstraint; - ResultRegQualTys.push_back(OutExpr->getType()); + ResultRegQualTys.push_back(QTy); ResultRegDests.push_back(Dest); - ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType())); - if (Info.allowsRegister() && isScalarizableAggregate) { - ResultTypeRequiresCast.push_back(true); - unsigned Size = getContext().getTypeSize(OutExpr->getType()); - llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size); - ResultRegTypes.push_back(ConvTy); - } else { - ResultTypeRequiresCast.push_back(false); - ResultRegTypes.push_back(ResultTruncRegTypes.back()); + + llvm::Type *Ty = ConvertTypeForMem(QTy); + const bool RequiresCast = Info.allowsRegister() && + (getTargetHooks().isScalarizableAsmOperand(*this, Ty) || + Ty->isAggregateType()); + + ResultTruncRegTypes.push_back(Ty); + ResultTypeRequiresCast.push_back(RequiresCast); + + if (RequiresCast) { + unsigned Size = getContext().getTypeSize(QTy); + Ty = llvm::IntegerType::get(getLLVMContext(), Size); } + ResultRegTypes.push_back(Ty); // If this output is tied to an input, and if the input is larger, then // we need to set the actual result type of the inline asm node to be the // same as the input type. @@ -2638,11 +2644,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { assert(ResultTypeRequiresCast.size() <= ResultRegDests.size()); for (unsigned i = 0, e = RegResults.size(); i != e; ++i) { llvm::Value *Tmp = RegResults[i]; + llvm::Type *TruncTy = ResultTruncRegTypes[i]; // If the result type of the LLVM IR asm doesn't match the result type of // the expression, do the conversion. if (ResultRegTypes[i] != ResultTruncRegTypes[i]) { - llvm::Type *TruncTy = ResultTruncRegTypes[i]; // Truncate the integer result to the right size, note that TruncTy can be // a pointer. @@ -2672,6 +2678,11 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]); Address A = Builder.CreateBitCast(Dest.getAddress(*this), ResultRegTypes[i]->getPointerTo()); + if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) { + Builder.CreateStore(Tmp, A); + continue; + } + QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false); if (Ty.isNull()) { const Expr *OutExpr = S.getOutputExpr(i); diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index a2b68a04d351..d2cc0a699f43 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -5526,6 +5526,20 @@ public: Fn->addFnAttr("branch-target-enforcement", BPI.BranchTargetEnforcement ? 
"true" : "false"); } + + bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF, + llvm::Type *Ty) const override { + if (CGF.getTarget().hasFeature("ls64")) { + auto *ST = dyn_cast(Ty); + if (ST && ST->getNumElements() == 1) { + auto *AT = dyn_cast(ST->getElementType(0)); + if (AT && AT->getNumElements() == 8 && + AT->getElementType()->isIntegerTy(64)) + return true; + } + } + return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty); + } }; class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo { diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index e6e474544fc4..aa8bbb60a75f 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -148,6 +148,13 @@ public: return Ty; } + /// Target hook to decide whether an inline asm operand can be passed + /// by value. + virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF, + llvm::Type *Ty) const { + return false; + } + /// Adds constraints and types for result registers. virtual void addReturnRegisterOutputs( CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a4b53a640ab5..1870bd81789c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -52,9 +52,8 @@ using namespace clang; using namespace llvm::opt; static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC, - options::OPT_fminimize_whitespace, - options::OPT_fno_minimize_whitespace)) { + if (Arg *A = + Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC)) { if (!Args.hasArg(options::OPT_E) && !Args.hasArg(options::OPT__SLASH_P) && !Args.hasArg(options::OPT__SLASH_EP) && !D.CCCIsCPP()) { D.Diag(clang::diag::err_drv_argument_only_allowed_with) @@ -6068,16 +6067,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_use_line_directives, false)) CmdArgs.push_back("-fuse-line-directives"); - // -fno-minimize-whitespace is default. - if (Args.hasFlag(options::OPT_fminimize_whitespace, - options::OPT_fno_minimize_whitespace, false)) { - types::ID InputType = Inputs[0].getType(); - if (!isDerivedFromC(InputType)) - D.Diag(diag::err_drv_minws_unsupported_input_type) - << types::getTypeName(InputType); - CmdArgs.push_back("-fminimize-whitespace"); - } - // -fms-extensions=0 is default. 
if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions, IsWindowsMSVC)) diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index 3cb2d6e8f6fd..b7ccdf23cbaa 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -147,45 +147,6 @@ bool types::isAcceptedByClang(ID Id) { } } -bool types::isDerivedFromC(ID Id) { - switch (Id) { - default: - return false; - - case TY_PP_C: - case TY_C: - case TY_CL: - case TY_CLCXX: - case TY_PP_CUDA: - case TY_CUDA: - case TY_CUDA_DEVICE: - case TY_PP_HIP: - case TY_HIP: - case TY_HIP_DEVICE: - case TY_PP_ObjC: - case TY_PP_ObjC_Alias: - case TY_ObjC: - case TY_PP_CXX: - case TY_CXX: - case TY_PP_ObjCXX: - case TY_PP_ObjCXX_Alias: - case TY_ObjCXX: - case TY_RenderScript: - case TY_PP_CHeader: - case TY_CHeader: - case TY_CLHeader: - case TY_PP_ObjCHeader: - case TY_ObjCHeader: - case TY_PP_CXXHeader: - case TY_CXXHeader: - case TY_PP_ObjCXXHeader: - case TY_ObjCXXHeader: - case TY_CXXModule: - case TY_PP_CXXModule: - return true; - } -} - bool types::isObjC(ID Id) { switch (Id) { default: diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index d545e9358f04..33e5f3e99c45 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3173,7 +3173,7 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK, Opts.ZVector = 0; Opts.setDefaultFPContractMode(LangOptions::FPM_On); Opts.OpenCLCPlusPlus = Opts.CPlusPlus; - Opts.OpenCLPipe = Opts.OpenCLCPlusPlus || Opts.OpenCLVersion == 200; + Opts.OpenCLPipes = Opts.OpenCLCPlusPlus || Opts.OpenCLVersion == 200; Opts.OpenCLGenericAddressSpace = Opts.OpenCLCPlusPlus || Opts.OpenCLVersion == 200; diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index b7259569595d..24ea1ccba207 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -95,20 +95,14 @@ private: bool DumpIncludeDirectives; bool UseLineDirectives; bool IsFirstFileEntered; - bool MinimizeWhitespace; - - Token PrevTok; - Token PrevPrevTok; - public: PrintPPOutputPPCallbacks(Preprocessor &pp, raw_ostream &os, bool lineMarkers, bool defines, bool DumpIncludeDirectives, - bool UseLineDirectives, bool MinimizeWhitespace) + bool UseLineDirectives) : PP(pp), SM(PP.getSourceManager()), ConcatInfo(PP), OS(os), DisableLineMarkers(lineMarkers), DumpDefines(defines), DumpIncludeDirectives(DumpIncludeDirectives), - UseLineDirectives(UseLineDirectives), - MinimizeWhitespace(MinimizeWhitespace) { + UseLineDirectives(UseLineDirectives) { CurLine = 0; CurFilename += ""; EmittedTokensOnThisLine = false; @@ -116,13 +110,8 @@ public: FileType = SrcMgr::C_User; Initialized = false; IsFirstFileEntered = false; - - PrevTok.startToken(); - PrevPrevTok.startToken(); } - bool isMinimizeWhitespace() const { return MinimizeWhitespace; } - void setEmittedTokensOnThisLine() { EmittedTokensOnThisLine = true; } bool hasEmittedTokensOnThisLine() const { return EmittedTokensOnThisLine; } @@ -131,12 +120,7 @@ public: return EmittedDirectiveOnThisLine; } - /// Ensure that the output stream position is at the beginning of a new line - /// and inserts one if it does not. It is intended to ensure that directives - /// inserted by the directives not from the input source (such as #line) are - /// in the first column. 
To insert newlines that represent the input, use - /// MoveToLine(/*...*/, /*RequireStartOfLine=*/true). - void startNewLineIfNeeded(); + bool startNewLineIfNeeded(bool ShouldUpdateCurrentLine = true); void FileChanged(SourceLocation Loc, FileChangeReason Reason, SrcMgr::CharacteristicKind FileType, @@ -164,45 +148,18 @@ public: void PragmaAssumeNonNullBegin(SourceLocation Loc) override; void PragmaAssumeNonNullEnd(SourceLocation Loc) override; - /// Insert whitespace before emitting the next token. - /// - /// @param Tok Next token to be emitted. - /// @param RequireSpace Ensure at least one whitespace is emitted. Useful - /// if non-tokens have been emitted to the stream. - /// @param RequireSameLine Never emit newlines. Useful when semantics depend - /// on being on the same line, such as directives. - void HandleWhitespaceBeforeTok(const Token &Tok, bool RequireSpace, - bool RequireSameLine); + bool HandleFirstTokOnLine(Token &Tok); /// Move to the line of the provided source location. This will - /// return true if a newline was inserted or if - /// the requested location is the first token on the first line. - /// In these cases the next output will be the first column on the line and - /// make it possible to insert indention. The newline was inserted - /// implicitly when at the beginning of the file. - /// - /// @param Tok Token where to move to. - /// @param RequiresStartOfLine Whether the next line depends on being in the - /// first column, such as a directive. - /// - /// @return Whether column adjustments are necessary. - bool MoveToLine(const Token &Tok, bool RequireStartOfLine) { - PresumedLoc PLoc = SM.getPresumedLoc(Tok.getLocation()); - if (PLoc.isInvalid()) - return false; - bool IsFirstInFile = Tok.isAtStartOfLine() && PLoc.getLine() == 1; - return MoveToLine(PLoc.getLine(), RequireStartOfLine) || IsFirstInFile; - } - - /// Move to the line of the provided source location. Returns true if a new - /// line was inserted. - bool MoveToLine(SourceLocation Loc, bool RequireStartOfLine) { + /// return true if the output stream required adjustment or if + /// the requested location is on the first line. + bool MoveToLine(SourceLocation Loc) { PresumedLoc PLoc = SM.getPresumedLoc(Loc); if (PLoc.isInvalid()) return false; - return MoveToLine(PLoc.getLine(), RequireStartOfLine); + return MoveToLine(PLoc.getLine()) || (PLoc.getLine() == 1); } - bool MoveToLine(unsigned LineNo, bool RequireStartOfLine); + bool MoveToLine(unsigned LineNo); bool AvoidConcat(const Token &PrevPrevTok, const Token &PrevTok, const Token &Tok) { @@ -230,7 +187,7 @@ public: void PrintPPOutputPPCallbacks::WriteLineInfo(unsigned LineNo, const char *Extra, unsigned ExtraLen) { - startNewLineIfNeeded(); + startNewLineIfNeeded(/*ShouldUpdateCurrentLine=*/false); // Emit #line directives or GNU line markers depending on what mode we're in. if (UseLineDirectives) { @@ -257,57 +214,43 @@ void PrintPPOutputPPCallbacks::WriteLineInfo(unsigned LineNo, /// object. We can do this by emitting some number of \n's, or be emitting a /// #line directive. This returns false if already at the specified line, true /// if some newlines were emitted. -bool PrintPPOutputPPCallbacks::MoveToLine(unsigned LineNo, - bool RequireStartOfLine) { - // If it is required to start a new line or finish the current, insert - // vertical whitespace now and take it into account when moving to the - // expected line. 
- bool StartedNewLine = false; - if ((RequireStartOfLine && EmittedTokensOnThisLine) || - EmittedDirectiveOnThisLine) { - OS << '\n'; - StartedNewLine = true; - CurLine += 1; - EmittedTokensOnThisLine = false; - EmittedDirectiveOnThisLine = false; - } - +bool PrintPPOutputPPCallbacks::MoveToLine(unsigned LineNo) { // If this line is "close enough" to the original line, just print newlines, // otherwise print a #line directive. - if (CurLine == LineNo) { - // Nothing to do if we are already on the correct line. - } else if (!StartedNewLine && (!MinimizeWhitespace || !DisableLineMarkers) && - LineNo - CurLine == 1) { - // Printing a single line has priority over printing a #line directive, even - // when minimizing whitespace which otherwise would print #line directives - // for every single line. - OS << '\n'; - StartedNewLine = true; - } else if (!MinimizeWhitespace && LineNo - CurLine <= 8) { - const char *NewLines = "\n\n\n\n\n\n\n\n"; - OS.write(NewLines, LineNo - CurLine); - StartedNewLine = true; + if (LineNo-CurLine <= 8) { + if (LineNo-CurLine == 1) + OS << '\n'; + else if (LineNo == CurLine) + return false; // Spelling line moved, but expansion line didn't. + else { + const char *NewLines = "\n\n\n\n\n\n\n\n"; + OS.write(NewLines, LineNo-CurLine); + } } else if (!DisableLineMarkers) { // Emit a #line or line marker. WriteLineInfo(LineNo, nullptr, 0); - StartedNewLine = true; - } - - if (StartedNewLine) { - EmittedTokensOnThisLine = false; - EmittedDirectiveOnThisLine = false; + } else { + // Okay, we're in -P mode, which turns off line markers. However, we still + // need to emit a newline between tokens on different lines. + startNewLineIfNeeded(/*ShouldUpdateCurrentLine=*/false); } CurLine = LineNo; - return StartedNewLine; + return true; } -void PrintPPOutputPPCallbacks::startNewLineIfNeeded() { +bool +PrintPPOutputPPCallbacks::startNewLineIfNeeded(bool ShouldUpdateCurrentLine) { if (EmittedTokensOnThisLine || EmittedDirectiveOnThisLine) { OS << '\n'; EmittedTokensOnThisLine = false; EmittedDirectiveOnThisLine = false; + if (ShouldUpdateCurrentLine) + ++CurLine; + return true; } + + return false; } /// FileChanged - Whenever the preprocessor enters or exits a #include file @@ -330,7 +273,7 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc, if (Reason == PPCallbacks::EnterFile) { SourceLocation IncludeLoc = UserLoc.getIncludeLoc(); if (IncludeLoc.isValid()) - MoveToLine(IncludeLoc, /*RequireStartOfLine=*/false); + MoveToLine(IncludeLoc); } else if (Reason == PPCallbacks::SystemHeaderPragma) { // GCC emits the # directive for this directive on the line AFTER the // directive and emits a bunch of spaces that aren't needed. This is because @@ -347,8 +290,7 @@ void PrintPPOutputPPCallbacks::FileChanged(SourceLocation Loc, FileType = NewFileType; if (DisableLineMarkers) { - if (!MinimizeWhitespace) - startNewLineIfNeeded(); + startNewLineIfNeeded(/*ShouldUpdateCurrentLine=*/false); return; } @@ -394,13 +336,15 @@ void PrintPPOutputPPCallbacks::InclusionDirective( // In -dI mode, dump #include directives prior to dumping their content or // interpretation. if (DumpIncludeDirectives) { - MoveToLine(HashLoc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(HashLoc); const std::string TokenText = PP.getSpelling(IncludeTok); assert(!TokenText.empty()); OS << "#" << TokenText << " " << (IsAngled ? '<' : '"') << FileName << (IsAngled ? 
'>' : '"') << " /* clang -E -dI */"; setEmittedDirectiveOnThisLine(); + startNewLineIfNeeded(); } // When preprocessing, turn implicit imports into module import pragmas. @@ -409,13 +353,17 @@ void PrintPPOutputPPCallbacks::InclusionDirective( case tok::pp_include: case tok::pp_import: case tok::pp_include_next: - MoveToLine(HashLoc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(HashLoc); OS << "#pragma clang module import " << Imported->getFullModuleName(true) << " /* clang -E: implicit import for " << "#" << PP.getSpelling(IncludeTok) << " " << (IsAngled ? '<' : '"') << FileName << (IsAngled ? '>' : '"') << " */"; - setEmittedDirectiveOnThisLine(); + // Since we want a newline after the pragma, but not a #, start a + // new line immediately. + EmittedTokensOnThisLine = true; + startNewLineIfNeeded(); break; case tok::pp___include_macros: @@ -450,11 +398,11 @@ void PrintPPOutputPPCallbacks::EndModule(const Module *M) { /// Ident - Handle #ident directives when read by the preprocessor. /// void PrintPPOutputPPCallbacks::Ident(SourceLocation Loc, StringRef S) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + MoveToLine(Loc); OS.write("#ident ", strlen("#ident ")); OS.write(S.begin(), S.size()); - setEmittedTokensOnThisLine(); + EmittedTokensOnThisLine = true; } /// MacroDefined - This hook is called whenever a macro definition is seen. @@ -466,7 +414,7 @@ void PrintPPOutputPPCallbacks::MacroDefined(const Token &MacroNameTok, // Ignore __FILE__ etc. MI->isBuiltinMacro()) return; - MoveToLine(MI->getDefinitionLoc(), /*RequireStartOfLine=*/true); + MoveToLine(MI->getDefinitionLoc()); PrintMacroDefinition(*MacroNameTok.getIdentifierInfo(), *MI, PP, OS); setEmittedDirectiveOnThisLine(); } @@ -477,7 +425,7 @@ void PrintPPOutputPPCallbacks::MacroUndefined(const Token &MacroNameTok, // Only print out macro definitions in -dD mode. 
if (!DumpDefines) return; - MoveToLine(MacroNameTok.getLocation(), /*RequireStartOfLine=*/true); + MoveToLine(MacroNameTok.getLocation()); OS << "#undef " << MacroNameTok.getIdentifierInfo()->getName(); setEmittedDirectiveOnThisLine(); } @@ -498,7 +446,8 @@ void PrintPPOutputPPCallbacks::PragmaMessage(SourceLocation Loc, StringRef Namespace, PragmaMessageKind Kind, StringRef Str) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma "; if (!Namespace.empty()) OS << Namespace << ' '; @@ -523,7 +472,8 @@ void PrintPPOutputPPCallbacks::PragmaMessage(SourceLocation Loc, void PrintPPOutputPPCallbacks::PragmaDebug(SourceLocation Loc, StringRef DebugType) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma clang __debug "; OS << DebugType; @@ -533,14 +483,16 @@ void PrintPPOutputPPCallbacks::PragmaDebug(SourceLocation Loc, void PrintPPOutputPPCallbacks:: PragmaDiagnosticPush(SourceLocation Loc, StringRef Namespace) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma " << Namespace << " diagnostic push"; setEmittedDirectiveOnThisLine(); } void PrintPPOutputPPCallbacks:: PragmaDiagnosticPop(SourceLocation Loc, StringRef Namespace) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma " << Namespace << " diagnostic pop"; setEmittedDirectiveOnThisLine(); } @@ -549,7 +501,8 @@ void PrintPPOutputPPCallbacks::PragmaDiagnostic(SourceLocation Loc, StringRef Namespace, diag::Severity Map, StringRef Str) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma " << Namespace << " diagnostic "; switch (Map) { case diag::Severity::Remark: @@ -575,7 +528,8 @@ void PrintPPOutputPPCallbacks::PragmaDiagnostic(SourceLocation Loc, void PrintPPOutputPPCallbacks::PragmaWarning(SourceLocation Loc, StringRef WarningSpec, ArrayRef Ids) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma warning(" << WarningSpec << ':'; for (ArrayRef::iterator I = Ids.begin(), E = Ids.end(); I != E; ++I) OS << ' ' << *I; @@ -585,7 +539,8 @@ void PrintPPOutputPPCallbacks::PragmaWarning(SourceLocation Loc, void PrintPPOutputPPCallbacks::PragmaWarningPush(SourceLocation Loc, int Level) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma warning(push"; if (Level >= 0) OS << ", " << Level; @@ -594,14 +549,16 @@ void PrintPPOutputPPCallbacks::PragmaWarningPush(SourceLocation Loc, } void PrintPPOutputPPCallbacks::PragmaWarningPop(SourceLocation Loc) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma warning(pop)"; setEmittedDirectiveOnThisLine(); } void PrintPPOutputPPCallbacks::PragmaExecCharsetPush(SourceLocation Loc, StringRef Str) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma character_execution_set(push"; if (!Str.empty()) OS << ", " << Str; @@ -610,80 +567,64 @@ void PrintPPOutputPPCallbacks::PragmaExecCharsetPush(SourceLocation Loc, } void PrintPPOutputPPCallbacks::PragmaExecCharsetPop(SourceLocation Loc) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma character_execution_set(pop)"; setEmittedDirectiveOnThisLine(); } void PrintPPOutputPPCallbacks:: 
PragmaAssumeNonNullBegin(SourceLocation Loc) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma clang assume_nonnull begin"; setEmittedDirectiveOnThisLine(); } void PrintPPOutputPPCallbacks:: PragmaAssumeNonNullEnd(SourceLocation Loc) { - MoveToLine(Loc, /*RequireStartOfLine=*/true); + startNewLineIfNeeded(); + MoveToLine(Loc); OS << "#pragma clang assume_nonnull end"; setEmittedDirectiveOnThisLine(); } -void PrintPPOutputPPCallbacks::HandleWhitespaceBeforeTok(const Token &Tok, - bool RequireSpace, - bool RequireSameLine) { - // These tokens are not expanded to anything and don't need whitespace before - // them. - if (Tok.is(tok::eof) || - (Tok.isAnnotation() && !Tok.is(tok::annot_header_unit) && - !Tok.is(tok::annot_module_begin) && !Tok.is(tok::annot_module_end))) - return; +/// HandleFirstTokOnLine - When emitting a preprocessed file in -E mode, this +/// is called for the first token on each new line. If this really is the start +/// of a new logical line, handle it and return true, otherwise return false. +/// This may not be the start of a logical line because the "start of line" +/// marker is set for spelling lines, not expansion ones. +bool PrintPPOutputPPCallbacks::HandleFirstTokOnLine(Token &Tok) { + // Figure out what line we went to and insert the appropriate number of + // newline characters. + if (!MoveToLine(Tok.getLocation())) + return false; - if (!RequireSameLine && MoveToLine(Tok, /*RequireStartOfLine=*/false)) { - if (MinimizeWhitespace) { - // Avoid interpreting hash as a directive under -fpreprocessed. - if (Tok.is(tok::hash)) - OS << ' '; - } else { - // Print out space characters so that the first token on a line is - // indented for easy reading. - unsigned ColNo = SM.getExpansionColumnNumber(Tok.getLocation()); + // Print out space characters so that the first token on a line is + // indented for easy reading. + unsigned ColNo = SM.getExpansionColumnNumber(Tok.getLocation()); - // The first token on a line can have a column number of 1, yet still - // expect leading white space, if a macro expansion in column 1 starts - // with an empty macro argument, or an empty nested macro expansion. In - // this case, move the token to column 2. - if (ColNo == 1 && Tok.hasLeadingSpace()) - ColNo = 2; + // The first token on a line can have a column number of 1, yet still expect + // leading white space, if a macro expansion in column 1 starts with an empty + // macro argument, or an empty nested macro expansion. In this case, move the + // token to column 2. + if (ColNo == 1 && Tok.hasLeadingSpace()) + ColNo = 2; - // This hack prevents stuff like: - // #define HASH # - // HASH define foo bar - // From having the # character end up at column 1, which makes it so it - // is not handled as a #define next time through the preprocessor if in - // -fpreprocessed mode. - if (ColNo <= 1 && Tok.is(tok::hash)) - OS << ' '; + // This hack prevents stuff like: + // #define HASH # + // HASH define foo bar + // From having the # character end up at column 1, which makes it so it + // is not handled as a #define next time through the preprocessor if in + // -fpreprocessed mode. + if (ColNo <= 1 && Tok.is(tok::hash)) + OS << ' '; - // Otherwise, indent the appropriate number of spaces. 
- for (; ColNo > 1; --ColNo) - OS << ' '; - } - } else { - // Insert whitespace between the previous and next token if either - // - The caller requires it - // - The input had whitespace between them and we are not in - // whitespace-minimization mode - // - The whitespace is necessary to keep the tokens apart and there is not - // already a newline between them - if (RequireSpace || (!MinimizeWhitespace && Tok.hasLeadingSpace()) || - ((EmittedTokensOnThisLine || EmittedTokensOnThisLine) && - AvoidConcat(PrevPrevTok, PrevTok, Tok))) - OS << ' '; - } + // Otherwise, indent the appropriate number of spaces. + for (; ColNo > 1; --ColNo) + OS << ' '; - PrevPrevTok = PrevTok; - PrevTok = Tok; + return true; } void PrintPPOutputPPCallbacks::HandleNewlinesInToken(const char *TokStr, @@ -727,9 +668,9 @@ struct UnknownPragmaHandler : public PragmaHandler { Token &PragmaTok) override { // Figure out what line we went to and insert the appropriate number of // newline characters. - Callbacks->MoveToLine(PragmaTok.getLocation(), /*RequireStartOfLine=*/true); + Callbacks->startNewLineIfNeeded(); + Callbacks->MoveToLine(PragmaTok.getLocation()); Callbacks->OS.write(Prefix, strlen(Prefix)); - Callbacks->setEmittedTokensOnThisLine(); if (ShouldExpandTokens) { // The first token does not have expanded macros. Expand them, if @@ -741,16 +682,21 @@ struct UnknownPragmaHandler : public PragmaHandler { /*IsReinject=*/false); PP.Lex(PragmaTok); } + Token PrevToken; + Token PrevPrevToken; + PrevToken.startToken(); + PrevPrevToken.startToken(); // Read and print all of the pragma tokens. - bool IsFirst = true; while (PragmaTok.isNot(tok::eod)) { - Callbacks->HandleWhitespaceBeforeTok(PragmaTok, /*RequireSpace=*/IsFirst, - /*RequireSameLine=*/true); - IsFirst = false; + if (PragmaTok.hasLeadingSpace() || + Callbacks->AvoidConcat(PrevPrevToken, PrevToken, PragmaTok)) + Callbacks->OS << ' '; std::string TokSpell = PP.getSpelling(PragmaTok); Callbacks->OS.write(&TokSpell[0], TokSpell.size()); - Callbacks->setEmittedTokensOnThisLine(); + + PrevPrevToken = PrevToken; + PrevToken = PragmaTok; if (ShouldExpandTokens) PP.Lex(PragmaTok); @@ -769,41 +715,44 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, bool DropComments = PP.getLangOpts().TraditionalCPP && !PP.getCommentRetentionState(); - bool IsStartOfLine = false; char Buffer[256]; + Token PrevPrevTok, PrevTok; + PrevPrevTok.startToken(); + PrevTok.startToken(); while (1) { - // Two lines joined with line continuation ('\' as last character on the - // line) must be emitted as one line even though Tok.getLine() returns two - // different values. In this situation Tok.isAtStartOfLine() is false even - // though it may be the first token on the lexical line. When - // dropping/skipping a token that is at the start of a line, propagate the - // start-of-line-ness to the next token to not append it to the previous - // line. - IsStartOfLine = IsStartOfLine || Tok.isAtStartOfLine(); + if (Callbacks->hasEmittedDirectiveOnThisLine()) { + Callbacks->startNewLineIfNeeded(); + Callbacks->MoveToLine(Tok.getLocation()); + } - Callbacks->HandleWhitespaceBeforeTok(Tok, /*RequireSpace=*/false, - /*RequireSameLine=*/!IsStartOfLine); + // If this token is at the start of a line, emit newlines if needed. + if (Tok.isAtStartOfLine() && Callbacks->HandleFirstTokOnLine(Tok)) { + // done. + } else if (Tok.hasLeadingSpace() || + // If we haven't emitted a token on this line yet, PrevTok isn't + // useful to look at and no concatenation could happen anyway. 
+ (Callbacks->hasEmittedTokensOnThisLine() && + // Don't print "-" next to "-", it would form "--". + Callbacks->AvoidConcat(PrevPrevTok, PrevTok, Tok))) { + OS << ' '; + } if (DropComments && Tok.is(tok::comment)) { // Skip comments. Normally the preprocessor does not generate // tok::comment nodes at all when not keeping comments, but under // -traditional-cpp the lexer keeps /all/ whitespace, including comments. - PP.Lex(Tok); - continue; + SourceLocation StartLoc = Tok.getLocation(); + Callbacks->MoveToLine(StartLoc.getLocWithOffset(Tok.getLength())); } else if (Tok.is(tok::eod)) { // Don't print end of directive tokens, since they are typically newlines // that mess up our line tracking. These come from unknown pre-processor // directives or hash-prefixed comments in standalone assembly files. PP.Lex(Tok); - // FIXME: The token on the next line after #include should have - // Tok.isAtStartOfLine() set. - IsStartOfLine = true; continue; } else if (Tok.is(tok::annot_module_include)) { // PrintPPOutputPPCallbacks::InclusionDirective handles producing // appropriate output here. Ignore this token entirely. PP.Lex(Tok); - IsStartOfLine = true; continue; } else if (Tok.is(tok::annot_module_begin)) { // FIXME: We retrieve this token after the FileChanged callback, and @@ -815,13 +764,11 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, Callbacks->BeginModule( reinterpret_cast(Tok.getAnnotationValue())); PP.Lex(Tok); - IsStartOfLine = true; continue; } else if (Tok.is(tok::annot_module_end)) { Callbacks->EndModule( reinterpret_cast(Tok.getAnnotationValue())); PP.Lex(Tok); - IsStartOfLine = true; continue; } else if (Tok.is(tok::annot_header_unit)) { // This is a header-name that has been (effectively) converted into a @@ -849,17 +796,8 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, // Tokens that can contain embedded newlines need to adjust our current // line number. - // FIXME: The token may end with a newline in which case - // setEmittedDirectiveOnThisLine/setEmittedTokensOnThisLine afterwards is - // wrong. if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown) Callbacks->HandleNewlinesInToken(TokPtr, Len); - if (Tok.is(tok::comment) && Len >= 2 && TokPtr[0] == '/' && - TokPtr[1] == '/') { - // It's a line comment; - // Ensure that we don't concatenate anything behind it. - Callbacks->setEmittedDirectiveOnThisLine(); - } } else { std::string S = PP.getSpelling(Tok); OS.write(S.data(), S.size()); @@ -868,17 +806,13 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok, // line number. if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown) Callbacks->HandleNewlinesInToken(S.data(), S.size()); - if (Tok.is(tok::comment) && S.size() >= 2 && S[0] == '/' && S[1] == '/') { - // It's a line comment; - // Ensure that we don't concatenate anything behind it. - Callbacks->setEmittedDirectiveOnThisLine(); - } } Callbacks->setEmittedTokensOnThisLine(); - IsStartOfLine = false; if (Tok.is(tok::eof)) break; + PrevPrevTok = PrevTok; + PrevTok = Tok; PP.Lex(Tok); } } @@ -936,8 +870,7 @@ void clang::DoPrintPreprocessedInput(Preprocessor &PP, raw_ostream *OS, PrintPPOutputPPCallbacks *Callbacks = new PrintPPOutputPPCallbacks( PP, *OS, !Opts.ShowLineMarkers, Opts.ShowMacros, - Opts.ShowIncludeDirectives, Opts.UseLineDirectives, - Opts.MinimizeWhitespace); + Opts.ShowIncludeDirectives, Opts.UseLineDirectives); // Expand macros in pragmas with -fms-extensions. 
The assumption is that // the majority of pragmas in such a file will be Microsoft pragmas. diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 3034af231e0e..64944492eb99 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -2811,11 +2811,11 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { ConditionalStack.pop_back(); } + SourceLocation EndLoc = getSourceLocation(BufferEnd); // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue // a pedwarn. if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { DiagnosticsEngine &Diags = PP->getDiagnostics(); - SourceLocation EndLoc = getSourceLocation(BufferEnd); unsigned DiagID; if (LangOpts.CPlusPlus11) { @@ -2838,7 +2838,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { BufferPtr = CurPtr; // Finally, let the preprocessor handle this. - return PP->HandleEndOfFile(Result, isPragmaLexer()); + return PP->HandleEndOfFile(Result, EndLoc, isPragmaLexer()); } /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 556dd8daf652..3fa8746653b0 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2022,6 +2022,10 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( IsFrameworkFound, IsImportDecl, IsMapped, LookupFrom, LookupFromFile, LookupFilename, RelativePath, SearchPath, SuggestedModule, isAngled); + // Record the header's filename for later use. + if (File) + CurLexer->addInclude(OriginalFilename, File->getFileEntry(), FilenameLoc); + if (usingPCHWithThroughHeader() && SkippingUntilPCHThroughHeader) { if (File && isPCHThroughHeader(&File->getFileEntry())) SkippingUntilPCHThroughHeader = false; diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index b979b965f46a..16170969a322 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "clang/Basic/FileManager.h" +#include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/LexDiagnostic.h" @@ -22,6 +23,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBufferRef.h" #include "llvm/Support/Path.h" + using namespace clang; //===----------------------------------------------------------------------===// @@ -299,10 +301,46 @@ void Preprocessor::diagnoseMissingHeaderInUmbrellaDir(const Module &Mod) { } } +void Preprocessor::ResolvePragmaIncludeInstead( + const SourceLocation Location) const { + assert(Location.isValid()); + if (CurLexer == nullptr) + return; + + if (SourceMgr.isInSystemHeader(Location)) + return; + + for (const auto &Include : CurLexer->getIncludeHistory()) { + StringRef Filename = Include.getKey(); + const PreprocessorLexer::IncludeInfo &Info = Include.getValue(); + ArrayRef> Aliases = + HeaderInfo.getFileInfo(Info.File).Aliases.getArrayRef(); + + if (Aliases.empty()) + continue; + + switch (Aliases.size()) { + case 1: + Diag(Info.Location, diag::err_pragma_include_instead_system_reserved) + << Filename << 0 << Aliases[0]; + continue; + case 2: + Diag(Info.Location, diag::err_pragma_include_instead_system_reserved) + << Filename << 1 << Aliases[0] << Aliases[1]; + continue; + default: { + Diag(Info.Location, diag::err_pragma_include_instead_system_reserved) + << Filename << 2 << ("{'" + 
llvm::join(Aliases, "', '") + "'}"); + } + } + } +} + /// HandleEndOfFile - This callback is invoked when the lexer hits the end of /// the current file. This either returns the EOF token or pops a level off /// the include stack and keeps going. -bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { +bool Preprocessor::HandleEndOfFile(Token &Result, SourceLocation EndLoc, + bool isEndOfMacro) { assert(!CurTokenLexer && "Ending a file when currently in a macro!"); @@ -372,6 +410,9 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { } } + if (EndLoc.isValid()) + ResolvePragmaIncludeInstead(EndLoc); + // Complain about reaching a true EOF within arc_cf_code_audited. // We don't want to complain about reaching the end of a macro // instantiation or a _Pragma. @@ -560,7 +601,7 @@ bool Preprocessor::HandleEndOfTokenLexer(Token &Result) { TokenLexerCache[NumCachedTokenLexers++] = std::move(CurTokenLexer); // Handle this like a #include file being popped off the stack. - return HandleEndOfFile(Result, true); + return HandleEndOfFile(Result, {}, true); } /// RemoveTopOfLexerStack - Pop the current lexer/macro exp off the top of the diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp index c89061ba6d02..27765af34fed 100644 --- a/clang/lib/Lex/Pragma.cpp +++ b/clang/lib/Lex/Pragma.cpp @@ -13,6 +13,7 @@ #include "clang/Lex/Pragma.h" #include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticLex.h" #include "clang/Basic/FileManager.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" @@ -35,11 +36,12 @@ #include "clang/Lex/TokenLexer.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Timer.h" @@ -495,43 +497,88 @@ void Preprocessor::HandlePragmaSystemHeader(Token &SysHeaderTok) { SrcMgr::C_System); } -/// HandlePragmaDependency - Handle \#pragma GCC dependency "foo" blah. -void Preprocessor::HandlePragmaDependency(Token &DependencyTok) { +static llvm::Optional LexHeader(Preprocessor &PP, + Optional &File, + bool SuppressIncludeNotFoundError) { Token FilenameTok; - if (LexHeaderName(FilenameTok, /*AllowConcatenation*/false)) - return; + if (PP.LexHeaderName(FilenameTok, /*AllowConcatenation*/ false)) + return llvm::None; // If the next token wasn't a header-name, diagnose the error. if (FilenameTok.isNot(tok::header_name)) { - Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename); - return; + PP.Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename); + return llvm::None; } // Reserve a buffer to get the spelling. SmallString<128> FilenameBuffer; bool Invalid = false; - StringRef Filename = getSpelling(FilenameTok, FilenameBuffer, &Invalid); + StringRef Filename = PP.getSpelling(FilenameTok, FilenameBuffer, &Invalid); if (Invalid) - return; + return llvm::None; bool isAngled = - GetIncludeFilenameSpelling(FilenameTok.getLocation(), Filename); + PP.GetIncludeFilenameSpelling(FilenameTok.getLocation(), Filename); // If GetIncludeFilenameSpelling set the start ptr to null, there was an // error. if (Filename.empty()) - return; + return llvm::None; // Search include directories for this file. 
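  // This helper serves pragmas that carry a header-name operand, e.g.
  //   #pragma GCC dependency "parser.y"
  //   #pragma clang include_instead(<public_header.h>)
  // (illustrative directives only); the operand is resolved through the
  // same search path a #include would use.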
const DirectoryLookup *CurDir; - Optional File = - LookupFile(FilenameTok.getLocation(), Filename, isAngled, nullptr, - nullptr, CurDir, nullptr, nullptr, nullptr, nullptr, nullptr); + File = PP.LookupFile(FilenameTok.getLocation(), Filename, isAngled, nullptr, + nullptr, CurDir, nullptr, nullptr, nullptr, nullptr, + nullptr); if (!File) { if (!SuppressIncludeNotFoundError) - Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; + PP.Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; + return llvm::None; + } + + return FilenameTok; +} + +/// HandlePragmaIncludeInstead - Handle \#pragma clang include_instead(header). +void Preprocessor::HandlePragmaIncludeInstead(Token &Tok) { + // Get the current file lexer we're looking at. Ignore _Pragma 'files' etc. + PreprocessorLexer *TheLexer = getCurrentFileLexer(); + + if (!SourceMgr.isInSystemHeader(Tok.getLocation())) { + Diag(Tok, diag::err_pragma_include_instead_not_sysheader); return; } + Lex(Tok); + if (Tok.isNot(tok::l_paren)) { + Diag(Tok, diag::err_expected) << "("; + return; + } + + Optional File; + llvm::Optional FilenameTok = + LexHeader(*this, File, SuppressIncludeNotFoundError); + if (!FilenameTok) + return; + + Lex(Tok); + if (Tok.isNot(tok::r_paren)) { + Diag(Tok, diag::err_expected) << ")"; + return; + } + + SmallString<128> FilenameBuffer; + StringRef Filename = getSpelling(*FilenameTok, FilenameBuffer); + HeaderInfo.AddFileAlias(TheLexer->getFileEntry(), Filename); +} + +/// HandlePragmaDependency - Handle \#pragma GCC dependency "foo" blah. +void Preprocessor::HandlePragmaDependency(Token &DependencyTok) { + Optional File; + llvm::Optional FilenameTok = + LexHeader(*this, File, SuppressIncludeNotFoundError); + if (!FilenameTok) + return; + const FileEntry *CurFile = getCurrentFileLexer()->getFileEntry(); // If this file is older than the file it depends on, emit a diagnostic. @@ -547,7 +594,7 @@ void Preprocessor::HandlePragmaDependency(Token &DependencyTok) { // Remove the trailing ' ' if present. if (!Message.empty()) Message.erase(Message.end()-1); - Diag(FilenameTok, diag::pp_out_of_date_dependency) << Message; + Diag(*FilenameTok, diag::pp_out_of_date_dependency) << Message; } } @@ -1022,6 +1069,18 @@ struct PragmaSystemHeaderHandler : public PragmaHandler { } }; +/// PragmaIncludeInsteadHandler - "\#pragma clang include_instead(header)" marks +/// the current file as non-includable if the including header is not a system +/// header. +struct PragmaIncludeInsteadHandler : public PragmaHandler { + PragmaIncludeInsteadHandler() : PragmaHandler("include_instead") {} + + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &IIToken) override { + PP.HandlePragmaIncludeInstead(IIToken); + } +}; + struct PragmaDependencyHandler : public PragmaHandler { PragmaDependencyHandler() : PragmaHandler("dependency") {} @@ -1934,6 +1993,7 @@ void Preprocessor::RegisterBuiltinPragmas() { // #pragma clang ... 
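  // Sketch of the handler registered below: a private system header can
  // redirect its users with
  //   #pragma clang include_instead(<public_header.h>)
  // so that a direct #include of the private header from user code is
  // diagnosed with the suggested replacement(s).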
AddPragmaHandler("clang", new PragmaPoisonHandler()); AddPragmaHandler("clang", new PragmaSystemHeaderHandler()); + AddPragmaHandler("clang", new PragmaIncludeInsteadHandler()); AddPragmaHandler("clang", new PragmaDebugHandler()); AddPragmaHandler("clang", new PragmaDependencyHandler()); AddPragmaHandler("clang", new PragmaDiagnosticHandler("clang")); diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 32ea8791d29a..e376fff90432 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -716,12 +716,6 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const { } // Update the token info (identifier info and appropriate token kind). - // FIXME: the raw_identifier may contain leading whitespace which is removed - // from the cleaned identifier token. The SourceLocation should be updated to - // refer to the non-whitespace character. For instance, the text "\\\nB" (a - // line continuation before 'B') is parsed as a single tok::raw_identifier and - // is cleaned to tok::identifier "B". After cleaning the token's length is - // still 3 and the SourceLocation refers to the location of the backslash. Identifier.setIdentifierInfo(II); if (getLangOpts().MSVCCompat && II->isCPlusPlusOperatorKeyword() && getSourceManager().isInSystemHeader(Identifier.getLocation())) diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index f4f5f461e3b6..939323517b4d 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3952,8 +3952,12 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS, Tok.getIdentifierInfo()->revertTokenIDToIdentifier(); Tok.setKind(tok::identifier); goto DoneWithDeclSpec; - } - isInvalid = DS.SetTypePipe(true, Loc, PrevSpec, DiagID, Policy); + } else if (!getLangOpts().OpenCLPipes) { + DiagID = diag::err_opencl_unknown_type_specifier; + PrevSpec = Tok.getIdentifierInfo()->getNameStart(); + isInvalid = true; + } else + isInvalid = DS.SetTypePipe(true, Loc, PrevSpec, DiagID, Policy); break; // We only need to enumerate each image type once. #define IMAGE_READ_WRITE_TYPE(Type, Id, Ext) @@ -5126,8 +5130,10 @@ bool Parser::isDeclarationSpecifier(bool DisambiguatingWithExpression) { switch (Tok.getKind()) { default: return false; + // OpenCL 2.0 and later define this keyword. case tok::kw_pipe: - return getLangOpts().OpenCLPipe; + return (getLangOpts().OpenCL && getLangOpts().OpenCLVersion >= 200) || + getLangOpts().OpenCLCPlusPlus; case tok::identifier: // foo::bar // Unfortunate hack to support "Class.factoryMethod" notation. @@ -5656,7 +5662,9 @@ static bool isPtrOperatorToken(tok::TokenKind Kind, const LangOptions &Lang, if (Kind == tok::star || Kind == tok::caret) return true; - if (Kind == tok::kw_pipe && Lang.OpenCLPipe) + // OpenCL 2.0 and later define this keyword. 
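  // For example (OpenCL C 2.0):
  //   kernel void k(read_only pipe int p);
  // 'pipe' must be accepted here alongside '*' and '^' when parsing the
  // declarator.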
+ if (Kind == tok::kw_pipe && + ((Lang.OpenCL && Lang.OpenCLVersion >= 200) || Lang.OpenCLCPlusPlus)) return true; if (!Lang.CPlusPlus) diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 5d3de06e9576..a54bd8719178 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -327,7 +327,8 @@ void Sema::Initialize() { if (getLangOpts().OpenCLCPlusPlus || getLangOpts().OpenCLVersion >= 200) { addImplicitTypedef("clk_event_t", Context.OCLClkEventTy); addImplicitTypedef("queue_t", Context.OCLQueueTy); - addImplicitTypedef("reserve_id_t", Context.OCLReserveIDTy); + if (getLangOpts().OpenCLPipes) + addImplicitTypedef("reserve_id_t", Context.OCLReserveIDTy); addImplicitTypedef("atomic_int", Context.getAtomicType(Context.IntTy)); addImplicitTypedef("atomic_uint", Context.getAtomicType(Context.UnsignedIntTy)); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index b78331cdfe91..bca21b351c91 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -1525,18 +1525,20 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) { break; case DeclSpec::TST_float: Result = Context.FloatTy; break; case DeclSpec::TST_double: - if (S.getLangOpts().OpenCL) { - if (!S.getOpenCLOptions().isSupported("cl_khr_fp64", S.getLangOpts())) - S.Diag(DS.getTypeSpecTypeLoc(), - diag::err_opencl_double_requires_extension) - << (S.getLangOpts().OpenCLVersion >= 300); - else if (!S.getOpenCLOptions().isAvailableOption("cl_khr_fp64", S.getLangOpts())) - S.Diag(DS.getTypeSpecTypeLoc(), diag::ext_opencl_double_without_pragma); - } if (DS.getTypeSpecWidth() == TypeSpecifierWidth::Long) Result = Context.LongDoubleTy; else Result = Context.DoubleTy; + if (S.getLangOpts().OpenCL) { + if (!S.getOpenCLOptions().isSupported("cl_khr_fp64", S.getLangOpts())) + S.Diag(DS.getTypeSpecTypeLoc(), diag::err_opencl_requires_extension) + << 0 << Result + << (S.getLangOpts().OpenCLVersion == 300 + ? "cl_khr_fp64 and __opencl_c_fp64" + : "cl_khr_fp64"); + else if (!S.getOpenCLOptions().isAvailableOption("cl_khr_fp64", S.getLangOpts())) + S.Diag(DS.getTypeSpecTypeLoc(), diag::ext_opencl_double_without_pragma); + } break; case DeclSpec::TST_float128: if (!S.Context.getTargetInfo().hasFloat128Type() && @@ -1724,21 +1726,28 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) { if (S.getLangOpts().OpenCL) { const auto &OpenCLOptions = S.getOpenCLOptions(); - StringRef OptName; + bool IsOpenCLC30 = (S.getLangOpts().OpenCLVersion == 300); // OpenCL C v3.0 s6.3.3 - OpenCL image types require __opencl_c_images - // support + // support. + // OpenCL C v3.0 s6.2.1 - OpenCL 3d image write types requires support + // for OpenCL C 2.0, or OpenCL C 3.0 or newer and the + // __opencl_c_3d_image_writes feature. 
OpenCL C v3.0 API s4.2 - For devices + // that support OpenCL 3.0, cl_khr_3d_image_writes must be returned when and + // only when the optional feature is supported if ((Result->isImageType() || Result->isSamplerT()) && - (S.getLangOpts().OpenCLVersion >= 300 && - !OpenCLOptions.isSupported("__opencl_c_images", S.getLangOpts()))) - OptName = "__opencl_c_images"; - else if (Result->isOCLImage3dWOType() && - !OpenCLOptions.isSupported("cl_khr_3d_image_writes", - S.getLangOpts())) - OptName = "cl_khr_3d_image_writes"; - - if (!OptName.empty()) { + (IsOpenCLC30 && + !OpenCLOptions.isSupported("__opencl_c_images", S.getLangOpts()))) { S.Diag(DS.getTypeSpecTypeLoc(), diag::err_opencl_requires_extension) - << 0 << Result << OptName; + << 0 << Result << "__opencl_c_images"; + declarator.setInvalidType(); + } else if (Result->isOCLImage3dWOType() && + !OpenCLOptions.isSupported("cl_khr_3d_image_writes", + S.getLangOpts())) { + S.Diag(DS.getTypeSpecTypeLoc(), diag::err_opencl_requires_extension) + << 0 << Result + << (IsOpenCLC30 + ? "cl_khr_3d_image_writes and __opencl_c_3d_image_writes" + : "cl_khr_3d_image_writes"); declarator.setInvalidType(); } } diff --git a/libcxx/include/format b/libcxx/include/format index 0ec4b85ca0a5..cfd851aa9a3d 100644 --- a/libcxx/include/format +++ b/libcxx/include/format @@ -55,14 +55,14 @@ namespace std { */ +// Make sure all feature tests macros are always available. +#include +// Only enable the contents of the header when libc++ was build with LIBCXX_ENABLE_INCOMPLETE_FEATURES enabled +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_FORMAT) + #include <__config> #include <__format/format_error.h> #include <__format/format_parse_context.h> -#include - -#if defined(_LIBCPP_HAS_NO_INCOMPLETE_FORMAT) -# error "The Format library is not supported since libc++ has been configured with LIBCXX_ENABLE_INCOMPLETE_FEATURES disabled" -#endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -81,4 +81,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_FORMAT) + #endif // _LIBCPP_FORMAT diff --git a/libcxx/include/ranges b/libcxx/include/ranges index 47f66fd3f622..5feaf4c322fc 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -160,6 +160,11 @@ namespace std::ranges { */ +// Make sure all feature tests macros are always available. +#include +// Only enable the contents of the header when libc++ was build with LIBCXX_ENABLE_INCOMPLETE_FEATURES enabled +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + #include <__config> #include <__ranges/access.h> #include <__ranges/all.h> @@ -181,11 +186,6 @@ namespace std::ranges { #include // Required by the standard. #include // Required by the standard. #include -#include - -#if defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) -# error "The Ranges library is not supported since libc++ has been configured with LIBCXX_ENABLE_INCOMPLETE_FEATURES disabled" -#endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) #pragma GCC system_header @@ -204,4 +204,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) + #endif // _LIBCPP_RANGES diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 9144347045b9..a996a815599a 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -38,6 +38,10 @@ enum ELFKind { ELF64BEKind }; +// For -Bno-symbolic, -Bsymbolic-non-weak-functions, -Bsymbolic-functions, +// -Bsymbolic. +enum class BsymbolicKind { None, NonWeakFunctions, Functions, All }; + // For --build-id. 
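// Accepted spellings are expected to include --build-id=fast, md5, sha1,
// uuid and 0x<hexstring>, with plain --build-id defaulting to the fast
// variant.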
enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; @@ -144,8 +148,7 @@ struct Configuration { bool armHasMovtMovw = false; bool armJ1J2BranchEncoding = false; bool asNeeded = false; - bool bsymbolic = false; - bool bsymbolicFunctions = false; + BsymbolicKind bsymbolic = BsymbolicKind::None; bool callGraphProfileSort; bool checkSections; bool checkDynamicRelocs; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index a15959158653..91e7df21a60a 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1006,12 +1006,15 @@ static void readConfigs(opt::InputArgList &args) { OPT_no_allow_multiple_definition, false) || hasZOption(args, "muldefs"); config->auxiliaryList = args::getStrings(args, OPT_auxiliary); - if (opt::Arg *arg = args.getLastArg(OPT_Bno_symbolic, OPT_Bsymbolic_functions, - OPT_Bsymbolic)) { - if (arg->getOption().matches(OPT_Bsymbolic_functions)) - config->bsymbolicFunctions = true; + if (opt::Arg *arg = + args.getLastArg(OPT_Bno_symbolic, OPT_Bsymbolic_non_weak_functions, + OPT_Bsymbolic_functions, OPT_Bsymbolic)) { + if (arg->getOption().matches(OPT_Bsymbolic_non_weak_functions)) + config->bsymbolic = BsymbolicKind::NonWeakFunctions; + else if (arg->getOption().matches(OPT_Bsymbolic_functions)) + config->bsymbolic = BsymbolicKind::Functions; else if (arg->getOption().matches(OPT_Bsymbolic)) - config->bsymbolic = true; + config->bsymbolic = BsymbolicKind::All; } config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); @@ -1374,7 +1377,8 @@ static void readConfigs(opt::InputArgList &args) { // When producing an executable, --dynamic-list specifies non-local defined // symbols which are required to be exported. When producing a shared object, // symbols not specified by --dynamic-list are non-preemptible. - config->symbolic = config->bsymbolic || args.hasArg(OPT_dynamic_list); + config->symbolic = + config->bsymbolic == BsymbolicKind::All || args.hasArg(OPT_dynamic_list); for (auto *arg : args.filtered(OPT_dynamic_list)) if (Optional buffer = readFile(arg->getValue())) readDynamicList(*buffer); diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index bedcf43bbe85..f0e4c11b79eb 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -43,6 +43,9 @@ def Bsymbolic: F<"Bsymbolic">, HelpText<"Bind default visibility defined symbols def Bsymbolic_functions: F<"Bsymbolic-functions">, HelpText<"Bind default visibility defined function symbols locally for -shared">; +def Bsymbolic_non_weak_functions: F<"Bsymbolic-non-weak-functions">, + HelpText<"Bind default visibility defined STB_GLOBAL function symbols locally for -shared">; + def Bdynamic: F<"Bdynamic">, HelpText<"Link against shared libraries (default)">; def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries">; diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 1039be369d9e..496be33dd182 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -368,8 +368,12 @@ bool elf::computeIsPreemptible(const Symbol &sym) { // If -Bsymbolic or --dynamic-list is specified, or -Bsymbolic-functions is // specified and the symbol is STT_FUNC, the symbol is preemptible iff it is - // in the dynamic list. - if (config->symbolic || (config->bsymbolicFunctions && sym.isFunc())) + // in the dynamic list. -Bsymbolic-non-weak-functions is a non-weak subset of + // -Bsymbolic-functions. 
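  // Illustrative effect under -shared -Bsymbolic-non-weak-functions:
  //   void f() {}                       // STB_GLOBAL func: bound locally
  //   __attribute__((weak)) void g() {} // STB_WEAK func: stays preemptible
  // except that symbols named in a --dynamic-list file remain preemptible.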
+ if (config->symbolic || + (config->bsymbolic == BsymbolicKind::Functions && sym.isFunc()) || + (config->bsymbolic == BsymbolicKind::NonWeakFunctions && sym.isFunc() && + sym.binding != STB_WEAK)) return sym.inDynamicList; return true; } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 3496df1d2814..187b2ac90c21 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1356,7 +1356,7 @@ template void DynamicSection::finalizeContents() { // Set DT_FLAGS and DT_FLAGS_1. uint32_t dtFlags = 0; uint32_t dtFlags1 = 0; - if (config->bsymbolic) + if (config->bsymbolic == BsymbolicKind::All) dtFlags |= DF_SYMBOLIC; if (config->zGlobal) dtFlags1 |= DF_1_GLOBAL; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 40439c995f17..a52ee4348f78 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -30,6 +30,8 @@ ELF Improvements (`D102461 `_) * A new linker script command ``OVERWRITE_SECTIONS`` has been added. (`D103303 `_) +* ``-Bsymbolic-non-weak-functions`` has been added as a ``STB_GLOBAL`` subset of ``-Bsymbolic-functions``. + (`D102570 `_) Breaking changes ---------------- diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index ba3b0779e699..bd67e58daa4d 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -85,6 +85,9 @@ flag. .It Fl Bsymbolic-functions Bind default visibility defined function symbols locally for .Fl shared. +.It Fl Bsymbolic-non-weak-functions +Bind default visibility defined STB_GLOBAL function symbols locally for +.Fl shared. .It Fl -build-id Ns = Ns Ar value Generate a build ID note. .Ar value diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 692dc4d7d4cf..a4955e2a973a 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1396,6 +1396,11 @@ public: return NVT; } + virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, + bool AllowUnknown = false) const { + return getValueType(DL, Ty, AllowUnknown); + } + /// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM /// operations except for the pointer size. If AllowUnknown is true, this /// will return MVT::Other for types with no EVT counterpart (e.g. structs), diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 0e88e705e16b..8bacf687ac76 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -216,6 +216,7 @@ def untyped : ValueType<8, 174>; // Produces an untyped value def funcref : ValueType<0, 175>; // WebAssembly's funcref type def externref : ValueType<0, 176>; // WebAssembly's externref type def x86amx : ValueType<8192, 177>; // X86 AMX value +def i64x8 : ValueType<512, 178>; // 8 Consecutive GPRs (AArch64) def token : ValueType<0, 248>; // TokenTy diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h index 31f2d5a48183..5c73cece85c3 100644 --- a/llvm/include/llvm/Support/MachineValueType.h +++ b/llvm/include/llvm/Support/MachineValueType.h @@ -270,9 +270,10 @@ namespace llvm { funcref = 175, // WebAssembly's funcref type externref = 176, // WebAssembly's externref type x86amx = 177, // This is an X86 AMX value + i64x8 = 178, // 8 Consecutive GPRs (AArch64) FIRST_VALUETYPE = 1, // This is always the beginning of the list. - LAST_VALUETYPE = x86amx, // This always remains at the end of the list. 
+ LAST_VALUETYPE = i64x8, // This always remains at the end of the list. VALUETYPE_SIZE = LAST_VALUETYPE + 1, // This is the current maximum for LAST_VALUETYPE. @@ -987,6 +988,7 @@ namespace llvm { case nxv16f16: case nxv8f32: case nxv4f64: return TypeSize::Scalable(256); + case i64x8: case v512i1: case v64i8: case v32i16: diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 4a8818f2e2a8..c3a609ee4fe1 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -893,9 +893,10 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, FTy.getReturnType()->isIntegerTy(32); case LibFunc_snprintf: - return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() && - FTy.getParamType(2)->isPointerTy() && - FTy.getReturnType()->isIntegerTy(32)); + return NumParams == 3 && FTy.getParamType(0)->isPointerTy() && + IsSizeTTy(FTy.getParamType(1)) && + FTy.getParamType(2)->isPointerTy() && + FTy.getReturnType()->isIntegerTy(32); case LibFunc_snprintf_chk: return NumParams == 5 && FTy.getParamType(0)->isPointerTy() && diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index bb4d41cfd69f..4ae427484945 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -325,7 +325,8 @@ bool InlineAsmLowering::lowerInlineAsm( return false; } - OpInfo.ConstraintVT = TLI->getValueType(DL, OpTy, true).getSimpleVT(); + OpInfo.ConstraintVT = + TLI->getAsmOperandValueType(DL, OpTy, true).getSimpleVT(); } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) { assert(!Call.getType()->isVoidTy() && "Bad inline asm!"); @@ -334,13 +335,17 @@ bool InlineAsmLowering::lowerInlineAsm( TLI->getSimpleValueType(DL, STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); - OpInfo.ConstraintVT = TLI->getSimpleValueType(DL, Call.getType()); + OpInfo.ConstraintVT = + TLI->getAsmOperandValueType(DL, Call.getType()).getSimpleVT(); } ++ResNo; } else { OpInfo.ConstraintVT = MVT::Other; } + if (OpInfo.ConstraintVT == MVT::i64x8) + return false; + // Compute the constraint code and ConstraintType to use. 
computeConstraintToUse(TLI, OpInfo); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b104e995019f..1bba7232eb14 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2439,9 +2439,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { N0.getOperand(0)); // fold (add (add (xor a, -1), b), 1) -> (sub b, a) - if (N0.getOpcode() == ISD::ADD || - N0.getOpcode() == ISD::UADDO || - N0.getOpcode() == ISD::SADDO) { + if (N0.getOpcode() == ISD::ADD) { SDValue A, Xor; if (isBitwiseNot(N0.getOperand(0))) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index d56d4bcc9169..a08548393979 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8176,7 +8176,7 @@ public: } } - return TLI.getValueType(DL, OpTy, true); + return TLI.getAsmOperandValueType(DL, OpTy, true); } }; @@ -8479,8 +8479,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, DAG.getDataLayout(), STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); - OpInfo.ConstraintVT = - TLI.getSimpleValueType(DAG.getDataLayout(), Call.getType()); + OpInfo.ConstraintVT = TLI.getAsmOperandValueType( + DAG.getDataLayout(), Call.getType()).getSimpleVT(); } ++ResNo; } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1c1dae8f953f..5e1786958b6f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4687,7 +4687,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL, getSimpleValueType(DL, STy->getElementType(ResNo)); } else { assert(ResNo == 0 && "Asm only has one result!"); - OpInfo.ConstraintVT = getSimpleValueType(DL, Call.getType()); + OpInfo.ConstraintVT = + getAsmOperandValueType(DL, Call.getType()).getSimpleVT(); } ++ResNo; break; diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 9daebfd9e63d..4876b9e23717 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -167,6 +167,7 @@ std::string EVT::getEVTString() const { case MVT::Glue: return "glue"; case MVT::x86mmx: return "x86mmx"; case MVT::x86amx: return "x86amx"; + case MVT::i64x8: return "i64x8"; case MVT::Metadata: return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::funcref: return "funcref"; @@ -198,6 +199,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::ppcf128: return Type::getPPC_FP128Ty(Context); case MVT::x86mmx: return Type::getX86_MMXTy(Context); case MVT::x86amx: return Type::getX86_AMXTy(Context); + case MVT::i64x8: return IntegerType::get(Context, 512); case MVT::externref: return PointerType::get(StructType::create(Context), 10); case MVT::funcref: diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index adefe3b37ee0..3ab9b250749a 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -653,6 +653,9 @@ bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, case 'x': Reg = getXRegFromWReg(Reg); break; + case 't': + Reg = getXRegFromXRegTuple(Reg); + break; } O << AArch64InstPrinter::getRegisterName(Reg); @@ -749,6 +752,10 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, 
unsigned OpNum, AArch64::GPR64allRegClass.contains(Reg)) return printAsmMRegister(MO, 'x', O); + // If this is an x register tuple, print an x register. + if (AArch64::GPR64x8ClassRegClass.contains(Reg)) + return printAsmMRegister(MO, 't', O); + unsigned AltName = AArch64::NoRegAltName; const TargetRegisterClass *RegClass; if (AArch64::ZPRRegClass.contains(Reg)) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ae702eedcd66..ca6b87a5ebb0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -246,6 +246,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); + if (Subtarget->hasLS64()) { + addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass); + setOperationAction(ISD::LOAD, MVT::i64x8, Custom); + setOperationAction(ISD::STORE, MVT::i64x8, Custom); + } + if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); @@ -2023,6 +2029,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::LASTA) MAKE_CASE(AArch64ISD::LASTB) MAKE_CASE(AArch64ISD::REINTERPRET_CAST) + MAKE_CASE(AArch64ISD::LS64_BUILD) + MAKE_CASE(AArch64ISD::LS64_EXTRACT) MAKE_CASE(AArch64ISD::TBL) MAKE_CASE(AArch64ISD::FADD_PRED) MAKE_CASE(AArch64ISD::FADDA_PRED) @@ -4611,17 +4619,51 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, StoreNode->getMemoryVT(), StoreNode->getMemOperand()); return Result; + } else if (MemVT == MVT::i64x8) { + SDValue Value = StoreNode->getValue(); + assert(Value->getValueType(0) == MVT::i64x8); + SDValue Chain = StoreNode->getChain(); + SDValue Base = StoreNode->getBasePtr(); + EVT PtrVT = Base.getValueType(); + for (unsigned i = 0; i < 8; i++) { + SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, + Value, DAG.getConstant(i, Dl, MVT::i32)); + SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base, + DAG.getConstant(i * 8, Dl, PtrVT)); + Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(), + StoreNode->getOriginalAlign()); + } + return Chain; } return SDValue(); } -// Custom lowering for extending v4i8 vector loads. SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *LoadNode = cast(Op); assert(LoadNode && "Expected custom lowering of a load node"); + + if (LoadNode->getMemoryVT() == MVT::i64x8) { + SmallVector Ops; + SDValue Base = LoadNode->getBasePtr(); + SDValue Chain = LoadNode->getChain(); + EVT PtrVT = Base.getValueType(); + for (unsigned i = 0; i < 8; i++) { + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, + DAG.getConstant(i * 8, DL, PtrVT)); + SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr, + LoadNode->getPointerInfo(), + LoadNode->getOriginalAlign()); + Ops.push_back(Part); + Chain = SDValue(Part.getNode(), 1); + } + SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops); + return DAG.getMergeValues({Loaded, Chain}, DL); + } + + // Custom lowering for extending v4i8 vector loads. 
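  // A sketch of the case handled here:
  //   %v = load <4 x i8>, <4 x i8>* %p
  //   %e = zext <4 x i8> %v to <4 x i16>
  // The 32-bit load is done as a single scalar load and then widened in a
  // vector register rather than split into four separate byte loads.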
EVT VT = Op->getValueType(0); assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); @@ -8179,6 +8221,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'r': if (VT.isScalableVector()) return std::make_pair(0U, nullptr); + if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) + return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass); if (VT.getFixedSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); @@ -8266,6 +8310,15 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( return Res; } +EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, + llvm::Type *Ty, + bool AllowUnknown) const { + if (Subtarget->hasLS64() && Ty->isIntegerTy(512)) + return EVT(MVT::i64x8); + + return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); +} + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void AArch64TargetLowering::LowerAsmOperandForConstraint( diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 386e1c2d8400..2b337255fc27 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -330,6 +330,10 @@ enum NodeType : unsigned { // Cast between vectors of the same element type but differ in length. REINTERPRET_CAST, + // Nodes to build an LD64B / ST64B 64-bit quantity out of i64, and vice versa + LS64_BUILD, + LS64_EXTRACT, + LD1_MERGE_ZERO, LD1S_MERGE_ZERO, LDNF1_MERGE_ZERO, @@ -824,6 +828,9 @@ public: bool isAllActivePredicate(SDValue N) const; EVT getPromotedVTForPredicate(EVT VT) const; + EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, + bool AllowUnknown = false) const override; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 682cec361728..12744e4de09b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8092,6 +8092,20 @@ let AddedComplexity = 10 in { // FIXME: add SVE dot-product patterns. } +// Custom DAG nodes and isel rules to make a 64-byte block out of eight GPRs, +// so that it can be used as input to inline asm, and vice versa. 
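// For illustration, the eventual source-level use is an inline-asm operand
// spanning eight consecutive GPRs (a sketch, assuming an ACLE-style 512-bit
// type such as data512_t):
//   data512_t v = ...;
//   asm volatile("st64b %0, [%1]" : : "r"(v), "r"(dst) : "memory");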
+def LS64_BUILD : SDNode<"AArch64ISD::LS64_BUILD", SDTypeProfile<1, 8, []>>; +def LS64_EXTRACT : SDNode<"AArch64ISD::LS64_EXTRACT", SDTypeProfile<1, 2, []>>; +def : Pat<(i64x8 (LS64_BUILD GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, + GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7)), + (REG_SEQUENCE GPR64x8Class, + $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, + $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7)>; +foreach i = 0-7 in { + def : Pat<(i64 (LS64_EXTRACT (i64x8 GPR64x8:$val), (i32 i))), + (EXTRACT_SUBREG $val, !cast("x8sub_"#i))>; +} + let Predicates = [HasLS64] in { def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), (outs GPR64x8:$Rt)>; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 07dee3ce1fbc..67680e356683 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -732,7 +732,9 @@ def Tuples8X : RegisterTuples< !foreach(i, [0,1,2,3,4,5,6,7], !cast("x8sub_"#i)), !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>; -def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>; +def GPR64x8Class : RegisterClass<"AArch64", [i64x8], 512, (trunc Tuples8X, 12)> { + let Size = 512; +} def GPR64x8AsmOp : AsmOperandClass { let Name = "GPR64x8"; let ParserMethod = "tryParseGPR64x8"; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index ce6866154242..d168c2a84bbe 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -106,6 +106,25 @@ inline static unsigned getXRegFromWReg(unsigned Reg) { return Reg; } +inline static unsigned getXRegFromXRegTuple(unsigned RegTuple) { + switch (RegTuple) { + case AArch64::X0_X1_X2_X3_X4_X5_X6_X7: return AArch64::X0; + case AArch64::X2_X3_X4_X5_X6_X7_X8_X9: return AArch64::X2; + case AArch64::X4_X5_X6_X7_X8_X9_X10_X11: return AArch64::X4; + case AArch64::X6_X7_X8_X9_X10_X11_X12_X13: return AArch64::X6; + case AArch64::X8_X9_X10_X11_X12_X13_X14_X15: return AArch64::X8; + case AArch64::X10_X11_X12_X13_X14_X15_X16_X17: return AArch64::X10; + case AArch64::X12_X13_X14_X15_X16_X17_X18_X19: return AArch64::X12; + case AArch64::X14_X15_X16_X17_X18_X19_X20_X21: return AArch64::X14; + case AArch64::X16_X17_X18_X19_X20_X21_X22_X23: return AArch64::X16; + case AArch64::X18_X19_X20_X21_X22_X23_X24_X25: return AArch64::X18; + case AArch64::X20_X21_X22_X23_X24_X25_X26_X27: return AArch64::X20; + case AArch64::X22_X23_X24_X25_X26_X27_X28_FP: return AArch64::X22; + } + // For anything else, return it unchanged. + return RegTuple; +} + static inline unsigned getBRegFromDReg(unsigned Reg) { switch (Reg) { case AArch64::D0: return AArch64::B0; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d37ed584d9d2..294532011650 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5814,6 +5814,13 @@ static SDValue performANY_EXTENDCombine(SDNode *N, break; } + // Only handle cases where the result is used by a CopyToReg that likely + // means the value is a liveout of the basic block. This helps prevent + // infinite combine loops like PR51206. 
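  // i.e. (any_extend X) is promoted here only when the result is copied out
  // of the block via CopyToReg; purely in-block uses are left alone so this
  // combine cannot keep re-firing against itself.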
+ if (none_of(N->uses(), + [](SDNode *User) { return User->getOpcode() == ISD::CopyToReg; })) + return SDValue(); + SmallVector SetCCs; for (SDNode::use_iterator UI = Src.getNode()->use_begin(), UE = Src.getNode()->use_end(); diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index c77769368ede..66c9d9f0902a 100644 --- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -272,9 +272,10 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, if (PredBB && IsSafeToHoist(RemInst, RemBB) && IsSafeToHoist(DivInst, DivBB) && - llvm::all_of(successors(PredBB), [&](BasicBlock *BB) { - return BB == DivBB || BB == RemBB; - })) { + all_of(successors(PredBB), + [&](BasicBlock *BB) { return BB == DivBB || BB == RemBB; }) && + all_of(predecessors(DivBB), + [&](BasicBlock *BB) { return BB == RemBB || BB == PredBB; })) { DivDominates = true; DivInst->moveBefore(PredBB->getTerminator()); Changed = true; diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index 7311819f77ff..137f99078faf 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -77,6 +77,7 @@ StringRef llvm::getEnumName(MVT::SimpleValueType T) { case MVT::ppcf128: return "MVT::ppcf128"; case MVT::x86mmx: return "MVT::x86mmx"; case MVT::x86amx: return "MVT::x86amx"; + case MVT::i64x8: return "MVT::i64x8"; case MVT::Glue: return "MVT::Glue"; case MVT::isVoid: return "MVT::isVoid"; case MVT::v1i1: return "MVT::v1i1"; From d545c2ce5ad1891282e8818b47ffe557c76a86b4 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 21 Aug 2021 23:27:36 +0200 Subject: [PATCH 3/3] Vendor import of llvm-project branch release/13.x llvmorg-13.0.0-rc1-97-g23ba3732246a. 
--- clang/include/clang/Basic/BuiltinsAArch64.def | 3 + clang/include/clang/Basic/LangOptions.h | 6 + clang/include/clang/Driver/Options.td | 6 +- clang/include/clang/Lex/PreprocessorOptions.h | 3 - clang/include/clang/Sema/Sema.h | 3 +- clang/lib/AST/ASTContext.cpp | 18 +- clang/lib/AST/Expr.cpp | 7 +- clang/lib/Basic/LangOptions.cpp | 8 + clang/lib/CodeGen/CGBuiltin.cpp | 23 + clang/lib/CodeGen/CGDeclCXX.cpp | 18 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 38 +- clang/lib/Driver/ToolChains/Hexagon.cpp | 32 +- clang/lib/Driver/ToolChains/MinGW.cpp | 7 +- clang/lib/Frontend/CompilerInvocation.cpp | 18 +- clang/lib/Headers/intrin.h | 3 + clang/lib/Lex/PPMacroExpansion.cpp | 11 +- clang/lib/Sema/SemaConcept.cpp | 17 +- clang/lib/Sema/SemaDeclCXX.cpp | 5 +- clang/lib/Sema/SemaTemplate.cpp | 9 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 19 +- compiler-rt/include/profile/InstrProfData.inc | 4 +- .../lib/profile/InstrProfilingBuffer.c | 2 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 11 +- .../lib/profile/InstrProfilingPlatformLinux.c | 19 + libcxx/include/__config | 11 + libcxx/include/ctime | 2 +- libcxx/include/ios | 7 + lld/ELF/Config.h | 3 +- lld/ELF/Driver.cpp | 55 +- lld/ELF/LinkerScript.cpp | 36 +- lld/ELF/LinkerScript.h | 2 +- lld/ELF/Relocations.cpp | 7 + lld/ELF/ScriptParser.cpp | 9 +- lld/ELF/SymbolTable.cpp | 118 ++- lld/ELF/SymbolTable.h | 10 +- lld/ELF/Symbols.cpp | 3 + lld/docs/ReleaseNotes.rst | 144 ++- .../Commands/CommandObjectMemoryTag.cpp | 182 +++- lldb/source/Commands/Options.td | 8 + .../GDBRemoteCommunicationServerLLGS.cpp | 28 +- lldb/source/Symbol/TypeSystem.cpp | 90 +- llvm/include/llvm/Analysis/ValueTracking.h | 4 + llvm/include/llvm/IR/Module.h | 3 + llvm/include/llvm/ProfileData/InstrProf.h | 1 + .../llvm/ProfileData/InstrProfData.inc | 4 +- llvm/include/llvm/Transforms/IPO/Attributor.h | 20 + .../llvm/Transforms/Utils/PredicateInfo.h | 6 +- .../Utils/ScalarEvolutionExpander.h | 5 + llvm/lib/Analysis/InstructionSimplify.cpp | 16 + llvm/lib/Analysis/ValueTracking.cpp | 10 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +- .../CodeGen/TargetLoweringObjectFileImpl.cpp | 11 +- llvm/lib/IR/ConstantFold.cpp | 241 ----- llvm/lib/IR/Module.cpp | 4 + llvm/lib/ProfileData/InstrProfReader.cpp | 2 +- .../Target/AArch64/AArch64ISelLowering.cpp | 18 +- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 34 +- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 2 +- llvm/lib/Target/BPF/BPFTargetTransformInfo.h | 18 + .../Target/PowerPC/PPCTargetTransformInfo.cpp | 3 + llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 989 +++++++++++++----- llvm/lib/Target/RISCV/RISCVSchedRocket.td | 3 + llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 3 + llvm/lib/Target/RISCV/RISCVSchedule.td | 1 + llvm/lib/Target/RISCV/RISCVScheduleV.td | 820 +++++++++++++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +- llvm/lib/Target/X86/X86InstrArithmetic.td | 28 +- llvm/lib/Transforms/IPO/Attributor.cpp | 117 ++- .../Transforms/IPO/AttributorAttributes.cpp | 54 +- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 16 +- .../InstCombine/InstCombineCompares.cpp | 123 ++- .../InstCombineLoadStoreAlloca.cpp | 8 +- .../InstCombine/InstCombineSelect.cpp | 3 +- .../Transforms/Scalar/LoopStrengthReduce.cpp | 492 +++++++-- llvm/lib/Transforms/Scalar/SROA.cpp | 5 +- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 46 + .../Utils/ScalarEvolutionExpander.cpp | 5 +- .../Transforms/Vectorize/LoopVectorize.cpp | 46 + llvm/tools/llvm-mca/Views/TimelineView.cpp | 15 +- 
openmp/runtime/src/kmp_taskdeps.cpp | 27 +- openmp/runtime/src/kmp_taskdeps.h | 3 +- openmp/runtime/src/kmp_tasking.cpp | 1 + 83 files changed, 3225 insertions(+), 1009 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVScheduleV.td diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index 1dac5d2371d4..634bcaed20a6 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -243,6 +243,9 @@ TARGET_HEADER_BUILTIN(_ReadStatusReg, "LLii", "nh", "intrin.h", ALL_MS_LANGUAG TARGET_HEADER_BUILTIN(_WriteStatusReg, "viLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", "intrin.h", ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(__mulh, "SLLiSLLiSLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(__umulh, "ULLiULLiULLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") + #undef BUILTIN #undef LANGBUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 71cf0c65e692..b60b94a1ba08 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -354,6 +354,9 @@ public: /// A list of all -fno-builtin-* function names (e.g., memset). std::vector NoBuiltinFuncs; + /// A prefix map for __FILE__, __BASE_FILE__ and __builtin_FILE(). + std::map> MacroPrefixMap; + /// Triples of the OpenMP targets that the host code codegen should /// take into account in order to generate accurate offloading descriptors. std::vector OMPTargetTriples; @@ -460,6 +463,9 @@ public: } bool isSYCL() const { return SYCLIsDevice || SYCLIsHost; } + + /// Remap path prefix according to -fmacro-prefix-path option. + void remapPathPrefix(SmallString<256> &Path) const; }; /// Floating point control options diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index ab1a5487d9c0..a0cbcae0bdc3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2825,10 +2825,10 @@ def fcoverage_prefix_map_EQ HelpText<"remap file source paths in coverage mapping">; def ffile_prefix_map_EQ : Joined<["-"], "ffile-prefix-map=">, Group, - HelpText<"remap file source paths in debug info and predefined preprocessor macros">; + HelpText<"remap file source paths in debug info, predefined preprocessor macros and __builtin_FILE()">; def fmacro_prefix_map_EQ - : Joined<["-"], "fmacro-prefix-map=">, Group, Flags<[CC1Option]>, - HelpText<"remap file source paths in predefined preprocessor macros">; + : Joined<["-"], "fmacro-prefix-map=">, Group, Flags<[CC1Option]>, + HelpText<"remap file source paths in predefined preprocessor macros and __builtin_FILE()">; defm force_dwarf_frame : BoolFOption<"force-dwarf-frame", CodeGenOpts<"ForceDwarfFrameSection">, DefaultFalse, PosFlag, NegFlag>; diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h index 99085b98fc7a..a7aabc3e1df2 100644 --- a/clang/include/clang/Lex/PreprocessorOptions.h +++ b/clang/include/clang/Lex/PreprocessorOptions.h @@ -199,9 +199,6 @@ public: /// build it again. std::shared_ptr FailedModules; - /// A prefix map for __FILE__ and __BASE_FILE__. - std::map> MacroPrefixMap; - /// Contains the currently active skipped range mappings for skipping excluded /// conditional directives. 
/// diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 83a2d132bf6a..d8b2546b81a3 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7828,8 +7828,7 @@ public: TemplateArgumentLoc &Arg, SmallVectorImpl &Converted); - bool CheckTemplateArgument(TemplateTypeParmDecl *Param, - TypeSourceInfo *Arg); + bool CheckTemplateArgument(TypeSourceInfo *Arg); ExprResult CheckTemplateArgument(NonTypeTemplateParmDecl *Param, QualType InstantiatedParamType, Expr *Arg, TemplateArgument &Converted, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index e102a3ba508d..fdba204fbe7f 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6066,9 +6066,11 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { NNS->getAsNamespaceAlias()->getNamespace() ->getOriginalNamespace()); + // The difference between TypeSpec and TypeSpecWithTemplate is that the + // latter will have the 'template' keyword when printed. case NestedNameSpecifier::TypeSpec: case NestedNameSpecifier::TypeSpecWithTemplate: { - QualType T = getCanonicalType(QualType(NNS->getAsType(), 0)); + const Type *T = getCanonicalType(NNS->getAsType()); // If we have some kind of dependent-named type (e.g., "typename T::type"), // break it apart into its prefix and identifier, then reconsititute those @@ -6078,14 +6080,16 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { // typedef typename T::type T1; // typedef typename T1::type T2; if (const auto *DNT = T->getAs()) - return NestedNameSpecifier::Create(*this, DNT->getQualifier(), - const_cast(DNT->getIdentifier())); + return NestedNameSpecifier::Create( + *this, DNT->getQualifier(), + const_cast(DNT->getIdentifier())); + if (const auto *DTST = T->getAs()) + return NestedNameSpecifier::Create(*this, DTST->getQualifier(), true, + const_cast(T)); - // Otherwise, just canonicalize the type, and force it to be a TypeSpec. - // FIXME: Why are TypeSpec and TypeSpecWithTemplate distinct in the - // first place? + // TODO: Set 'Template' parameter to true for other template types. 
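For the TypeSpec/TypeSpecWithTemplate distinction noted above, a minimal illustration (invented names, not from this patch): a nested-name-specifier needs the 'template' keyword exactly when it names a member template of a dependent type, so the keyword cannot be dropped when the canonical form is printed.

    template <typename T> struct Outer {
      template <typename U> struct Inner {};
    };
    // "Outer<T>::template Inner<int>::" is a TypeSpecWithTemplate; without
    // the 'template' keyword the declaration below would not parse.
    template <typename T>
    void use(typename Outer<T>::template Inner<int>) {}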
return NestedNameSpecifier::Create(*this, nullptr, false, - const_cast(T.getTypePtr())); + const_cast(T)); } case NestedNameSpecifier::Global: diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index e8b4aaa2b81e..11f10d4695fc 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -2233,8 +2233,11 @@ APValue SourceLocExpr::EvaluateInContext(const ASTContext &Ctx, }; switch (getIdentKind()) { - case SourceLocExpr::File: - return MakeStringLiteral(PLoc.getFilename()); + case SourceLocExpr::File: { + SmallString<256> Path(PLoc.getFilename()); + Ctx.getLangOpts().remapPathPrefix(Path); + return MakeStringLiteral(Path); + } case SourceLocExpr::Function: { const Decl *CurDecl = dyn_cast_or_null(Context); return MakeStringLiteral( diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp index dc392d5352aa..bebf3178426f 100644 --- a/clang/lib/Basic/LangOptions.cpp +++ b/clang/lib/Basic/LangOptions.cpp @@ -11,6 +11,8 @@ //===----------------------------------------------------------------------===// #include "clang/Basic/LangOptions.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Path.h" using namespace clang; @@ -48,6 +50,12 @@ VersionTuple LangOptions::getOpenCLVersionTuple() const { return VersionTuple(Ver / 100, (Ver % 100) / 10); } +void LangOptions::remapPathPrefix(SmallString<256> &Path) const { + for (const auto &Entry : MacroPrefixMap) + if (llvm::sys::path::replace_path_prefix(Path, Entry.first, Entry.second)) + break; +} + FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) { FPOptions result(LO); return result; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d9b2a5fe16be..1a02965b223e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9732,6 +9732,29 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F); } + if (BuiltinID == AArch64::BI__mulh || BuiltinID == AArch64::BI__umulh) { + llvm::Type *ResType = ConvertType(E->getType()); + llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); + + bool IsSigned = BuiltinID == AArch64::BI__mulh; + Value *LHS = + Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned); + Value *RHS = + Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned); + + Value *MulResult, *HigherBits; + if (IsSigned) { + MulResult = Builder.CreateNSWMul(LHS, RHS); + HigherBits = Builder.CreateAShr(MulResult, 64); + } else { + MulResult = Builder.CreateNUWMul(LHS, RHS); + HigherBits = Builder.CreateLShr(MulResult, 64); + } + HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned); + + return HigherBits; + } + // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. 
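For reference, the new __mulh/__umulh lowering above widens both operands to 128 bits, multiplies, and returns the high half. The same semantics in plain C++, assuming a compiler with the __int128 extension (a sketch, not part of this patch):

    #include <cstdint>

    // High 64 bits of the signed 64x64 -> 128-bit product, like __mulh.
    int64_t mulh_ref(int64_t a, int64_t b) {
      return (int64_t)(((__int128)a * b) >> 64);
    }
    // Unsigned counterpart, like __umulh.
    uint64_t umulh_ref(uint64_t a, uint64_t b) {
      return (uint64_t)(((unsigned __int128)a * b) >> 64);
    }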
if (Optional MsvcIntId = translateAarch64ToMsvcIntrin(BuiltinID)) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index d43fb99550a8..553fedebfe56 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -555,7 +555,8 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, PrioritizedCXXGlobalInits.size()); PrioritizedCXXGlobalInits.push_back(std::make_pair(Key, Fn)); } else if (isTemplateInstantiation(D->getTemplateSpecializationKind()) || - getContext().GetGVALinkageForVariable(D) == GVA_DiscardableODR) { + getContext().GetGVALinkageForVariable(D) == GVA_DiscardableODR || + D->hasAttr()) { // C++ [basic.start.init]p2: // Definitions of explicitly specialized class template static data // members have ordered initialization. Other class template static data @@ -568,17 +569,18 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D, // group with the global being initialized. On most platforms, this is a // minor startup time optimization. In the MS C++ ABI, there are no guard // variables, so this COMDAT key is required for correctness. - AddGlobalCtor(Fn, 65535, COMDATKey); - if (getTarget().getCXXABI().isMicrosoft() && COMDATKey) { - // In The MS C++, MS add template static data member in the linker - // drective. - addUsedGlobal(COMDATKey); - } - } else if (D->hasAttr()) { + // // SelectAny globals will be comdat-folded. Put the initializer into a // COMDAT group associated with the global, so the initializers get folded // too. + AddGlobalCtor(Fn, 65535, COMDATKey); + if (COMDATKey && (getTriple().isOSBinFormatELF() || + getTarget().getCXXABI().isMicrosoft())) { + // When COMDAT is used on ELF or in the MS C++ ABI, the key must be in + // llvm.used to prevent linker GC. + addUsedGlobal(COMDATKey); + } } else { I = DelayedCXXInitPosition.find(D); // Re-do lookup in case of re-hash. if (I == DelayedCXXInitPosition.end()) { diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 9b40b88ea3c9..49a1396b58e3 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -186,7 +186,7 @@ CodeGenModule::CodeGenModule(ASTContext &C, const HeaderSearchOptions &HSO, !getModule().getSourceFileName().empty()) { std::string Path = getModule().getSourceFileName(); // Check if a path substitution is needed from the MacroPrefixMap. 
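The same -fmacro-prefix-map=OLD=NEW entries consulted here also drive the __FILE__ and __builtin_FILE() remapping added elsewhere in this series. A quick way to observe the effect, with made-up paths:

    // Compile as: clang++ -fmacro-prefix-map=/home/user/project/=. demo.cpp
    #include <cstdio>
    int main() {
      std::printf("%s\n", __FILE__);          // remapped before this series too
      std::printf("%s\n", __builtin_FILE());  // remapped as of this series
    }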
- for (const auto &Entry : PPO.MacroPrefixMap) + for (const auto &Entry : LangOpts.MacroPrefixMap) if (Path.rfind(Entry.first, 0) != std::string::npos) { Path = Entry.second + Path.substr(Entry.first.size()); break; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 1870bd81789c..4c8ba8cdcd29 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2637,7 +2637,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath; llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math; - StringRef FPContract = "on"; + StringRef FPContract = ""; bool StrictFPModel = false; @@ -2662,7 +2662,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ReciprocalMath = false; SignedZeros = true; // -fno_fast_math restores default denormal and fpcontract handling - FPContract = "on"; + FPContract = ""; DenormalFPMath = llvm::DenormalMode::getIEEE(); // FIXME: The target may have picked a non-IEEE default mode here based on @@ -2682,18 +2682,20 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // ffp-model= is a Driver option, it is entirely rewritten into more // granular options before being passed into cc1. // Use the gcc option in the switch below. - if (!FPModel.empty() && !FPModel.equals(Val)) + if (!FPModel.empty() && !FPModel.equals(Val)) { D.Diag(clang::diag::warn_drv_overriding_flag_option) << Args.MakeArgString("-ffp-model=" + FPModel) << Args.MakeArgString("-ffp-model=" + Val); + FPContract = ""; + } if (Val.equals("fast")) { optID = options::OPT_ffast_math; FPModel = Val; - FPContract = Val; + FPContract = "fast"; } else if (Val.equals("precise")) { optID = options::OPT_ffp_contract; FPModel = Val; - FPContract = "on"; + FPContract = "fast"; PreciseFPModel = true; } else if (Val.equals("strict")) { StrictFPModel = true; @@ -2779,11 +2781,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, case options::OPT_ffp_contract: { StringRef Val = A->getValue(); if (PreciseFPModel) { - // When -ffp-model=precise is seen on the command line, - // the boolean PreciseFPModel is set to true which indicates - // "the current option is actually PreciseFPModel". The optID - // is changed to OPT_ffp_contract and FPContract is set to "on". - // the argument Val string is "precise": it shouldn't be checked. + // -ffp-model=precise enables ffp-contract=fast as a side effect + // the FPContract value has already been set to a string literal + // and the Val string isn't a pertinent value. ; } else if (Val.equals("fast") || Val.equals("on") || Val.equals("off")) FPContract = Val; @@ -2881,17 +2881,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = DefaultDenormalFPMath; DenormalFP32Math = llvm::DenormalMode::getIEEE(); - FPContract = "on"; + FPContract = ""; break; } if (StrictFPModel) { // If -ffp-model=strict has been specified on command line but // subsequent options conflict then emit warning diagnostic. 
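What FPContract ultimately controls is whether the backend may contract a multiply and an add into one fused operation. An illustrative function showing the affected pattern:

    // With -ffp-contract=fast (what -ffp-model=precise now implies), a*b+c
    // may become a single fma, skipping the intermediate rounding of a*b;
    // with -ffp-contract=off the product must round separately first.
    double madd(double a, double b, double c) { return a * b + c; }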
- if (HonorINFs && HonorNaNs && !AssociativeMath && !ReciprocalMath && - SignedZeros && TrappingMath && RoundingFPMath && - DenormalFPMath == llvm::DenormalMode::getIEEE() && - DenormalFP32Math == llvm::DenormalMode::getIEEE() && - FPContract.equals("off")) + if (HonorINFs && HonorNaNs && + !AssociativeMath && !ReciprocalMath && + SignedZeros && TrappingMath && RoundingFPMath && + (FPContract.equals("off") || FPContract.empty()) && + DenormalFPMath == llvm::DenormalMode::getIEEE() && + DenormalFP32Math == llvm::DenormalMode::getIEEE()) // OK: Current Arg doesn't conflict with -ffp-model=strict ; else { @@ -7690,8 +7691,11 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA, assert(CurTC == nullptr && "Expected one dependence!"); CurTC = TC; }); + UB += C.addTempFile( + C.getArgs().MakeArgString(CurTC->getInputFilename(Inputs[I]))); + } else { + UB += CurTC->getInputFilename(Inputs[I]); } - UB += CurTC->getInputFilename(Inputs[I]); } CmdArgs.push_back(TCArgs.MakeArgString(UB)); diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 828bfdbb05a3..314d0efce441 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -588,21 +588,43 @@ void HexagonToolChain::addClangTargetOptions(const ArgList &DriverArgs, void HexagonToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc) || - DriverArgs.hasArg(options::OPT_nostdlibinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; + const bool IsELF = !getTriple().isMusl() && !getTriple().isOSLinux(); + const bool IsLinuxMusl = getTriple().isMusl() && getTriple().isOSLinux(); + const Driver &D = getDriver(); - if (!D.SysRoot.empty()) { + SmallString<128> ResourceDirInclude(D.ResourceDir); + if (!IsELF) { + llvm::sys::path::append(ResourceDirInclude, "include"); + if (!DriverArgs.hasArg(options::OPT_nobuiltininc) && + (!IsLinuxMusl || DriverArgs.hasArg(options::OPT_nostdlibinc))) + addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude); + } + if (DriverArgs.hasArg(options::OPT_nostdlibinc)) + return; + + const bool HasSysRoot = !D.SysRoot.empty(); + if (HasSysRoot) { SmallString<128> P(D.SysRoot); - if (getTriple().isMusl()) + if (IsLinuxMusl) llvm::sys::path::append(P, "usr/include"); else llvm::sys::path::append(P, "include"); + addExternCSystemInclude(DriverArgs, CC1Args, P.str()); - return; + // LOCAL_INCLUDE_DIR + addSystemInclude(DriverArgs, CC1Args, P + "/usr/local/include"); + // TOOL_INCLUDE_DIR + AddMultilibIncludeArgs(DriverArgs, CC1Args); } + if (!DriverArgs.hasArg(options::OPT_nobuiltininc) && IsLinuxMusl) + addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude); + + if (HasSysRoot) + return; std::string TargetDir = getHexagonTargetDir(D.getInstalledDir(), D.PrefixDirs); addExternCSystemInclude(DriverArgs, CC1Args, TargetDir + "/hexagon/include"); diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 20efbdc237a8..7ba729f36bd8 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -136,10 +136,13 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, llvm_unreachable("Unsupported target architecture."); } - if (Args.hasArg(options::OPT_mwindows)) { + Arg *SubsysArg = + Args.getLastArg(options::OPT_mwindows, options::OPT_mconsole); + if (SubsysArg && SubsysArg->getOption().matches(options::OPT_mwindows)) { 
CmdArgs.push_back("--subsystem"); CmdArgs.push_back("windows"); - } else if (Args.hasArg(options::OPT_mconsole)) { + } else if (SubsysArg && + SubsysArg->getOption().matches(options::OPT_mconsole)) { CmdArgs.push_back("--subsystem"); CmdArgs.push_back("console"); } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 33e5f3e99c45..7025028bc94a 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3528,6 +3528,9 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Args, OPT_fexperimental_relative_cxx_abi_vtables, SA); else GenerateArg(Args, OPT_fno_experimental_relative_cxx_abi_vtables, SA); + + for (const auto &MP : Opts.MacroPrefixMap) + GenerateArg(Args, OPT_fmacro_prefix_map_EQ, MP.first + "=" + MP.second, SA); } bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, @@ -4037,6 +4040,12 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, options::OPT_fno_experimental_relative_cxx_abi_vtables, TargetCXXABI::usesRelativeVTables(T)); + for (const auto &A : Args.getAllArgValues(OPT_fmacro_prefix_map_EQ)) { + auto Split = StringRef(A).split('='); + Opts.MacroPrefixMap.insert( + {std::string(Split.first), std::string(Split.second)}); + } + return Diags.getNumErrors() == NumErrorsBefore; } @@ -4109,9 +4118,6 @@ static void GeneratePreprocessorArgs(PreprocessorOptions &Opts, for (const auto &D : Opts.DeserializedPCHDeclsToErrorOn) GenerateArg(Args, OPT_error_on_deserialized_pch_decl, D, SA); - for (const auto &MP : Opts.MacroPrefixMap) - GenerateArg(Args, OPT_fmacro_prefix_map_EQ, MP.first + "=" + MP.second, SA); - if (Opts.PrecompiledPreambleBytes != std::make_pair(0u, false)) GenerateArg(Args, OPT_preamble_bytes_EQ, Twine(Opts.PrecompiledPreambleBytes.first) + "," + @@ -4180,12 +4186,6 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, for (const auto *A : Args.filtered(OPT_error_on_deserialized_pch_decl)) Opts.DeserializedPCHDeclsToErrorOn.insert(A->getValue()); - for (const auto &A : Args.getAllArgValues(OPT_fmacro_prefix_map_EQ)) { - auto Split = StringRef(A).split('='); - Opts.MacroPrefixMap.insert( - {std::string(Split.first), std::string(Split.second)}); - } - if (const Arg *A = Args.getLastArg(OPT_preamble_bytes_EQ)) { StringRef Value(A->getValue()); size_t Comma = Value.find(','); diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h index ff8eb8fca268..34ec79d6acbc 100644 --- a/clang/lib/Headers/intrin.h +++ b/clang/lib/Headers/intrin.h @@ -574,6 +574,9 @@ void _WriteStatusReg(int, __int64); unsigned short __cdecl _byteswap_ushort(unsigned short val); unsigned long __cdecl _byteswap_ulong (unsigned long val); unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val); + +__int64 __mulh(__int64 __a, __int64 __b); +unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b); #endif /*----------------------------------------------------------------------------*\ diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index 8728ac9e2166..d8ad9d845e7a 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -1453,15 +1453,6 @@ static bool isTargetEnvironment(const TargetInfo &TI, return TI.getTriple().getEnvironment() == Env.getEnvironment(); } -static void remapMacroPath( - SmallString<256> &Path, - const std::map> - &MacroPrefixMap) { - for (const auto &Entry : MacroPrefixMap) - if 
(llvm::sys::path::replace_path_prefix(Path, Entry.first, Entry.second)) - break; -} - /// ExpandBuiltinMacro - If an identifier token is read that is to be expanded /// as a builtin macro, handle it and return the next token as 'Tok'. void Preprocessor::ExpandBuiltinMacro(Token &Tok) { @@ -1543,7 +1534,7 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) { } else { FN += PLoc.getFilename(); } - remapMacroPath(FN, PPOpts->MacroPrefixMap); + getLangOpts().remapPathPrefix(FN); Lexer::Stringify(FN); OS << '"' << FN << '"'; } diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index f2c70d0a56ef..931c9e3e2738 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -742,22 +742,15 @@ Optional NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E) { assert(E.size() != 0); - auto First = fromConstraintExpr(S, D, E[0]); - if (E.size() == 1) - return First; - auto Second = fromConstraintExpr(S, D, E[1]); - if (!Second) + auto Conjunction = fromConstraintExpr(S, D, E[0]); + if (!Conjunction) return None; - llvm::Optional Conjunction; - Conjunction.emplace(S.Context, std::move(*First), std::move(*Second), - CCK_Conjunction); - for (unsigned I = 2; I < E.size(); ++I) { + for (unsigned I = 1; I < E.size(); ++I) { auto Next = fromConstraintExpr(S, D, E[I]); if (!Next) - return llvm::Optional{}; - NormalizedConstraint NewConjunction(S.Context, std::move(*Conjunction), + return None; + *Conjunction = NormalizedConstraint(S.Context, std::move(*Conjunction), std::move(*Next), CCK_Conjunction); - *Conjunction = std::move(NewConjunction); } return Conjunction; } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 83c97626ff7e..da4f4f862095 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -12472,6 +12472,8 @@ bool Sema::CheckUsingDeclRedeclaration(SourceLocation UsingLoc, return false; } + const NestedNameSpecifier *CNNS = + Context.getCanonicalNestedNameSpecifier(Qual); for (LookupResult::iterator I = Prev.begin(), E = Prev.end(); I != E; ++I) { NamedDecl *D = *I; @@ -12497,8 +12499,7 @@ bool Sema::CheckUsingDeclRedeclaration(SourceLocation UsingLoc, // using decls differ if they name different scopes (but note that // template instantiation can cause this check to trigger when it // didn't before instantiation). - if (Context.getCanonicalNestedNameSpecifier(Qual) != - Context.getCanonicalNestedNameSpecifier(DQual)) + if (CNNS != Context.getCanonicalNestedNameSpecifier(DQual)) continue; Diag(NameLoc, diag::err_using_decl_redeclaration) << SS.getRange(); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 175388198324..5d26f2d2c11a 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1079,7 +1079,7 @@ NamedDecl *Sema::ActOnTypeParameter(Scope *S, bool Typename, return Param; // Check the template argument itself. - if (CheckTemplateArgument(Param, DefaultTInfo)) { + if (CheckTemplateArgument(DefaultTInfo)) { Param->setInvalidDecl(); return Param; } @@ -5042,7 +5042,7 @@ bool Sema::CheckTemplateTypeArgument(TemplateTypeParmDecl *Param, } } - if (CheckTemplateArgument(Param, TSI)) + if (CheckTemplateArgument(TSI)) return true; // Add the converted template type argument. 
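The fromConstraintExprs rewrite above replaces the special-cased one- and two-element handling with a single left fold: the conjunction is seeded from E[0] and each remaining constraint expression is folded in. The kind of declaration being normalized, as a C++20 sketch with invented constraints:

    #include <concepts>

    // Three constraint expressions fold left to right into nested
    // conjunctions: ((integral<T> && sizeof(T) >= 4) && !same_as<T, bool>).
    template <typename T>
      requires std::integral<T> && (sizeof(T) >= 4) && (!std::same_as<T, bool>)
    void only_wide_ints(T) {}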
@@ -5661,7 +5661,7 @@ bool Sema::CheckTemplateArgumentList( TemplateArgumentListInfo NewArgs = TemplateArgs; // Make sure we get the template parameter list from the most - // recentdeclaration, since that is the only one that has is guaranteed to + // recent declaration, since that is the only one that is guaranteed to // have all the default template argument information. TemplateParameterList *Params = cast(Template->getMostRecentDecl()) @@ -6208,8 +6208,7 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( /// /// This routine implements the semantics of C++ [temp.arg.type]. It /// returns true if an error occurred, and false otherwise. -bool Sema::CheckTemplateArgument(TemplateTypeParmDecl *Param, - TypeSourceInfo *ArgInfo) { +bool Sema::CheckTemplateArgument(TypeSourceInfo *ArgInfo) { assert(ArgInfo && "invalid TypeSourceInfo"); QualType Arg = ArgInfo->getType(); SourceRange SR = ArgInfo->getTypeLoc().getSourceRange(); diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index f18f77d3442a..74889aa3ca88 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1934,25 +1934,23 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) { return Req; Sema::SFINAETrap Trap(SemaRef); - TemplateDeductionInfo Info(Req->getExpr()->getBeginLoc()); llvm::PointerUnion TransExpr; if (Req->isExprSubstitutionFailure()) TransExpr = Req->getExprSubstitutionDiagnostic(); else { - Sema::InstantiatingTemplate ExprInst(SemaRef, Req->getExpr()->getBeginLoc(), - Req, Info, - Req->getExpr()->getSourceRange()); + Expr *E = Req->getExpr(); + TemplateDeductionInfo Info(E->getBeginLoc()); + Sema::InstantiatingTemplate ExprInst(SemaRef, E->getBeginLoc(), Req, Info, + E->getSourceRange()); if (ExprInst.isInvalid()) return nullptr; - ExprResult TransExprRes = TransformExpr(Req->getExpr()); + ExprResult TransExprRes = TransformExpr(E); if (TransExprRes.isInvalid() || Trap.hasErrorOccurred()) - TransExpr = createSubstDiag(SemaRef, Info, - [&] (llvm::raw_ostream& OS) { - Req->getExpr()->printPretty(OS, nullptr, - SemaRef.getPrintingPolicy()); - }); + TransExpr = createSubstDiag(SemaRef, Info, [&](llvm::raw_ostream &OS) { + E->printPretty(OS, nullptr, SemaRef.getPrintingPolicy()); + }); else TransExpr = TransExprRes.get(); } @@ -1966,6 +1964,7 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) { else if (RetReq.isTypeConstraint()) { TemplateParameterList *OrigTPL = RetReq.getTypeConstraintTemplateParameterList(); + TemplateDeductionInfo Info(OrigTPL->getTemplateLoc()); Sema::InstantiatingTemplate TPLInst(SemaRef, OrigTPL->getTemplateLoc(), Req, Info, OrigTPL->getSourceRange()); if (TPLInst.isInvalid()) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 08a642469627..7d2097cfc297 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -129,6 +129,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::Type::getInt8PtrTy(Ctx), Next, \ #endif INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) 
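With BinaryIdsSize now recorded right after Version, raw-profile readers must skip the binary IDs blob that sits between the header and the data records. A sketch of the reader-side arithmetic, with a deliberately abbreviated header struct (the generated header has more fields; compare the InstrProfilingMerge.c hunks below):

    #include <cstdint>

    // First fields of the version-7 raw profile header, as laid out by the
    // INSTR_PROF_RAW_HEADER entries above (remaining fields elided).
    struct RawHeaderPrefix {
      uint64_t Magic;
      uint64_t Version;
      uint64_t BinaryIdsSize;
      uint64_t DataSize;
    };

    // Data records begin after the full header plus the binary IDs blob.
    const char *dataBegin(const char *Profile, uint64_t FullHeaderSize,
                          const RawHeaderPrefix *H) {
      return Profile + FullHeaderSize + H->BinaryIdsSize;
    }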
@@ -137,7 +138,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) -INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -646,7 +646,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 6 +#define INSTR_PROF_RAW_VERSION 7 /* Indexed profile format version (start from 1). */ #define INSTR_PROF_INDEX_VERSION 7 /* Coverage mapping format version (start from 0). */ diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 21fa7ba1ddd6..68b4f5cd6f52 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -116,7 +116,7 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( DataSize, CountersSize, NamesSize, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, &PaddingBytesAfterNames); - return sizeof(__llvm_profile_header) + + return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + (DataSize * sizeof(__llvm_profile_data)) + PaddingBytesBeforeCounters + (CountersSize * sizeof(uint64_t)) + PaddingBytesAfterCounters + NamesSize + PaddingBytesAfterNames; diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index 913228513259..16ebc2f8b2a9 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -22,6 +22,7 @@ void (*VPMergeHook)(ValueProfData *, __llvm_profile_data *); COMPILER_RT_VISIBILITY uint64_t lprofGetLoadModuleSignature() { /* A very fast way to compute a module signature. */ + uint64_t Version = __llvm_profile_get_version(); uint64_t CounterSize = (uint64_t)(__llvm_profile_end_counters() - __llvm_profile_begin_counters()); uint64_t DataSize = __llvm_profile_get_data_size(__llvm_profile_begin_data(), @@ -33,7 +34,7 @@ uint64_t lprofGetLoadModuleSignature() { const __llvm_profile_data *FirstD = __llvm_profile_begin_data(); return (NamesSize << 40) + (CounterSize << 30) + (DataSize << 20) + - (NumVnodes << 10) + (DataSize > 0 ? FirstD->NameRef : 0); + (NumVnodes << 10) + (DataSize > 0 ? FirstD->NameRef : 0) + Version; } /* Returns 1 if profile is not structurally compatible. 
*/ @@ -44,7 +45,8 @@ int __llvm_profile_check_compatibility(const char *ProfileData, __llvm_profile_header *Header = (__llvm_profile_header *)ProfileData; __llvm_profile_data *SrcDataStart, *SrcDataEnd, *SrcData, *DstData; SrcDataStart = - (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header)); + (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header) + + Header->BinaryIdsSize); SrcDataEnd = SrcDataStart + Header->DataSize; if (ProfileSize < sizeof(__llvm_profile_header)) @@ -63,7 +65,7 @@ int __llvm_profile_check_compatibility(const char *ProfileData, Header->ValueKindLast != IPVK_Last) return 1; - if (ProfileSize < sizeof(__llvm_profile_header) + + if (ProfileSize < sizeof(__llvm_profile_header) + Header->BinaryIdsSize + Header->DataSize * sizeof(__llvm_profile_data) + Header->NamesSize + Header->CountersSize) return 1; @@ -91,7 +93,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, const char *SrcValueProfDataStart, *SrcValueProfData; SrcDataStart = - (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header)); + (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header) + + Header->BinaryIdsSize); SrcDataEnd = SrcDataStart + Header->DataSize; SrcCountersStart = (uint64_t *)SrcDataEnd; SrcNameStart = (const char *)(SrcCountersStart + Header->CountersSize); diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 508624a80cd6..7c15f97aff89 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -17,6 +17,15 @@ #include "InstrProfiling.h" #include "InstrProfilingInternal.h" +#if defined(__FreeBSD__) && !defined(ElfW) +/* + * FreeBSD's elf.h and link.h headers do not define the ElfW(type) macro yet. + * If this is added to all supported FreeBSD versions in the future, this + * compatibility macro can be removed. + */ +#define ElfW(type) __ElfN(type) +#endif + #define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON) #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) @@ -76,6 +85,7 @@ COMPILER_RT_VISIBILITY ValueProfNode *__llvm_profile_end_vnodes(void) { COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = &PROF_VNODES_START; COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = &PROF_VNODES_STOP; +#ifdef NT_GNU_BUILD_ID static size_t RoundUp(size_t size, size_t align) { return (size + align - 1) & ~(align - 1); } @@ -179,5 +189,14 @@ COMPILER_RT_VISIBILITY int __llvm_write_binary_ids(ProfDataWriter *Writer) { return 0; } +#else /* !NT_GNU_BUILD_ID */ +/* + * Fallback implementation for targets that don't support the GNU + * extensions NT_GNU_BUILD_ID and __ehdr_start. + */ +COMPILER_RT_VISIBILITY int __llvm_write_binary_ids(ProfDataWriter *Writer) { + return 0; +} +#endif #endif diff --git a/libcxx/include/__config b/libcxx/include/__config index 3cf23694f878..97e33f3157aa 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -354,6 +354,16 @@ # define _LIBCPP_NO_CFI #endif +// If the compiler supports using_if_exists, pretend we have those functions and they'll +// be picked up if the C library provides them. +// +// TODO: Once we drop support for Clang 12, we can assume the compiler supports using_if_exists +// for platforms that don't have a conforming C11 library, so we can drop this whole thing. 
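Roughly how libc++ consumes the attribute this block tests for, spelled out without the internal wrapper macro (a sketch):

    #include <ctime>

    namespace demo {
    // With a compiler providing using_if_exists (Clang >= 13), this
    // using-declaration is silently dropped when the C library does not
    // declare ::timespec_get, instead of being a hard error.
    using ::timespec_get __attribute__((__using_if_exists__));
    } // namespace demo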
+#if __has_attribute(using_if_exists) +# define _LIBCPP_HAS_TIMESPEC_GET +# define _LIBCPP_HAS_QUICK_EXIT +# define _LIBCPP_HAS_ALIGNED_ALLOC +#else #if (defined(__ISO_C_VISIBLE) && (__ISO_C_VISIBLE >= 2011)) || __cplusplus >= 201103L # if defined(__FreeBSD__) # define _LIBCPP_HAS_ALIGNED_ALLOC @@ -408,6 +418,7 @@ # endif # endif // __APPLE__ #endif +#endif // __has_attribute(using_if_exists) #ifndef _LIBCPP_CXX03_LANG # define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) diff --git a/libcxx/include/ctime b/libcxx/include/ctime index 8b2efd7449ca..2a3fdd12e874 100644 --- a/libcxx/include/ctime +++ b/libcxx/include/ctime @@ -59,7 +59,7 @@ int timespec_get( struct timespec *ts, int base); // C++17 // we're detecting this here instead of in <__config> because we can't include // system headers from <__config>, since it leads to circular module dependencies. // This is also meant to be a very temporary workaround until the SDKs are fixed. -#if defined(__APPLE__) +#if defined(__APPLE__) && !__has_attribute(using_if_exists) # include # if defined(_LIBCPP_HAS_TIMESPEC_GET) && (__DARWIN_C_LEVEL < __DARWIN_C_FULL) # define _LIBCPP_HAS_TIMESPEC_GET_NOT_ACTUALLY_PROVIDED diff --git a/libcxx/include/ios b/libcxx/include/ios index 3128bca89999..c9230d6a9484 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -607,8 +607,15 @@ public: static_assert((is_same<_CharT, typename traits_type::char_type>::value), "traits_type::char_type must be the same type as CharT"); +#ifdef _LIBCPP_CXX03_LANG + // Preserve the ability to compare with literal 0, + // and implicitly convert to bool, but not implicitly convert to int. + _LIBCPP_INLINE_VISIBILITY + operator void*() const {return fail() ? nullptr : (void*)this;} +#else _LIBCPP_INLINE_VISIBILITY explicit operator bool() const {return !fail();} +#endif _LIBCPP_INLINE_VISIBILITY bool operator!() const {return fail();} _LIBCPP_INLINE_VISIBILITY iostate rdstate() const {return ios_base::rdstate();} diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index a996a815599a..e1abb4dfab36 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -86,7 +86,8 @@ struct SymbolVersion { struct VersionDefinition { llvm::StringRef name; uint16_t id; - std::vector patterns; + std::vector nonLocalPatterns; + std::vector localPatterns; }; // This struct contains the global configuration for the linker. diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 91e7df21a60a..594c20016827 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1351,18 +1351,19 @@ static void readConfigs(opt::InputArgList &args) { } assert(config->versionDefinitions.empty()); - config->versionDefinitions.push_back({"local", (uint16_t)VER_NDX_LOCAL, {}}); config->versionDefinitions.push_back( - {"global", (uint16_t)VER_NDX_GLOBAL, {}}); + {"local", (uint16_t)VER_NDX_LOCAL, {}, {}}); + config->versionDefinitions.push_back( + {"global", (uint16_t)VER_NDX_GLOBAL, {}, {}}); // If --retain-symbol-file is used, we'll keep only the symbols listed in // the file and discard all others. 
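The nonLocalPatterns/localPatterns split introduced in Config.h above mirrors the two halves of a version node. An illustrative version script (invented names):

    LIBDEMO_1.0 {
      global:   /* collected into nonLocalPatterns of LIBDEMO_1.0 */
        demo_*;
      local:    /* collected into localPatterns, binding matches to VER_NDX_LOCAL */
        *;
    };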
if (auto *arg = args.getLastArg(OPT_retain_symbols_file)) {
- config->versionDefinitions[VER_NDX_LOCAL].patterns.push_back(
+ config->versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns.push_back(
 {"*", /*isExternCpp=*/false, /*hasWildcard=*/true});
 if (Optional buffer = readFile(arg->getValue()))
 for (StringRef s : args::getLines(*buffer))
- config->versionDefinitions[VER_NDX_GLOBAL].patterns.push_back(
+ config->versionDefinitions[VER_NDX_GLOBAL].nonLocalPatterns.push_back(
 {s, /*isExternCpp=*/false, /*hasWildcard=*/false});
 }
@@ -2069,23 +2070,37 @@ static void redirectSymbols(ArrayRef wrapped) {
 if (suffix1[0] != '@' || suffix1[1] == '@')
 continue;
- // Check whether the default version foo@@v1 exists. If it exists, the
- // symbol can be found by the name "foo" in the symbol table.
- Symbol *maybeDefault = symtab->find(name);
- if (!maybeDefault)
+ // Check the existing symbol foo. We have two special cases to handle:
+ //
+ // * There is a definition of foo@v1 and foo@@v1.
+ // * There is a definition of foo@v1 and foo.
+ Defined *sym2 = dyn_cast_or_null(symtab->find(name));
+ if (!sym2)
 continue;
- const char *suffix2 = maybeDefault->getVersionSuffix();
- if (suffix2[0] != '@' || suffix2[1] != '@' ||
- strcmp(suffix1 + 1, suffix2 + 2) != 0)
- continue;
-
- // foo@v1 and foo@@v1 should be merged, so redirect foo@v1 to foo@@v1.
- map.try_emplace(sym, maybeDefault);
- // If both foo@v1 and foo@@v1 are defined and non-weak, report a duplicate
- // definition error.
- maybeDefault->resolve(*sym);
- // Eliminate foo@v1 from the symbol table.
- sym->symbolKind = Symbol::PlaceholderKind;
+ const char *suffix2 = sym2->getVersionSuffix();
+ if (suffix2[0] == '@' && suffix2[1] == '@' &&
+ strcmp(suffix1 + 1, suffix2 + 2) == 0) {
+ // foo@v1 and foo@@v1 should be merged, so redirect foo@v1 to foo@@v1.
+ map.try_emplace(sym, sym2);
+ // If both foo@v1 and foo@@v1 are defined and non-weak, report a duplicate
+ // definition error.
+ sym2->resolve(*sym);
+ // Eliminate foo@v1 from the symbol table.
+ sym->symbolKind = Symbol::PlaceholderKind;
+ } else if (auto *sym1 = dyn_cast(sym)) {
+ if (sym2->versionId > VER_NDX_GLOBAL
+ ? config->versionDefinitions[sym2->versionId].name == suffix1 + 1
+ : sym1->section == sym2->section && sym1->value == sym2->value) {
+ // Due to an assembler design flaw, if foo is defined, .symver foo,
+ // foo@v1 defines both foo and foo@v1. Unless foo is bound to a
+ // different version, GNU ld makes foo@v1 canonical and eliminates foo.
+ // Emulate its behavior, otherwise we would have foo or foo@@v1 beside
+ // foo@v1. foo@v1 and foo combining does not apply if they are not
+ // defined in the same place.
+ map.try_emplace(sym2, sym);
+ sym2->symbolKind = Symbol::PlaceholderKind;
+ }
+ }
 }
 if (map.empty())
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index a938984ad945..01785f39ed75 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -849,17 +849,8 @@ void LinkerScript::diagnoseOrphanHandling() const {
 }
 uint64_t LinkerScript::advance(uint64_t size, unsigned alignment) {
- bool isTbss =
- (ctx->outSec->flags & SHF_TLS) && ctx->outSec->type == SHT_NOBITS;
- uint64_t start = isTbss ?
dot + ctx->threadBssOffset : dot; - start = alignTo(start, alignment); - uint64_t end = start + size; - - if (isTbss) - ctx->threadBssOffset = end - dot; - else - dot = end; - return end; + dot = alignTo(dot, alignment) + size; + return dot; } void LinkerScript::output(InputSection *s) { @@ -931,13 +922,24 @@ static OutputSection *findFirstSection(PhdrEntry *load) { // This function assigns offsets to input sections and an output section // for a single sections command (e.g. ".text { *(.text); }"). void LinkerScript::assignOffsets(OutputSection *sec) { + const bool isTbss = (sec->flags & SHF_TLS) && sec->type == SHT_NOBITS; const bool sameMemRegion = ctx->memRegion == sec->memRegion; const bool prevLMARegionIsDefault = ctx->lmaRegion == nullptr; const uint64_t savedDot = dot; ctx->memRegion = sec->memRegion; ctx->lmaRegion = sec->lmaRegion; - if (sec->flags & SHF_ALLOC) { + if (!(sec->flags & SHF_ALLOC)) { + // Non-SHF_ALLOC sections have zero addresses. + dot = 0; + } else if (isTbss) { + // Allow consecutive SHF_TLS SHT_NOBITS output sections. The address range + // starts from the end address of the previous tbss section. + if (ctx->tbssAddr == 0) + ctx->tbssAddr = dot; + else + dot = ctx->tbssAddr; + } else { if (ctx->memRegion) dot = ctx->memRegion->curPos; if (sec->addrExpr) @@ -950,9 +952,6 @@ void LinkerScript::assignOffsets(OutputSection *sec) { if (ctx->memRegion && ctx->memRegion->curPos < dot) expandMemoryRegion(ctx->memRegion, dot - ctx->memRegion->curPos, ctx->memRegion->name, sec->name); - } else { - // Non-SHF_ALLOC sections have zero addresses. - dot = 0; } switchTo(sec); @@ -1008,8 +1007,13 @@ void LinkerScript::assignOffsets(OutputSection *sec) { // Non-SHF_ALLOC sections do not affect the addresses of other OutputSections // as they are not part of the process image. - if (!(sec->flags & SHF_ALLOC)) + if (!(sec->flags & SHF_ALLOC)) { dot = savedDot; + } else if (isTbss) { + // NOBITS TLS sections are similar. Additionally save the end address. + ctx->tbssAddr = dot; + dot = savedDot; + } } static bool isDiscardable(OutputSection &sec) { diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 0592c52acb84..d2487ae0f9d2 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -247,11 +247,11 @@ class LinkerScript final { // not be used outside of the scope of a call to the above functions. struct AddressState { AddressState(); - uint64_t threadBssOffset = 0; OutputSection *outSec = nullptr; MemoryRegion *memRegion = nullptr; MemoryRegion *lmaRegion = nullptr; uint64_t lmaOffset = 0; + uint64_t tbssAddr = 0; }; llvm::DenseMap nameToOutputSection; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index e3cc210972b2..537859f9e0b5 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -527,6 +527,13 @@ static SmallSet getSymbolsAt(SharedSymbol &ss) { if (auto *alias = dyn_cast_or_null(sym)) ret.insert(alias); } + + // The loop does not check SHT_GNU_verneed, so ret does not contain + // non-default version symbols. If ss has a non-default version, ret won't + // contain ss. Just add ss unconditionally. If a non-default version alias is + // separately copy relocated, it and ss will have different addresses. + // Fortunately this case is impractical and fails with GNU ld as well. 
+ ret.insert(&ss); return ret; } diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 2c980eb810c7..1c743fd47747 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -1496,9 +1496,9 @@ void ScriptParser::readAnonymousDeclaration() { std::vector globals; std::tie(locals, globals) = readSymbols(); for (const SymbolVersion &pat : locals) - config->versionDefinitions[VER_NDX_LOCAL].patterns.push_back(pat); + config->versionDefinitions[VER_NDX_LOCAL].localPatterns.push_back(pat); for (const SymbolVersion &pat : globals) - config->versionDefinitions[VER_NDX_GLOBAL].patterns.push_back(pat); + config->versionDefinitions[VER_NDX_GLOBAL].nonLocalPatterns.push_back(pat); expect(";"); } @@ -1510,13 +1510,12 @@ void ScriptParser::readVersionDeclaration(StringRef verStr) { std::vector locals; std::vector globals; std::tie(locals, globals) = readSymbols(); - for (const SymbolVersion &pat : locals) - config->versionDefinitions[VER_NDX_LOCAL].patterns.push_back(pat); // Create a new version definition and add that to the global symbols. VersionDefinition ver; ver.name = verStr; - ver.patterns = globals; + ver.nonLocalPatterns = std::move(globals); + ver.localPatterns = std::move(locals); ver.id = config->versionDefinitions.size(); config->versionDefinitions.push_back(ver); diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index 70aea288c53f..22e6b4f92898 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -134,9 +134,20 @@ static bool canBeVersioned(const Symbol &sym) { StringMap> &SymbolTable::getDemangledSyms() { if (!demangledSyms) { demangledSyms.emplace(); + std::string demangled; for (Symbol *sym : symVector) - if (canBeVersioned(*sym)) - (*demangledSyms)[demangleItanium(sym->getName())].push_back(sym); + if (canBeVersioned(*sym)) { + StringRef name = sym->getName(); + size_t pos = name.find('@'); + if (pos == std::string::npos) + demangled = demangleItanium(name); + else if (pos + 1 == name.size() || name[pos + 1] == '@') + demangled = demangleItanium(name.substr(0, pos)); + else + demangled = + (demangleItanium(name.substr(0, pos)) + name.substr(pos)).str(); + (*demangledSyms)[demangled].push_back(sym); + } } return *demangledSyms; } @@ -150,19 +161,29 @@ std::vector SymbolTable::findByVersion(SymbolVersion ver) { return {}; } -std::vector SymbolTable::findAllByVersion(SymbolVersion ver) { +std::vector SymbolTable::findAllByVersion(SymbolVersion ver, + bool includeNonDefault) { std::vector res; SingleStringMatcher m(ver.name); + auto check = [&](StringRef name) { + size_t pos = name.find('@'); + if (!includeNonDefault) + return pos == StringRef::npos; + return !(pos + 1 < name.size() && name[pos + 1] == '@'); + }; if (ver.isExternCpp) { for (auto &p : getDemangledSyms()) if (m.match(p.first())) - res.insert(res.end(), p.second.begin(), p.second.end()); + for (Symbol *sym : p.second) + if (check(sym->getName())) + res.push_back(sym); return res; } for (Symbol *sym : symVector) - if (canBeVersioned(*sym) && m.match(sym->getName())) + if (canBeVersioned(*sym) && check(sym->getName()) && + m.match(sym->getName())) res.push_back(sym); return res; } @@ -172,7 +193,7 @@ void SymbolTable::handleDynamicList() { for (SymbolVersion &ver : config->dynamicList) { std::vector syms; if (ver.hasWildcard) - syms = findAllByVersion(ver); + syms = findAllByVersion(ver, /*includeNonDefault=*/true); else syms = findByVersion(ver); @@ -181,21 +202,13 @@ void SymbolTable::handleDynamicList() { } } -// Set symbol versions to symbols. 
This function handles patterns -// containing no wildcard characters. -void SymbolTable::assignExactVersion(SymbolVersion ver, uint16_t versionId, - StringRef versionName) { - if (ver.hasWildcard) - return; - +// Set symbol versions to symbols. This function handles patterns containing no +// wildcard characters. Return false if no symbol definition matches ver. +bool SymbolTable::assignExactVersion(SymbolVersion ver, uint16_t versionId, + StringRef versionName, + bool includeNonDefault) { // Get a list of symbols which we need to assign the version to. std::vector syms = findByVersion(ver); - if (syms.empty()) { - if (!config->undefinedVersion) - error("version script assignment of '" + versionName + "' to symbol '" + - ver.name + "' failed: symbol not defined"); - return; - } auto getName = [](uint16_t ver) -> std::string { if (ver == VER_NDX_LOCAL) @@ -207,10 +220,11 @@ void SymbolTable::assignExactVersion(SymbolVersion ver, uint16_t versionId, // Assign the version. for (Symbol *sym : syms) { - // Skip symbols containing version info because symbol versions - // specified by symbol names take precedence over version scripts. - // See parseSymbolVersion(). - if (sym->getName().contains('@')) + // For a non-local versionId, skip symbols containing version info because + // symbol versions specified by symbol names take precedence over version + // scripts. See parseSymbolVersion(). + if (!includeNonDefault && versionId != VER_NDX_LOCAL && + sym->getName().contains('@')) continue; // If the version has not been assigned, verdefIndex is -1. Use an arbitrary @@ -225,13 +239,15 @@ void SymbolTable::assignExactVersion(SymbolVersion ver, uint16_t versionId, warn("attempt to reassign symbol '" + ver.name + "' of " + getName(sym->versionId) + " to " + getName(versionId)); } + return !syms.empty(); } -void SymbolTable::assignWildcardVersion(SymbolVersion ver, uint16_t versionId) { +void SymbolTable::assignWildcardVersion(SymbolVersion ver, uint16_t versionId, + bool includeNonDefault) { // Exact matching takes precedence over fuzzy matching, // so we set a version to a symbol only if no version has been assigned // to the symbol. This behavior is compatible with GNU. - for (Symbol *sym : findAllByVersion(ver)) + for (Symbol *sym : findAllByVersion(ver, includeNonDefault)) if (sym->verdefIndex == UINT32_C(-1)) { sym->verdefIndex = 0; sym->versionId = versionId; @@ -244,26 +260,60 @@ void SymbolTable::assignWildcardVersion(SymbolVersion ver, uint16_t versionId) { // script file, the script does not actually define any symbol version, // but just specifies symbols visibilities. void SymbolTable::scanVersionScript() { + SmallString<128> buf; // First, we assign versions to exact matching symbols, // i.e. version definitions not containing any glob meta-characters. 
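In the scanVersionScript changes just below, every exact pattern is looked up twice: once as written and once as "name@version", so explicitly versioned definitions are covered too. The key is built with LLVM's Twine into a reused buffer; the idiom in isolation (an illustrative helper, not patch code):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/Twine.h"

    // Twine defers the concatenation; toStringRef materializes it into the
    // caller's buffer, so one SmallString can serve many lookups without
    // per-pattern heap allocation.
    llvm::StringRef makeVersionedKey(llvm::StringRef Name, llvm::StringRef Ver,
                                     llvm::SmallString<128> &Buf) {
      Buf.clear();
      return (Name + "@" + Ver).toStringRef(Buf);
    }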
- for (VersionDefinition &v : config->versionDefinitions) - for (SymbolVersion &pat : v.patterns) - assignExactVersion(pat, v.id, v.name); + std::vector syms; + for (VersionDefinition &v : config->versionDefinitions) { + auto assignExact = [&](SymbolVersion pat, uint16_t id, StringRef ver) { + bool found = + assignExactVersion(pat, id, ver, /*includeNonDefault=*/false); + buf.clear(); + found |= assignExactVersion({(pat.name + "@" + v.name).toStringRef(buf), + pat.isExternCpp, /*hasWildCard=*/false}, + id, ver, /*includeNonDefault=*/true); + if (!found && !config->undefinedVersion) + errorOrWarn("version script assignment of '" + ver + "' to symbol '" + + pat.name + "' failed: symbol not defined"); + }; + for (SymbolVersion &pat : v.nonLocalPatterns) + if (!pat.hasWildcard) + assignExact(pat, v.id, v.name); + for (SymbolVersion pat : v.localPatterns) + if (!pat.hasWildcard) + assignExact(pat, VER_NDX_LOCAL, "local"); + } // Next, assign versions to wildcards that are not "*". Note that because the // last match takes precedence over previous matches, we iterate over the // definitions in the reverse order. - for (VersionDefinition &v : llvm::reverse(config->versionDefinitions)) - for (SymbolVersion &pat : v.patterns) + auto assignWildcard = [&](SymbolVersion pat, uint16_t id, StringRef ver) { + assignWildcardVersion(pat, id, /*includeNonDefault=*/false); + buf.clear(); + assignWildcardVersion({(pat.name + "@" + ver).toStringRef(buf), + pat.isExternCpp, /*hasWildCard=*/true}, + id, + /*includeNonDefault=*/true); + }; + for (VersionDefinition &v : llvm::reverse(config->versionDefinitions)) { + for (SymbolVersion &pat : v.nonLocalPatterns) if (pat.hasWildcard && pat.name != "*") - assignWildcardVersion(pat, v.id); + assignWildcard(pat, v.id, v.name); + for (SymbolVersion &pat : v.localPatterns) + if (pat.hasWildcard && pat.name != "*") + assignWildcard(pat, VER_NDX_LOCAL, v.name); + } // Then, assign versions to "*". In GNU linkers they have lower priority than // other wildcards. - for (VersionDefinition &v : config->versionDefinitions) - for (SymbolVersion &pat : v.patterns) + for (VersionDefinition &v : config->versionDefinitions) { + for (SymbolVersion &pat : v.nonLocalPatterns) if (pat.hasWildcard && pat.name == "*") - assignWildcardVersion(pat, v.id); + assignWildcard(pat, v.id, v.name); + for (SymbolVersion &pat : v.localPatterns) + if (pat.hasWildcard && pat.name == "*") + assignWildcard(pat, VER_NDX_LOCAL, v.name); + } // Symbol themselves might know their versions because symbols // can contain versions in the form of @. diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h index 507af8d2be75..54c4b1169ed1 100644 --- a/lld/ELF/SymbolTable.h +++ b/lld/ELF/SymbolTable.h @@ -65,12 +65,14 @@ public: private: std::vector findByVersion(SymbolVersion ver); - std::vector findAllByVersion(SymbolVersion ver); + std::vector findAllByVersion(SymbolVersion ver, + bool includeNonDefault); llvm::StringMap> &getDemangledSyms(); - void assignExactVersion(SymbolVersion ver, uint16_t versionId, - StringRef versionName); - void assignWildcardVersion(SymbolVersion ver, uint16_t versionId); + bool assignExactVersion(SymbolVersion ver, uint16_t versionId, + StringRef versionName, bool includeNonDefault); + void assignWildcardVersion(SymbolVersion ver, uint16_t versionId, + bool includeNonDefault); // The order the global symbols are in is not defined. We can use an arbitrary // order, but it has to be reproducible. That is true even when cross linking. 
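Much of the symbol-table logic above distinguishes default (foo@@v2) from non-default (foo@v1) versioned definitions. For reference, the usual GNU idiom that produces such symbols from C++ (illustrative, not part of this patch):

    // foo_old is exported as the non-default foo@v1, foo_new as the default
    // foo@@v2. Note the assembler quirk described in the Driver.cpp comment
    // earlier: ".symver foo, foo@v1" applied to a defined symbol foo creates
    // both foo and foo@v1.
    extern "C" int foo_old() { return 1; }
    extern "C" int foo_new() { return 2; }
    __asm__(".symver foo_old, foo@v1");
    __asm__(".symver foo_new, foo@@v2");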
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 496be33dd182..cef303f05f89 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -208,6 +208,9 @@ OutputSection *Symbol::getOutputSection() const {
 // If a symbol name contains '@', the characters after that is
 // a symbol version name. This function parses that.
 void Symbol::parseSymbolVersion() {
+ // Return if localized by a local: pattern in a version script.
+ if (versionId == VER_NDX_LOCAL)
+ return;
 StringRef s = getName();
 size_t pos = s.find('@');
 if (pos == 0 || pos == StringRef::npos)
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index a52ee4348f78..50af6e7d7939 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -24,6 +24,13 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
+* ``-z start-stop-gc`` is now supported and becomes the default.
+ (`D96914 `_)
+ (`rG6d2d3bd0 `_)
+* ``--shuffle-sections=<seed>`` has been changed to ``--shuffle-sections=<section-glob>=<seed>``.
+ If seed is -1, the matched input sections are reversed.
+ (`D98445 `_)
+ (`D98679 `_)
 * ``-Bsymbolic -Bsymbolic-functions`` has been changed to behave the same as
 ``-Bsymbolic-functions``. This matches GNU ld. (`D102461 `_)
 * ``-Bno-symbolic`` has been added.
@@ -32,6 +39,75 @@ ELF Improvements
 (`D103303 `_)
 * ``-Bsymbolic-non-weak-functions`` has been added as a ``STB_GLOBAL`` subset
 of ``-Bsymbolic-functions``. (`D102570 `_)
+* ``--no-allow-shlib-undefined`` has been improved to catch more cases.
+ (`D101996 `_)
+* ``__rela_iplt_start`` is no longer defined for -pie/-shared.
+ This makes GCC/Clang ``-static-pie`` built executables work.
+ (`rG8cb78e99 `_)
+* IRELATIVE/TLSDESC relocations now support ``-z rel``.
+ (`D100544 `_)
+* Section groups with a zero flag are now supported.
+ This is used by ``comdat nodeduplicate`` in LLVM IR.
+ (`D96636 `_)
+ (`D106228 `_)
+* Defined symbols are now resolved before undefined symbols to stabilize the behavior of archive member extraction.
+ (`D95985 `_)
+* ``STB_WEAK`` symbols are now preferred over COMMON symbols as a fix to a ``--fortran-common`` regression.
+ (`D105945 `_)
+* Absolute relocations referencing undef weak now produce dynamic relocations for -pie, matching GOT-generating relocations.
+ (`D105164 `_)
+* Exported symbols are now communicated to the LTO library so as to make LTO
+ based whole program devirtualization (``-flto=thin -fwhole-program-vtables``)
+ work with shared objects.
+ (`D91583 `_)
+* Whole program devirtualization now respects ``local:`` version nodes in a version script.
+ (`D98220 `_)
+ (`D98686 `_)
+* ``local:`` version nodes in a version script now apply to non-default version symbols.
+ (`D107234 `_)
+* If an object file defines both ``foo`` and ``foo@v1``, now only ``foo@v1`` will be in the output.
+ (`D107235 `_)
+* Copy relocations on non-default version symbols are now supported.
+ (`D107535 `_)
+
+Linker script changes:
+
+* ``.``, ``$``, and double quotes can now be used in symbol names in expressions.
+ (`D98306 `_)
+ (`rGe7a7ad13 `_)
+* Fixed value of ``.`` in the output section description of ``.tbss``.
+ (`D107288 `_)
+* ``NOLOAD`` sections can now be placed in a ``PT_LOAD`` program header.
+ (`D103815 `_)
+* ``OUTPUT_FORMAT(default, big, little)`` now consults ``-EL`` and ``-EB``.
+ (`D96214 `_)
+* The ``OVERWRITE_SECTIONS`` command has been added.
+ (`D103303 `_)
+* The section order within an ``INSERT AFTER`` command is now preserved.
+  (`D105158 `_)
+
+Architecture specific changes:
+
+* aarch64_be is now supported.
+  (`D96188 `_)
+* The AMDGPU port now supports ``--amdhsa-code-object-version=4`` object files.
+  (`D95811 `_)
+* The ARM port now accounts for PC biases in range extension thunk creation.
+  (`D97550 `_)
+* The AVR port now computes ``e_flags``.
+  (`D99754 `_)
+* The Mips port now omits unneeded dynamic relocations for PIE non-preemptible
+  TLS.
+  (`D101382 `_)
+* The PowerPC port now supports ``--power10-stubs=no`` to omit Power10
+  instructions from call stubs.
+  (`D94625 `_)
+* Fixed a thunk creation bug in the PowerPC port when TOC/NOTOC calls are
+  mixed.
+  (`D101837 `_)
+* The RISC-V port now resolves undefined weak relocations to the current
+  location if not using PLT.
+  (`D103001 `_)
+* ``R_386_GOTOFF`` relocations from .debug_info are now allowed, for
+  compatibility with GCC.
+  (`D95994 `_)
+* ``gotEntrySize`` has been added to improve support for the ILP32 ABI of
+  x86-64.
+  (`D102569 `_)

 Breaking changes
 ----------------
@@ -42,17 +118,75 @@ Breaking changes
 COFF Improvements
 -----------------

-* ...
+* Avoid thread exhaustion when running on 32-bit Windows.
+  (`D105506 `_)
+
+* Improve terminating the process on Windows while a thread pool might be
+  running. (`D102944 `_)

 MinGW Improvements
 ------------------

-* ...
+* Support for linking directly against a DLL without using an import library
+  has been added. (`D104530 `_ and
+  `D104531 `_)

-MachO Improvements
-------------------
+* Fix linking with ``--export-all-symbols`` in combination with
+  ``-function-sections``. (`D101522 `_ and
+  `D101615 `_)

-* Item 1.
+* Fix automatic export of symbols from LTO objects.
+  (`D101569 `_)
+
+* Accept more spellings of some options.
+  (`D107237 `_ and
+  `D107253 `_)
+
+Mach-O Improvements
+-------------------
+
+The Mach-O backend is now able to link several large, real-world programs,
+though we are still working out the kinks.
+
+* arm64 is now supported as a target. (`D88629 `_)
+* arm64_32 is now supported as a target. (`D99822 `_)
+* Branch-range-extension thunks are now supported. (`D100818 `_)
+* ``-dead_strip`` is now supported. (`D103324 `_)
+* Support for identical code folding (``--icf=all``) has been added.
+  (`D103292 `_)
+* Support for special ``$start`` and ``$end`` symbols for segments & sections
+  has been added. (`D106767 `_, `D106629 `_)
+* ``$ld$previous`` symbols are now supported. (`D103505 `_)
+* ``$ld$install_name`` symbols are now supported. (`D103746 `_)
+* ``__mh_*_header`` symbols are now supported. (`D97007 `_)
+* LC_CODE_SIGNATURE is now supported. (`D96164 `_)
+* LC_FUNCTION_STARTS is now supported. (`D97260 `_)
+* LC_DATA_IN_CODE is now supported. (`D103006 `_)
+* Bind opcodes are more compactly encoded. (`D106128 `_,
+  `D105075 `_)
+* LTO cache support has been added. (`D105922 `_)
+* ``-application_extension`` is now supported. (`D105818 `_)
+* ``-export_dynamic`` is now partially supported. (`D105482 `_)
+* ``-arch_multiple`` is now supported. (`D105450 `_)
+* ``-final_output`` is now supported. (`D105449 `_)
+* ``-umbrella`` is now supported. (`D105448 `_)
+* ``--print-dylib-search`` is now supported. (`D103985 `_)
+* ``-force_load_swift_libs`` is now supported. (`D103709 `_)
+* ``-reexport_framework``, ``-reexport_library``, ``-reexport-l`` are now
+  supported. (`D103497 `_)
+* ``.weak_def_can_be_hidden`` is now supported. (`D101080 `_)
+* ``-add_ast_path`` is now supported. (`D100076 `_)
+* ``-segprot`` is now supported.
+  (`D99389 `_)
+* ``-dependency_info`` is now partially supported. (`D98559 `_)
+* ``--time-trace`` is now supported. (`D98419 `_)
+* ``-mark_dead_strippable_dylib`` is now supported. (`D98262 `_)
+* ``-[un]exported_symbol[s_list]`` is now supported. (`D98223 `_)
+* ``-flat_namespace`` is now supported. (`D97641 `_)
+* ``-rename_section`` and ``-rename_segment`` are now supported.
+  (`D97600 `_)
+* ``-bundle_loader`` is now supported. (`D95913 `_)
+* ``-map`` is now partially supported. (`D98323 `_)
+
+There were numerous other bug-fixes as well.

 WebAssembly Improvements
 ------------------------
diff --git a/lldb/source/Commands/CommandObjectMemoryTag.cpp b/lldb/source/Commands/CommandObjectMemoryTag.cpp
index 1dfb32a92f3b..840f81719d7d 100644
--- a/lldb/source/Commands/CommandObjectMemoryTag.cpp
+++ b/lldb/source/Commands/CommandObjectMemoryTag.cpp
@@ -7,8 +7,11 @@
 //===----------------------------------------------------------------------===//

 #include "CommandObjectMemoryTag.h"
+#include "lldb/Host/OptionParser.h"
 #include "lldb/Interpreter/CommandReturnObject.h"
 #include "lldb/Interpreter/OptionArgParser.h"
+#include "lldb/Interpreter/OptionGroupFormat.h"
+#include "lldb/Interpreter/OptionValueString.h"
 #include "lldb/Target/Process.h"

 using namespace lldb;
@@ -21,7 +24,8 @@ class CommandObjectMemoryTagRead : public CommandObjectParsed {
 public:
   CommandObjectMemoryTagRead(CommandInterpreter &interpreter)
       : CommandObjectParsed(interpreter, "tag",
-                            "Read memory tags for the given range of memory.",
+                            "Read memory tags for the given range of memory."
+                            " Mismatched tags will be marked.",
                             nullptr,
                             eCommandRequiresTarget | eCommandRequiresProcess |
                                 eCommandProcessMustBePaused) {
@@ -97,16 +101,17 @@ protected:
       return false;
     }

-    result.AppendMessageWithFormatv("Logical tag: {0:x}",
-                                    tag_manager->GetLogicalTag(start_addr));
+    lldb::addr_t logical_tag = tag_manager->GetLogicalTag(start_addr);
+    result.AppendMessageWithFormatv("Logical tag: {0:x}", logical_tag);
     result.AppendMessage("Allocation tags:");

     addr_t addr = tagged_range->GetRangeBase();
     for (auto tag : *tags) {
       addr_t next_addr = addr + tag_manager->GetGranuleSize();
       // Showing tagged addresses here until we have non-address bit handling
-      result.AppendMessageWithFormatv("[{0:x}, {1:x}): {2:x}", addr, next_addr,
-                                      tag);
+      result.AppendMessageWithFormatv("[{0:x}, {1:x}): {2:x}{3}", addr,
+                                      next_addr, tag,
+                                      logical_tag == tag ?
"" : " (mismatch)"); addr = next_addr; } @@ -115,6 +120,168 @@ protected: } }; +#define LLDB_OPTIONS_memory_tag_write +#include "CommandOptions.inc" + +class CommandObjectMemoryTagWrite : public CommandObjectParsed { +public: + class OptionGroupTagWrite : public OptionGroup { + public: + OptionGroupTagWrite() : OptionGroup(), m_end_addr(LLDB_INVALID_ADDRESS) {} + + ~OptionGroupTagWrite() override = default; + + llvm::ArrayRef GetDefinitions() override { + return llvm::makeArrayRef(g_memory_tag_write_options); + } + + Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_value, + ExecutionContext *execution_context) override { + Status status; + const int short_option = + g_memory_tag_write_options[option_idx].short_option; + + switch (short_option) { + case 'e': + m_end_addr = OptionArgParser::ToAddress(execution_context, option_value, + LLDB_INVALID_ADDRESS, &status); + break; + default: + llvm_unreachable("Unimplemented option"); + } + + return status; + } + + void OptionParsingStarting(ExecutionContext *execution_context) override { + m_end_addr = LLDB_INVALID_ADDRESS; + } + + lldb::addr_t m_end_addr; + }; + + CommandObjectMemoryTagWrite(CommandInterpreter &interpreter) + : CommandObjectParsed(interpreter, "tag", + "Write memory tags starting from the granule that " + "contains the given address.", + nullptr, + eCommandRequiresTarget | eCommandRequiresProcess | + eCommandProcessMustBePaused), + m_option_group(), m_tag_write_options() { + // Address + m_arguments.push_back( + CommandArgumentEntry{CommandArgumentData(eArgTypeAddressOrExpression)}); + // One or more tag values + m_arguments.push_back(CommandArgumentEntry{ + CommandArgumentData(eArgTypeValue, eArgRepeatPlus)}); + + m_option_group.Append(&m_tag_write_options); + m_option_group.Finalize(); + } + + ~CommandObjectMemoryTagWrite() override = default; + + Options *GetOptions() override { return &m_option_group; } + +protected: + bool DoExecute(Args &command, CommandReturnObject &result) override { + if (command.GetArgumentCount() < 2) { + result.AppendError("wrong number of arguments; expected " + " [ [...]]"); + return false; + } + + Status error; + addr_t start_addr = OptionArgParser::ToAddress( + &m_exe_ctx, command[0].ref(), LLDB_INVALID_ADDRESS, &error); + if (start_addr == LLDB_INVALID_ADDRESS) { + result.AppendErrorWithFormatv("Invalid address expression, {0}", + error.AsCString()); + return false; + } + + command.Shift(); // shift off start address + + std::vector tags; + for (auto &entry : command) { + lldb::addr_t tag_value; + // getAsInteger returns true on failure + if (entry.ref().getAsInteger(0, tag_value)) { + result.AppendErrorWithFormat( + "'%s' is not a valid unsigned decimal string value.\n", + entry.c_str()); + return false; + } + tags.push_back(tag_value); + } + + Process *process = m_exe_ctx.GetProcessPtr(); + llvm::Expected tag_manager_or_err = + process->GetMemoryTagManager(); + + if (!tag_manager_or_err) { + result.SetError(Status(tag_manager_or_err.takeError())); + return false; + } + + const MemoryTagManager *tag_manager = *tag_manager_or_err; + + MemoryRegionInfos memory_regions; + // If this fails the list of regions is cleared, so we don't need to read + // the return status here. + process->GetMemoryRegions(memory_regions); + + // We have to assume start_addr is not granule aligned. + // So if we simply made a range: + // (start_addr, start_addr + (N * granule_size)) + // We would end up with a range that isn't N granules but N+1 + // granules. 
To avoid this we'll align the start first using the method that + // doesn't check memory attributes. (if the final range is untagged we'll + // handle that error later) + lldb::addr_t aligned_start_addr = + tag_manager->ExpandToGranule(MemoryTagManager::TagRange(start_addr, 1)) + .GetRangeBase(); + + lldb::addr_t end_addr = 0; + // When you have an end address you want to align the range like tag read + // does. Meaning, align the start down (which we've done) and align the end + // up. + if (m_tag_write_options.m_end_addr != LLDB_INVALID_ADDRESS) + end_addr = m_tag_write_options.m_end_addr; + else + // Without an end address assume number of tags matches number of granules + // to write to + end_addr = + aligned_start_addr + (tags.size() * tag_manager->GetGranuleSize()); + + // Now we've aligned the start address so if we ask for another range + // using the number of tags N, we'll get back a range that is also N + // granules in size. + llvm::Expected tagged_range = + tag_manager->MakeTaggedRange(aligned_start_addr, end_addr, + memory_regions); + + if (!tagged_range) { + result.SetError(Status(tagged_range.takeError())); + return false; + } + + Status status = process->WriteMemoryTags(tagged_range->GetRangeBase(), + tagged_range->GetByteSize(), tags); + + if (status.Fail()) { + result.SetError(status); + return false; + } + + result.SetStatus(eReturnStatusSuccessFinishResult); + return true; + } + + OptionGroupOptions m_option_group; + OptionGroupTagWrite m_tag_write_options; +}; + CommandObjectMemoryTag::CommandObjectMemoryTag(CommandInterpreter &interpreter) : CommandObjectMultiword( interpreter, "tag", "Commands for manipulating memory tags", @@ -123,6 +290,11 @@ CommandObjectMemoryTag::CommandObjectMemoryTag(CommandInterpreter &interpreter) new CommandObjectMemoryTagRead(interpreter)); read_command_object->SetCommandName("memory tag read"); LoadSubCommand("read", read_command_object); + + CommandObjectSP write_command_object( + new CommandObjectMemoryTagWrite(interpreter)); + write_command_object->SetCommandName("memory tag write"); + LoadSubCommand("write", write_command_object); } CommandObjectMemoryTag::~CommandObjectMemoryTag() = default; diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td index 9c9b7c6e9b82..6abb4788bed0 100644 --- a/lldb/source/Commands/Options.td +++ b/lldb/source/Commands/Options.td @@ -504,6 +504,14 @@ let Command = "memory write" in { Desc<"Start writing bytes from an offset within the input file.">; } +let Command = "memory tag write" in { + def memory_write_end_addr : Option<"end-addr", "e">, Group<1>, + Arg<"AddressOrExpression">, Desc< + "Set tags for start address to end-addr, repeating tags as needed" + " to cover the range. 
(instead of calculating the range from the" + " number of tags given)">; +} + let Command = "register read" in { def register_read_alternate : Option<"alternate", "A">, Desc<"Display register names using the alternate register name if there " diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 5e69b5793f9f..8e1f6bc29a6f 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -3474,15 +3474,31 @@ GDBRemoteCommunicationServerLLGS::Handle_qMemTags( if (packet.GetBytesLeft() < 1 || packet.GetChar() != ':') return SendIllFormedResponse(packet, invalid_type_err); - int32_t type = - packet.GetS32(std::numeric_limits::max(), /*base=*/16); - if (type == std::numeric_limits::max() || + // Type is a signed integer but packed into the packet as its raw bytes. + // However, our GetU64 uses strtoull which allows +/-. We do not want this. + const char *first_type_char = packet.Peek(); + if (first_type_char && (*first_type_char == '+' || *first_type_char == '-')) + return SendIllFormedResponse(packet, invalid_type_err); + + // Extract type as unsigned then cast to signed. + // Using a uint64_t here so that we have some value outside of the 32 bit + // range to use as the invalid return value. + uint64_t raw_type = + packet.GetU64(std::numeric_limits::max(), /*base=*/16); + + if ( // Make sure the cast below would be valid + raw_type > std::numeric_limits::max() || // To catch inputs like "123aardvark" that will parse but clearly aren't // valid in this case. packet.GetBytesLeft()) { return SendIllFormedResponse(packet, invalid_type_err); } + // First narrow to 32 bits otherwise the copy into type would take + // the wrong 4 bytes on big endian. + uint32_t raw_type_32 = raw_type; + int32_t type = reinterpret_cast(raw_type_32); + StreamGDBRemote response; std::vector tags; Status error = m_current_process->ReadMemoryTags(type, addr, length, tags); @@ -3552,7 +3568,11 @@ GDBRemoteCommunicationServerLLGS::Handle_QMemTags( packet.GetU64(std::numeric_limits::max(), /*base=*/16); if (raw_type > std::numeric_limits::max()) return SendIllFormedResponse(packet, invalid_type_err); - int32_t type = static_cast(raw_type); + + // First narrow to 32 bits. Otherwise the copy below would get the wrong + // 4 bytes on big endian. 
+ uint32_t raw_type_32 = raw_type; + int32_t type = reinterpret_cast(raw_type_32); // Tag data if (packet.GetBytesLeft() < 1 || packet.GetChar() != ':') diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index 252b06e269d6..0b3f7e4f3bd4 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -223,62 +223,32 @@ void TypeSystemMap::ForEach(std::function const &callback) { llvm::Expected TypeSystemMap::GetTypeSystemForLanguage( lldb::LanguageType language, llvm::Optional create_callback) { - llvm::Error error = llvm::Error::success(); - assert(!error); // Check the success value when assertions are enabled std::lock_guard guard(m_mutex); - if (m_clear_in_progress) { - error = llvm::make_error( + if (m_clear_in_progress) + return llvm::make_error( "Unable to get TypeSystem because TypeSystemMap is being cleared", llvm::inconvertibleErrorCode()); - } else { - collection::iterator pos = m_map.find(language); - if (pos != m_map.end()) { - auto *type_system = pos->second.get(); - if (type_system) { - llvm::consumeError(std::move(error)); - return *type_system; - } - error = llvm::make_error( - "TypeSystem for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)) + - " doesn't exist", - llvm::inconvertibleErrorCode()); - return std::move(error); - } - for (const auto &pair : m_map) { - if (pair.second && pair.second->SupportsLanguage(language)) { - // Add a new mapping for "language" to point to an already existing - // TypeSystem that supports this language - m_map[language] = pair.second; - if (pair.second.get()) { - llvm::consumeError(std::move(error)); - return *pair.second.get(); - } - error = llvm::make_error( - "TypeSystem for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)) + - " doesn't exist", - llvm::inconvertibleErrorCode()); - return std::move(error); - } - } + collection::iterator pos = m_map.find(language); + if (pos != m_map.end()) { + auto *type_system = pos->second.get(); + if (type_system) + return *type_system; + return llvm::make_error( + "TypeSystem for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)) + + " doesn't exist", + llvm::inconvertibleErrorCode()); + } - if (!create_callback) { - error = llvm::make_error( - "Unable to find type system for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)), - llvm::inconvertibleErrorCode()); - } else { - // Cache even if we get a shared pointer that contains a null type system - // back - TypeSystemSP type_system_sp = (*create_callback)(); - m_map[language] = type_system_sp; - if (type_system_sp.get()) { - llvm::consumeError(std::move(error)); - return *type_system_sp.get(); - } - error = llvm::make_error( + for (const auto &pair : m_map) { + if (pair.second && pair.second->SupportsLanguage(language)) { + // Add a new mapping for "language" to point to an already existing + // TypeSystem that supports this language + m_map[language] = pair.second; + if (pair.second.get()) + return *pair.second.get(); + return llvm::make_error( "TypeSystem for language " + llvm::StringRef(Language::GetNameForLanguageType(language)) + " doesn't exist", @@ -286,7 +256,23 @@ llvm::Expected TypeSystemMap::GetTypeSystemForLanguage( } } - return std::move(error); + if (!create_callback) + return llvm::make_error( + "Unable to find type system for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)), + llvm::inconvertibleErrorCode()); + + // Cache even if we get a shared pointer 
that contains a null type system + // back + TypeSystemSP type_system_sp = (*create_callback)(); + m_map[language] = type_system_sp; + if (type_system_sp.get()) + return *type_system_sp.get(); + return llvm::make_error( + "TypeSystem for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)) + + " doesn't exist", + llvm::inconvertibleErrorCode()); } llvm::Expected diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 90ec742f18e6..f46e66641c08 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -744,6 +744,10 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; /// minimum/maximum flavor. CmpInst::Predicate getInverseMinMaxPred(SelectPatternFlavor SPF); + /// Return the minimum or maximum constant value for the specified integer + /// min/max flavor and type. + APInt getMinMaxLimit(SelectPatternFlavor SPF, unsigned BitWidth); + /// Check if the values in \p VL are select instructions that can be converted /// to a min or max (vector) intrinsic. Returns the intrinsic ID, if such a /// conversion is possible, together with a bool indicating whether all select diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 81e29d9b86e8..97aea5aedf22 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -324,6 +324,9 @@ public: /// name is not found. GlobalValue *getNamedValue(StringRef Name) const; + /// Return the number of global values in the module. + unsigned getNumNamedValues() const; + /// Return a unique non-zero ID for the specified metadata kind. This ID is /// uniqued across modules in the current LLVMContext. unsigned getMDKindID(StringRef Name) const; diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 08a934e6985f..c0cedb23bdcf 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -1104,6 +1104,7 @@ namespace RawInstrProf { // Version 5: Bit 60 of FuncHash is reserved for the flag for the context // sensitive records. // Version 6: Added binary id. +// Version 7: Reorder binary id and include version in signature. 
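[Editorial sketch, not part of the patch.] The effect of the version 7 reordering, applied by the InstrProfData.inc hunk below, is that BinaryIdsSize moves from the tail of the raw header to the slot directly after Version. A hypothetical flattened view of the resulting layout, listing only fields visible in this patch (the real header is generated from the INSTR_PROF_RAW_HEADER macro list, not declared as a struct):

#include <cstdint>

// Illustrative only: field order of the version 7 raw profile header.
struct RawProfHeaderV7 {
  uint64_t Magic;
  uint64_t Version;       // readers check GET_VERSION(Version) == 7 first
  uint64_t BinaryIdsSize; // moved up from the end of the header, so it can
                          // be read as soon as the version is validated
  uint64_t DataSize;
  uint64_t PaddingBytesBeforeCounters;
  uint64_t CountersSize;
  uint64_t PaddingBytesAfterCounters;
  uint64_t NamesSize;
  uint64_t CountersDelta;
  uint64_t NamesDelta;
  uint64_t ValueKindLast;
};

This matches the reader change further below, which now swaps in Header.BinaryIdsSize immediately after the version check rather than after ValueKindLast.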
const uint64_t Version = INSTR_PROF_RAW_VERSION; template inline uint64_t getMagic(); diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 08a642469627..7d2097cfc297 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -129,6 +129,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::Type::getInt8PtrTy(Ctx), Next, \ #endif INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) @@ -137,7 +138,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, NamesSize, NamesSize) INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) -INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -646,7 +646,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 6 +#define INSTR_PROF_RAW_VERSION 7 /* Indexed profile format version (start from 1). */ #define INSTR_PROF_INDEX_VERSION 7 /* Coverage mapping format version (start from 0). */ diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index c93b8adcc890..c3c12fd23746 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1855,6 +1855,10 @@ public: /// static void createShallowWrapper(Function &F); + /// Returns true if the function \p F can be internalized. i.e. it has a + /// compatible linkage. + static bool isInternalizable(Function &F); + /// Make another copy of the function \p F such that the copied version has /// internal linkage afterwards and can be analysed. Then we replace all uses /// of the original function to the copied one @@ -1870,6 +1874,22 @@ public: /// null pointer. static Function *internalizeFunction(Function &F, bool Force = false); + /// Make copies of each function in the set \p FnSet such that the copied + /// version has internal linkage afterwards and can be analysed. Then we + /// replace all uses of the original function to the copied one. The map + /// \p FnMap contains a mapping of functions to their internalized versions. + /// + /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr` + /// linkage can be internalized because these linkages guarantee that other + /// definitions with the same name have the same semantics as this one. + /// + /// This version will internalize all the functions in the set \p FnSet at + /// once and then replace the uses. This prevents internalized functions being + /// called by external functions when there is an internalized version in the + /// module. + static bool internalizeFunctions(SmallPtrSetImpl &FnSet, + DenseMap &FnMap); + /// Return the data layout associated with the anchor scope. 
const DataLayout &getDataLayout() const { return InfoCache.DL; } diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h index c4030735d965..c922476ac79d 100644 --- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h +++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h @@ -51,11 +51,13 @@ #define LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" namespace llvm { @@ -176,7 +178,7 @@ public: class PredicateInfo { public: PredicateInfo(Function &, DominatorTree &, AssumptionCache &); - ~PredicateInfo() = default; + ~PredicateInfo(); void verifyPredicateInfo() const; @@ -203,6 +205,8 @@ private: // the Predicate Info, they belong to the ValueInfo structs in the ValueInfos // vector. DenseMap PredicateMap; + // The set of ssa_copy declarations we created with our custom mangling. + SmallSet, 20> CreatedDeclarations; }; // This pass does eager building and then printing of PredicateInfo. It is used diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 8662dbf385dc..59bf3a342caa 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -83,6 +83,9 @@ class SCEVExpander : public SCEVVisitor { /// InsertedValues/InsertedPostIncValues. SmallPtrSet ReusedValues; + // The induction variables generated. + SmallVector InsertedIVs; + /// A memoization of the "relevant" loop for a given SCEV. DenseMap RelevantLoops; @@ -199,9 +202,11 @@ public: InsertedPostIncValues.clear(); ReusedValues.clear(); ChainedPhis.clear(); + InsertedIVs.clear(); } ScalarEvolution *getSE() { return &SE; } + const SmallVectorImpl &getInsertedIVs() const { return InsertedIVs; } /// Return a vector containing all instructions inserted during expansion. SmallVector getAllInsertedInstructions() const { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 23083bc8178e..69ab0052b0a7 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4080,6 +4080,22 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, std::swap(TrueVal, FalseVal); } + // Check for integer min/max with a limit constant: + // X > MIN_INT ? X : MIN_INT --> X + // X < MAX_INT ? 
X : MAX_INT --> X + if (TrueVal->getType()->isIntOrIntVectorTy()) { + Value *X, *Y; + SelectPatternFlavor SPF = + matchDecomposedSelectPattern(cast(CondVal), TrueVal, FalseVal, + X, Y).Flavor; + if (SelectPatternResult::isMinOrMax(SPF) && Pred == getMinMaxPred(SPF)) { + APInt LimitC = getMinMaxLimit(getInverseMinMaxFlavor(SPF), + X->getType()->getScalarSizeInBits()); + if (match(Y, m_SpecificInt(LimitC))) + return X; + } + } + if (Pred == ICmpInst::ICMP_EQ && match(CmpRHS, m_Zero())) { Value *X; const APInt *Y; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 522d21812c6a..6e3ca5c4e08a 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6253,6 +6253,16 @@ CmpInst::Predicate llvm::getInverseMinMaxPred(SelectPatternFlavor SPF) { return getMinMaxPred(getInverseMinMaxFlavor(SPF)); } +APInt llvm::getMinMaxLimit(SelectPatternFlavor SPF, unsigned BitWidth) { + switch (SPF) { + case SPF_SMAX: return APInt::getSignedMaxValue(BitWidth); + case SPF_SMIN: return APInt::getSignedMinValue(BitWidth); + case SPF_UMAX: return APInt::getMaxValue(BitWidth); + case SPF_UMIN: return APInt::getMinValue(BitWidth); + default: llvm_unreachable("Unexpected flavor"); + } +} + std::pair llvm::canConvertToMinOrMaxIntrinsic(ArrayRef VL) { // Check if VL contains select instructions that can be folded into a min/max diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1bba7232eb14..4f730b2cf372 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20560,8 +20560,12 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { // otherwise => (extract_subvec V1, ExtIdx) uint64_t InsIdx = V.getConstantOperandVal(2); if (InsIdx * SmallVT.getScalarSizeInBits() == - ExtIdx * NVT.getScalarSizeInBits()) + ExtIdx * NVT.getScalarSizeInBits()) { + if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT)) + return SDValue(); + return DAG.getBitcast(NVT, V.getOperand(1)); + } return DAG.getNode( ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index add34eccc1f3..de096f95afcb 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -677,8 +677,9 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, } if (Retain) { - if (Ctx.getAsmInfo()->useIntegratedAssembler() || - Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) + if ((Ctx.getAsmInfo()->useIntegratedAssembler() || + Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) && + !TM.getTargetTriple().isOSSolaris()) Flags |= ELF::SHF_GNU_RETAIN; return NextUniqueID++; } @@ -855,8 +856,10 @@ static MCSection *selectELFSectionForGlobal( EmitUniqueSection = true; Flags |= ELF::SHF_LINK_ORDER; } - if (Retain && (Ctx.getAsmInfo()->useIntegratedAssembler() || - Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36))) { + if (Retain && + (Ctx.getAsmInfo()->useIntegratedAssembler() || + Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) && + !TM.getTargetTriple().isOSSolaris()) { EmitUniqueSection = true; Flags |= ELF::SHF_GNU_RETAIN; } diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 5f05aa2e94e7..e1e28d1230b0 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -349,200 +349,6 @@ static Constant 
*ExtractConstantBytes(Constant *C, unsigned ByteStart, } } -/// Wrapper around getFoldedSizeOfImpl() that adds caching. -static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded, - DenseMap &Cache); - -/// Return a ConstantExpr with type DestTy for sizeof on Ty, with any known -/// factors factored out. If Folded is false, return null if no factoring was -/// possible, to avoid endlessly bouncing an unfoldable expression back into the -/// top-level folder. -static Constant *getFoldedSizeOfImpl(Type *Ty, Type *DestTy, bool Folded, - DenseMap &Cache) { - // This is the actual implementation of getFoldedSizeOf(). To get the caching - // behavior, we need to call getFoldedSizeOf() when we recurse. - - if (ArrayType *ATy = dyn_cast(Ty)) { - Constant *N = ConstantInt::get(DestTy, ATy->getNumElements()); - Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true, Cache); - return ConstantExpr::getNUWMul(E, N); - } - - if (StructType *STy = dyn_cast(Ty)) - if (!STy->isPacked()) { - unsigned NumElems = STy->getNumElements(); - // An empty struct has size zero. - if (NumElems == 0) - return ConstantExpr::getNullValue(DestTy); - // Check for a struct with all members having the same size. - Constant *MemberSize = - getFoldedSizeOf(STy->getElementType(0), DestTy, true, Cache); - bool AllSame = true; - for (unsigned i = 1; i != NumElems; ++i) - if (MemberSize != - getFoldedSizeOf(STy->getElementType(i), DestTy, true, Cache)) { - AllSame = false; - break; - } - if (AllSame) { - Constant *N = ConstantInt::get(DestTy, NumElems); - return ConstantExpr::getNUWMul(MemberSize, N); - } - } - - // Pointer size doesn't depend on the pointee type, so canonicalize them - // to an arbitrary pointee. - if (PointerType *PTy = dyn_cast(Ty)) - if (!PTy->getElementType()->isIntegerTy(1)) - return getFoldedSizeOf( - PointerType::get(IntegerType::get(PTy->getContext(), 1), - PTy->getAddressSpace()), - DestTy, true, Cache); - - // If there's no interesting folding happening, bail so that we don't create - // a constant that looks like it needs folding but really doesn't. - if (!Folded) - return nullptr; - - // Base case: Get a regular sizeof expression. - Constant *C = ConstantExpr::getSizeOf(Ty); - C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - DestTy, false), - C, DestTy); - return C; -} - -static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded, - DenseMap &Cache) { - // Check for previously generated folded size constant. - auto It = Cache.find(Ty); - if (It != Cache.end()) - return It->second; - return Cache[Ty] = getFoldedSizeOfImpl(Ty, DestTy, Folded, Cache); -} - -static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded) { - DenseMap Cache; - return getFoldedSizeOf(Ty, DestTy, Folded, Cache); -} - -/// Return a ConstantExpr with type DestTy for alignof on Ty, with any known -/// factors factored out. If Folded is false, return null if no factoring was -/// possible, to avoid endlessly bouncing an unfoldable expression back into the -/// top-level folder. -static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy, bool Folded) { - // The alignment of an array is equal to the alignment of the - // array element. Note that this is not always true for vectors. 
- if (ArrayType *ATy = dyn_cast(Ty)) { - Constant *C = ConstantExpr::getAlignOf(ATy->getElementType()); - C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - DestTy, - false), - C, DestTy); - return C; - } - - if (StructType *STy = dyn_cast(Ty)) { - // Packed structs always have an alignment of 1. - if (STy->isPacked()) - return ConstantInt::get(DestTy, 1); - - // Otherwise, struct alignment is the maximum alignment of any member. - // Without target data, we can't compare much, but we can check to see - // if all the members have the same alignment. - unsigned NumElems = STy->getNumElements(); - // An empty struct has minimal alignment. - if (NumElems == 0) - return ConstantInt::get(DestTy, 1); - // Check for a struct with all members having the same alignment. - Constant *MemberAlign = - getFoldedAlignOf(STy->getElementType(0), DestTy, true); - bool AllSame = true; - for (unsigned i = 1; i != NumElems; ++i) - if (MemberAlign != getFoldedAlignOf(STy->getElementType(i), DestTy, true)) { - AllSame = false; - break; - } - if (AllSame) - return MemberAlign; - } - - // Pointer alignment doesn't depend on the pointee type, so canonicalize them - // to an arbitrary pointee. - if (PointerType *PTy = dyn_cast(Ty)) - if (!PTy->getElementType()->isIntegerTy(1)) - return - getFoldedAlignOf(PointerType::get(IntegerType::get(PTy->getContext(), - 1), - PTy->getAddressSpace()), - DestTy, true); - - // If there's no interesting folding happening, bail so that we don't create - // a constant that looks like it needs folding but really doesn't. - if (!Folded) - return nullptr; - - // Base case: Get a regular alignof expression. - Constant *C = ConstantExpr::getAlignOf(Ty); - C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - DestTy, false), - C, DestTy); - return C; -} - -/// Return a ConstantExpr with type DestTy for offsetof on Ty and FieldNo, with -/// any known factors factored out. If Folded is false, return null if no -/// factoring was possible, to avoid endlessly bouncing an unfoldable expression -/// back into the top-level folder. -static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, Type *DestTy, - bool Folded) { - if (ArrayType *ATy = dyn_cast(Ty)) { - Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false, - DestTy, false), - FieldNo, DestTy); - Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true); - return ConstantExpr::getNUWMul(E, N); - } - - if (StructType *STy = dyn_cast(Ty)) - if (!STy->isPacked()) { - unsigned NumElems = STy->getNumElements(); - // An empty struct has no members. - if (NumElems == 0) - return nullptr; - // Check for a struct with all members having the same size. - Constant *MemberSize = - getFoldedSizeOf(STy->getElementType(0), DestTy, true); - bool AllSame = true; - for (unsigned i = 1; i != NumElems; ++i) - if (MemberSize != - getFoldedSizeOf(STy->getElementType(i), DestTy, true)) { - AllSame = false; - break; - } - if (AllSame) { - Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, - false, - DestTy, - false), - FieldNo, DestTy); - return ConstantExpr::getNUWMul(MemberSize, N); - } - } - - // If there's no interesting folding happening, bail so that we don't create - // a constant that looks like it needs folding but really doesn't. - if (!Folded) - return nullptr; - - // Base case: Get a regular offsetof expression. 
- Constant *C = ConstantExpr::getOffsetOf(Ty, FieldNo); - C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, - DestTy, false), - C, DestTy); - return C; -} - Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, Type *DestTy) { if (isa(V)) @@ -666,53 +472,6 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, // Is it a null pointer value? if (V->isNullValue()) return ConstantInt::get(DestTy, 0); - // If this is a sizeof-like expression, pull out multiplications by - // known factors to expose them to subsequent folding. If it's an - // alignof-like expression, factor out known factors. - if (ConstantExpr *CE = dyn_cast(V)) - if (CE->getOpcode() == Instruction::GetElementPtr && - CE->getOperand(0)->isNullValue()) { - // FIXME: Looks like getFoldedSizeOf(), getFoldedOffsetOf() and - // getFoldedAlignOf() don't handle the case when DestTy is a vector of - // pointers yet. We end up in asserts in CastInst::getCastOpcode (see - // test/Analysis/ConstantFolding/cast-vector.ll). I've only seen this - // happen in one "real" C-code test case, so it does not seem to be an - // important optimization to handle vectors here. For now, simply bail - // out. - if (DestTy->isVectorTy()) - return nullptr; - GEPOperator *GEPO = cast(CE); - Type *Ty = GEPO->getSourceElementType(); - if (CE->getNumOperands() == 2) { - // Handle a sizeof-like expression. - Constant *Idx = CE->getOperand(1); - bool isOne = isa(Idx) && cast(Idx)->isOne(); - if (Constant *C = getFoldedSizeOf(Ty, DestTy, !isOne)) { - Idx = ConstantExpr::getCast(CastInst::getCastOpcode(Idx, true, - DestTy, false), - Idx, DestTy); - return ConstantExpr::getMul(C, Idx); - } - } else if (CE->getNumOperands() == 3 && - CE->getOperand(1)->isNullValue()) { - // Handle an alignof-like expression. - if (StructType *STy = dyn_cast(Ty)) - if (!STy->isPacked()) { - ConstantInt *CI = cast(CE->getOperand(2)); - if (CI->isOne() && - STy->getNumElements() == 2 && - STy->getElementType(0)->isIntegerTy(1)) { - return getFoldedAlignOf(STy->getElementType(1), DestTy, false); - } - } - // Handle an offsetof-like expression. - if (Ty->isStructTy() || Ty->isArrayTy()) { - if (Constant *C = getFoldedOffsetOf(Ty, CE->getOperand(2), - DestTy, false)) - return C; - } - } - } // Other pointer types cannot be casted return nullptr; case Instruction::UIToFP: diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 7c18dc0ed299..63ea41fba89a 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -114,6 +114,10 @@ GlobalValue *Module::getNamedValue(StringRef Name) const { return cast_or_null(getValueSymbolTable().lookup(Name)); } +unsigned Module::getNumNamedValues() const { + return getValueSymbolTable().size(); +} + /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. /// This ID is uniqued across modules in the current LLVMContext. 
unsigned Module::getMDKindID(StringRef Name) const { diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 8a4470ae207d..a0460062f307 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,6 +366,7 @@ Error RawInstrProfReader::readHeader( if (GET_VERSION(Version) != RawInstrProf::Version) return error(instrprof_error::unsupported_version); + BinaryIdsSize = swap(Header.BinaryIdsSize); CountersDelta = swap(Header.CountersDelta); NamesDelta = swap(Header.NamesDelta); auto DataSize = swap(Header.DataSize); @@ -374,7 +375,6 @@ Error RawInstrProfReader::readHeader( auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters); NamesSize = swap(Header.NamesSize); ValueKindLast = swap(Header.ValueKindLast); - BinaryIdsSize = swap(Header.BinaryIdsSize); auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData); auto PaddingSize = getNumPaddingBytes(NamesSize); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ca6b87a5ebb0..b27a02b8c182 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4353,8 +4353,13 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, if (IsFixedLength) { assert(Subtarget->useSVEForFixedLengthVectors() && "Cannot lower when not using SVE for fixed vectors"); - IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); - MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); + if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { + IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); + MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); + } else { + MemVT = getContainerForFixedLengthVector(DAG, MemVT); + IndexVT = MemVT.changeTypeToInteger(); + } InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); Mask = DAG.getNode( ISD::ZERO_EXTEND, DL, @@ -4453,8 +4458,13 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, if (IsFixedLength) { assert(Subtarget->useSVEForFixedLengthVectors() && "Cannot lower when not using SVE for fixed vectors"); - IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); - MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); + if (MemVT.getScalarSizeInBits() <= IndexVT.getScalarSizeInBits()) { + IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); + MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); + } else { + MemVT = getContainerForFixedLengthVector(DAG, MemVT); + IndexVT = MemVT.changeTypeToInteger(); + } InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); StoreVal = diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index b03d421d3e6d..091a62aa4ada 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1120,6 +1120,16 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, if (!MI.getOperand(1).isReg()) return false; + auto NormalizeCmpValue = [](int64_t Value) -> int { + // Comparison immediates may be 64-bit, but CmpValue is only an int. + // Normalize to 0/1/2 return value, where 2 indicates any value apart from + // 0 or 1. + // TODO: Switch CmpValue to int64_t in the API to avoid this. 
+ if (Value == 0 || Value == 1) + return Value; + return 2; + }; + switch (MI.getOpcode()) { default: break; @@ -1155,8 +1165,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = MI.getOperand(2).getImm() != 0; + CmpValue = NormalizeCmpValue(MI.getOperand(2).getImm()); return true; case AArch64::ANDSWri: case AArch64::ANDSXri: @@ -1165,14 +1174,9 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - // FIXME:The return val type of decodeLogicalImmediate is uint64_t, - // while the type of CmpValue is int. When converting uint64_t to int, - // the high 32 bits of uint64_t will be lost. - // In fact it causes a bug in spec2006-483.xalancbmk - // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = AArch64_AM::decodeLogicalImmediate( + CmpValue = NormalizeCmpValue(AArch64_AM::decodeLogicalImmediate( MI.getOperand(2).getImm(), - MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64)); return true; } @@ -1462,10 +1466,9 @@ bool AArch64InstrInfo::optimizeCompareInstr( if (CmpInstr.getOpcode() == AArch64::PTEST_PP) return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); - // Continue only if we have a "ri" where immediate is zero. - // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare - // function. - assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); + // Warning: CmpValue == 2 indicates *any* value apart from 0 or 1. + assert((CmpValue == 0 || CmpValue == 1 || CmpValue == 2) && + "CmpValue must be 0, 1, or 2!"); if (SrcReg2 != 0) return false; @@ -1473,9 +1476,10 @@ bool AArch64InstrInfo::optimizeCompareInstr( if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - if (!CmpValue && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) + if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) return true; - return removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); + return (CmpValue == 0 || CmpValue == 1) && + removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); } /// Get opcode of S version of Instr. 
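[Editorial sketch, not part of the patch.] To make the new contract concrete, here is a standalone copy of the normalization above; normalizeCmpValue as a free function is a hypothetical name for the NormalizeCmpValue lambda. Immediates are folded into the int-sized CmpValue as 0, 1, or the sentinel 2 ("any other value"), so a 64-bit immediate can no longer be silently truncated the way the old decodeLogicalImmediate-to-int conversion was.

#include <cassert>
#include <cstdint>

// Only 0 and 1 are meaningful to optimizeCompareInstr
// (substituteCmpToZero / removeCmpToZeroOrOne); everything else
// collapses to the sentinel 2.
static int normalizeCmpValue(int64_t Value) {
  if (Value == 0 || Value == 1)
    return static_cast<int>(Value);
  return 2;
}

int main() {
  assert(normalizeCmpValue(0) == 0); // eligible for substituteCmpToZero
  assert(normalizeCmpValue(1) == 1); // eligible for removeCmpToZeroOrOne
  // A value whose low 32 bits are zero: a plain uint64_t -> int truncation
  // would have turned this into 0; the sentinel keeps it distinct.
  assert(normalizeCmpValue(INT64_C(0x100000000)) == 2);
  return 0;
}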
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 2167ad5d7467..e68a3aa8bf47 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1647,7 +1647,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, "CMP_SWAP not expected to be custom expanded for Thumb1"); assert((UxtOp == 0 || UxtOp == ARM::tUXTB || UxtOp == ARM::tUXTH) && "ARMv8-M.baseline does not have t2UXTB/t2UXTH"); - assert(ARM::tGPRRegClass.contains(DesiredReg) && + assert((UxtOp == 0 || ARM::tGPRRegClass.contains(DesiredReg)) && "DesiredReg used for UXT op must be tGPR"); } diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h index 3bc5556a62f4..417e8b6ffec3 100644 --- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h +++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h @@ -54,6 +54,24 @@ public: return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } + + InstructionCost getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = ArrayRef(), + const Instruction *CxtI = nullptr) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (ISD == ISD::ADD && CostKind == TTI::TCK_RecipThroughput) + return SCEVCheapExpansionBudget.getValue() + 1; + + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, + Opd2Info, Opd1PropInfo, + Opd2PropInfo); + } }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index d5a7873bd056..abf5b213bbac 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -485,6 +485,9 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, case Intrinsic::experimental_constrained_sin: case Intrinsic::experimental_constrained_cos: return true; + // There is no corresponding FMA instruction for PPC double double. + // Thus, we need to disable CTR loop generation for this type. 
+ case Intrinsic::fmuladd: case Intrinsic::copysign: if (CI->getArgOperand(0)->getType()->getScalarType()-> isPPC_FP128Ty()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 342497150d49..8af3c8f5cfdb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -77,6 +77,39 @@ def simm5_plus1 : Operand, ImmLeaf: Sched <[!cast("WriteVMov" # n # "V"), + !cast("ReadVMov" # n # "V")]>; + +class VLESched : Sched <[!cast("WriteVLDE" # n), + ReadVLDX, ReadVMask]>; + +class VSESched : Sched <[!cast("WriteVSTE" # n), + !cast("ReadVSTE" # n # "V"), + ReadVSTX, ReadVMask]>; + +class VLSSched : Sched <[!cast("WriteVLDS" # n), + ReadVLDX, ReadVLDSX, ReadVMask]>; + +class VSSSched : Sched <[!cast("WriteVSTS" # n), + !cast("ReadVSTS" # n # "V"), + ReadVSTX, ReadVSTSX, ReadVMask]>; + +class VLXSched : + Sched <[!cast("WriteVLD" # o # "X" # n), + ReadVLDX, !cast("ReadVLD" # o # "XV"), ReadVMask]>; + +class VSXSched : + Sched <[!cast("WriteVST" # o # "X" # n), + !cast("ReadVST" # o # "X" # n), + ReadVSTX, !cast("ReadVST" # o # "XV"), ReadVMask]>; + +class VLFSched : Sched <[!cast("WriteVLDFF" # n), + ReadVLDX, ReadVMask]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -328,106 +361,417 @@ class VAMONoWd // Use these multiclasses to define instructions more easily. //===----------------------------------------------------------------------===// multiclass VALU_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { - def V : VALUVV; - def X : VALUVX; - def I : VALUVI; + def V : VALUVV, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; } multiclass VALU_IV_V_X funct6, string vw = "v"> { - def V : VALUVV; - def X : VALUVX; -} - -multiclass VALUr_IV_V_X funct6, string vw = "v"> { - def V : VALUrVV; - def X : VALUrVX; + def V : VALUVV, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; } multiclass VALU_IV_X_I funct6, Operand optype = simm5, string vw = "v"> { - def X : VALUVX; - def I : VALUVI; -} - -multiclass VALU_IV_V funct6> { - def _VS : VALUVV; -} - -multiclass VALUr_IV_X funct6, string vw = "v"> { - def X : VALUrVX; + def X : VALUVX, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; } multiclass VALU_MV_V_X funct6, string vw = "v"> { - def V : VALUVV; - def X : VALUVX; + def V : VALUVV, + Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>; } -multiclass VALU_MV_V funct6> { - def _VS : VALUVV; +multiclass VMAC_MV_V_X funct6, string vw = "v"> { + def V : VALUrVV, + Sched<[WriteVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVMask]>; + def X : VALUrVX, + Sched<[WriteVIMulAddX, ReadVIMulAddV, ReadVIMulAddX, ReadVMask]>; } -multiclass VALU_MV_Mask funct6, string vm = "v"> { - def M : VALUVVNoVm; +multiclass VWMAC_MV_V_X funct6, string vw = "v"> { + def V : VALUrVV, + Sched<[WriteVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVMask]>; + def X : VALUrVX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } -multiclass VALU_MV_X funct6, string vw = "v"> { 
- def X : VALUVX; -} - -multiclass VALUr_MV_V_X funct6, string vw = "v"> { - def V : VALUrVV; - def X : VALUrVX; -} - -multiclass VALUr_MV_X funct6, string vw = "v"> { - def X : VALUrVX; +multiclass VWMAC_MV_X funct6, string vw = "v"> { + def X : VALUrVX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } multiclass VALU_MV_VS2 funct6, bits<5> vs1> { - def "" : VALUVs2; + def "" : VALUVs2, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } multiclass VALUm_IV_V_X_I funct6> { - def VM : VALUmVV; - def XM : VALUmVX; - def IM : VALUmVI; + def VM : VALUmVV, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + def XM : VALUmVX, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + def IM : VALUmVI, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; +} + +multiclass VMRG_IV_V_X_I funct6> { + def VM : VALUmVV, + Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>; + def XM : VALUmVX, + Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>; + def IM : VALUmVI, + Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>; } multiclass VALUm_IV_V_X funct6> { - def VM : VALUmVV; - def XM : VALUmVX; + def VM : VALUmVV, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + def XM : VALUmVX, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } multiclass VALUNoVm_IV_V_X_I funct6, Operand optype = simm5> { - def V : VALUVVNoVm; - def X : VALUVXNoVm; - def I : VALUVINoVm; + def V : VALUVVNoVm, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + def X : VALUVXNoVm, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; + def I : VALUVINoVm, + Sched<[WriteVICALUI, ReadVIALUCV]>; } multiclass VALUNoVm_IV_V_X funct6> { - def V : VALUVVNoVm; - def X : VALUVXNoVm; + def V : VALUVVNoVm, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + def X : VALUVXNoVm, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; } multiclass VALU_FV_V_F funct6, string vw = "v"> { - def V : VALUVV; - def F : VALUVF; + def V : VALUVV, + Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; } multiclass VALU_FV_F funct6, string vw = "v"> { - def F : VALUVF; + def F : VALUVF, + Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; } -multiclass VALUr_FV_V_F funct6, string vw = "v"> { - def V : VALUrVV; - def F : VALUrVF; +multiclass VWALU_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; } -multiclass VALU_FV_V funct6> { - def _VS : VALUVV; +multiclass VMUL_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>; } -multiclass VALU_FV_VS2 funct6, bits<5> vs1> { - def "" : VALUVs2; +multiclass VDIV_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VRDIV_FV_F funct6, string vw = "v"> { + def F : VALUVF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VWMUL_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>; +} + +multiclass VMAC_FV_V_F funct6, string vw = "v"> { + def V : VALUrVV, + Sched<[WriteVFMulAddV, 
ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>; + def F : VALUrVF, + Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>; +} + +multiclass VWMAC_FV_V_F funct6, string vw = "v"> { + def V : VALUrVV, + Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>; + def F : VALUrVF, + Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>; +} + +multiclass VSQR_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; +} + +multiclass VRCP_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; +} + +multiclass VCMP_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VCMP_FV_F funct6, string vw = "v"> { + def F : VALUVF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VSGNJ_FV_V_F funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>; + def F : VALUVF, + Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>; +} + +multiclass VCLS_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; +} + +multiclass VCVTF_IV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>; +} + +multiclass VCVTI_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>; +} + +multiclass VWCVTF_IV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>; +} + +multiclass VWCVTI_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>; +} + +multiclass VWCVTF_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>; +} + +multiclass VNCVTF_IV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>; +} + +multiclass VNCVTI_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>; +} + +multiclass VNCVTF_FV_VS2 funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>; +} + +multiclass VRED_MV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV0, ReadVMask]>; +} + +multiclass VWRED_IV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV0, ReadVMask]>; +} + +multiclass VRED_FV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV0, ReadVMask]>; +} + +multiclass VREDO_FV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV0, ReadVMask]>; +} + +multiclass VWRED_FV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV0, ReadVMask]>; +} + +multiclass VWREDO_FV_V funct6> { + def _VS : VALUVV, + Sched<[WriteVFWRedOV, ReadVFWRedOV, ReadVFWRedOV0, ReadVMask]>; +} + +multiclass VMALU_MV_Mask funct6, string vm = "v"> { + def M : VALUVVNoVm, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; +} + +multiclass VMSFS_MV_V funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVMSFSV, ReadVMSFSV, ReadVMask]>; +} + +multiclass VMIOT_MV_V funct6, bits<5> vs1> { + def "" : VALUVs2, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; +} + +multiclass VSHT_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVShiftV, ReadVShiftV, 
ReadVShiftV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVShiftX, ReadVShiftV, ReadVShiftX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVShiftI, ReadVShiftV, ReadVMask]>; +} + +multiclass VNSHT_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVNShiftV, ReadVNShiftV, ReadVNShiftV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVNShiftX, ReadVNShiftV, ReadVNShiftX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVNShiftI, ReadVNShiftV, ReadVMask]>; +} + +multiclass VCMP_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; +} + +multiclass VCMP_IV_X_I funct6, Operand optype = simm5, string vw = "v"> { + def X : VALUVX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; +} + +multiclass VCMP_IV_V_X funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; +} + +multiclass VMUL_MV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVIMulV, ReadVIMulV, ReadVIMulV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIMulX, ReadVIMulV, ReadVIMulX, ReadVMask]>; +} + +multiclass VWMUL_MV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVIWMulV, ReadVIWMulV, ReadVIWMulV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIWMulX, ReadVIWMulV, ReadVIWMulX, ReadVMask]>; +} + +multiclass VDIV_MV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVIDivV, ReadVIDivV, ReadVIDivV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVIDivX, ReadVIDivV, ReadVIDivX, ReadVMask]>; +} + +multiclass VSALU_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVSALUI, ReadVSALUV, ReadVMask]>; +} + +multiclass VSALU_IV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; +} + +multiclass VAALU_MV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVAALUV, ReadVAALUV, ReadVAALUV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVAALUX, ReadVAALUV, ReadVAALUX, ReadVMask]>; +} + +multiclass VSMUL_IV_V_X funct6, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVSMulV, ReadVSMulV, ReadVSMulV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVSMulX, ReadVSMulV, ReadVSMulX, ReadVMask]>; +} + +multiclass VSSHF_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVSShiftV, ReadVSShiftV, ReadVSShiftV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVSShiftX, ReadVSShiftV, ReadVSShiftX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVSShiftI, ReadVSShiftV, ReadVMask]>; +} + +multiclass VNCLP_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVNClipV, ReadVNClipV, ReadVNClipV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVNClipX, ReadVNClipV, ReadVNClipX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVNClipI, ReadVNClipV, ReadVMask]>; +} + +multiclass VSLD_IV_X_I funct6, Operand optype = simm5, string vw = "v"> { + def X : VALUVX, + 
Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVISlideI, ReadVISlideV, ReadVMask]>; +} + +multiclass VSLD1_MV_X funct6, string vw = "v"> { + def X : VALUVX, + Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>; +} + +multiclass VSLD1_FV_F funct6, string vw = "v"> { + def F : VALUVF, + Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>; +} + +multiclass VGTR_IV_V_X_I funct6, Operand optype = simm5, string vw = "v"> { + def V : VALUVV, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV, ReadVMask]>; + def X : VALUVX, + Sched<[WriteVGatherX, ReadVGatherV, ReadVGatherX, ReadVMask]>; + def I : VALUVI, + Sched<[WriteVGatherI, ReadVGatherV, ReadVMask]>; +} + +multiclass VCPR_MV_Mask funct6, string vm = "v"> { + def M : VALUVVNoVm, + Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; } multiclass VAMO { @@ -435,11 +779,48 @@ multiclass VAMO { def _UNWD : VAMONoWd; } -multiclass VWholeLoad nf, string opcodestr, RegisterClass VRC> { - def E8_V : VWholeLoad; - def E16_V : VWholeLoad; - def E32_V : VWholeLoad; - def E64_V : VWholeLoad; +multiclass VWholeLoad1 { + def E8_V : VWholeLoad<0, LSWidth8, opcodestr # "e8.v", VRC>, + Sched<[WriteVLD1R8, ReadVLDX]>; + def E16_V : VWholeLoad<0, LSWidth16, opcodestr # "e16.v", VRC>, + Sched<[WriteVLD1R16, ReadVLDX]>; + def E32_V : VWholeLoad<0, LSWidth32, opcodestr # "e32.v", VRC>, + Sched<[WriteVLD1R32, ReadVLDX]>; + def E64_V : VWholeLoad<0, LSWidth64, opcodestr # "e64.v", VRC>, + Sched<[WriteVLD1R64, ReadVLDX]>; +} + +multiclass VWholeLoad2 { + def E8_V : VWholeLoad<1, LSWidth8, opcodestr # "e8.v", VRC>, + Sched<[WriteVLD2R8, ReadVLDX]>; + def E16_V : VWholeLoad<1, LSWidth16, opcodestr # "e16.v", VRC>, + Sched<[WriteVLD2R16, ReadVLDX]>; + def E32_V : VWholeLoad<1, LSWidth32, opcodestr # "e32.v", VRC>, + Sched<[WriteVLD2R32, ReadVLDX]>; + def E64_V : VWholeLoad<1, LSWidth64, opcodestr # "e64.v", VRC>, + Sched<[WriteVLD2R64, ReadVLDX]>; +} + +multiclass VWholeLoad4 { + def E8_V : VWholeLoad<3, LSWidth8, opcodestr # "e8.v", VRC>, + Sched<[WriteVLD4R8, ReadVLDX]>; + def E16_V : VWholeLoad<3, LSWidth16, opcodestr # "e16.v", VRC>, + Sched<[WriteVLD4R16, ReadVLDX]>; + def E32_V : VWholeLoad<3, LSWidth32, opcodestr # "e32.v", VRC>, + Sched<[WriteVLD4R32, ReadVLDX]>; + def E64_V : VWholeLoad<3, LSWidth64, opcodestr # "e64.v", VRC>, + Sched<[WriteVLD4R64, ReadVLDX]>; +} + +multiclass VWholeLoad8 { + def E8_V : VWholeLoad<7, LSWidth8, opcodestr # "e8.v", VRC>, + Sched<[WriteVLD8R8, ReadVLDX]>; + def E16_V : VWholeLoad<7, LSWidth16, opcodestr # "e16.v", VRC>, + Sched<[WriteVLD8R16, ReadVLDX]>; + def E32_V : VWholeLoad<7, LSWidth32, opcodestr # "e32.v", VRC>, + Sched<[WriteVLD8R32, ReadVLDX]>; + def E64_V : VWholeLoad<7, LSWidth64, opcodestr # "e64.v", VRC>, + Sched<[WriteVLD8R64, ReadVLDX]>; } //===----------------------------------------------------------------------===// @@ -459,69 +840,94 @@ def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), } // hasSideEffects = 1, mayLoad = 0, mayStore = 0 // Vector Unit-Stride Instructions -def VLE8_V : VUnitStrideLoad; -def VLE16_V : VUnitStrideLoad; -def VLE32_V : VUnitStrideLoad; -def VLE64_V : VUnitStrideLoad; +def VLE8_V : VUnitStrideLoad, + VLESched<8>; +def VLE16_V : VUnitStrideLoad, + VLESched<16>; +def VLE32_V : VUnitStrideLoad, + VLESched<32>; +def VLE64_V : VUnitStrideLoad, + VLESched<64>; -def VLE8FF_V : VUnitStrideLoad; -def VLE16FF_V : VUnitStrideLoad; -def VLE32FF_V : VUnitStrideLoad; -def VLE64FF_V : VUnitStrideLoad; +def 
VLE8FF_V : VUnitStrideLoad, + VLFSched<8>; +def VLE16FF_V : VUnitStrideLoad, + VLFSched<16>; +def VLE32FF_V : VUnitStrideLoad, + VLFSched<32>; +def VLE64FF_V : VUnitStrideLoad, + VLFSched<64>; -def VLE1_V : VUnitStrideLoadMask<"vle1.v">; -def VSE1_V : VUnitStrideStoreMask<"vse1.v">; +def VLE1_V : VUnitStrideLoadMask<"vle1.v">, + Sched<[WriteVLDM, ReadVLDX]>; +def VSE1_V : VUnitStrideStoreMask<"vse1.v">, + Sched<[WriteVSTM, ReadVSTM, ReadVSTX]>; -def VSE8_V : VUnitStrideStore; -def VSE16_V : VUnitStrideStore; -def VSE32_V : VUnitStrideStore; -def VSE64_V : VUnitStrideStore; +def VSE8_V : VUnitStrideStore, + VSESched<8>; +def VSE16_V : VUnitStrideStore, + VSESched<16>; +def VSE32_V : VUnitStrideStore, + VSESched<32>; +def VSE64_V : VUnitStrideStore, + VSESched<64>; // Vector Strided Instructions -def VLSE8_V : VStridedLoad; -def VLSE16_V : VStridedLoad; -def VLSE32_V : VStridedLoad; -def VLSE64_V : VStridedLoad; +def VLSE8_V : VStridedLoad, + VLSSched<8>; +def VLSE16_V : VStridedLoad, + VLSSched<16>; +def VLSE32_V : VStridedLoad, + VLSSched<32>; +def VLSE64_V : VStridedLoad, + VLSSched<64>; -def VSSE8_V : VStridedStore; -def VSSE16_V : VStridedStore; -def VSSE32_V : VStridedStore; -def VSSE64_V : VStridedStore; +def VSSE8_V : VStridedStore, + VSSSched<8>; +def VSSE16_V : VStridedStore, + VSSSched<16>; +def VSSE32_V : VStridedStore, + VSSSched<32>; +def VSSE64_V : VStridedStore, + VSSSched<64>; // Vector Indexed Instructions -def VLUXEI8_V : VIndexedLoad; -def VLUXEI16_V : VIndexedLoad; -def VLUXEI32_V : VIndexedLoad; -def VLUXEI64_V : VIndexedLoad; +foreach n = [8, 16, 32, 64] in { +defvar w = !cast("LSWidth" # n); -def VLOXEI8_V : VIndexedLoad; -def VLOXEI16_V : VIndexedLoad; -def VLOXEI32_V : VIndexedLoad; -def VLOXEI64_V : VIndexedLoad; +def VLUXEI # n # _V : + VIndexedLoad, + VLXSched; +def VLOXEI # n # _V : + VIndexedLoad, + VLXSched; -def VSUXEI8_V : VIndexedStore; -def VSUXEI16_V : VIndexedStore; -def VSUXEI32_V : VIndexedStore; -def VSUXEI64_V : VIndexedStore; +def VSUXEI # n # _V : + VIndexedStore, + VSXSched; +def VSOXEI # n # _V : + VIndexedStore, + VSXSched; +} -def VSOXEI8_V : VIndexedStore; -def VSOXEI16_V : VIndexedStore; -def VSOXEI32_V : VIndexedStore; -def VSOXEI64_V : VIndexedStore; +defm VL1R : VWholeLoad1<"vl1r", VR>; +defm VL2R : VWholeLoad2<"vl2r", VRM2>; +defm VL4R : VWholeLoad4<"vl4r", VRM4>; +defm VL8R : VWholeLoad8<"vl8r", VRM8>; -defm VL1R : VWholeLoad<0, "vl1r", VR>; -defm VL2R : VWholeLoad<1, "vl2r", VRM2>; -defm VL4R : VWholeLoad<3, "vl4r", VRM4>; -defm VL8R : VWholeLoad<7, "vl8r", VRM8>; def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>; def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>; def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>; def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>; -def VS1R_V : VWholeStore<0, "vs1r.v", VR>; -def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>; -def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>; -def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>; +def VS1R_V : VWholeStore<0, "vs1r.v", VR>, + Sched<[WriteVST1R, ReadVST1R, ReadVSTX]>; +def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>, + Sched<[WriteVST2R, ReadVST2R, ReadVSTX]>; +def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>, + Sched<[WriteVST4R, ReadVST4R, ReadVSTX]>; +def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>, + Sched<[WriteVST8R, ReadVST8R, ReadVSTX]>; // Vector Single-Width Integer Add and Subtract defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>; @@ -588,9 +994,9 @@ def : InstAlias<"vnot.v $vd, $vs$vm", (VXOR_VI 
VR:$vd, VR:$vs, -1, VMaskOp:$vm)>; // Vector Single-Width Bit Shift Instructions -defm VSLL_V : VALU_IV_V_X_I<"vsll", 0b100101, uimm5>; -defm VSRL_V : VALU_IV_V_X_I<"vsrl", 0b101000, uimm5>; -defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>; +defm VSLL_V : VSHT_IV_V_X_I<"vsll", 0b100101, uimm5>; +defm VSRL_V : VSHT_IV_V_X_I<"vsrl", 0b101000, uimm5>; +defm VSRA_V : VSHT_IV_V_X_I<"vsra", 0b101001, uimm5>; // Vector Narrowing Integer Right Shift Instructions // Refer to 11.3. Narrowing Vector Arithmetic Instructions @@ -598,8 +1004,8 @@ defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>; // vector register group (specified by vs2). The destination vector register // group cannot overlap the mask register if used, unless LMUL=1. let Constraints = "@earlyclobber $vd" in { -defm VNSRL_W : VALU_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">; -defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">; +defm VNSRL_W : VNSHT_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">; +defm VNSRA_W : VNSHT_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">; } // Constraints = "@earlyclobber $vd" def : InstAlias<"vncvt.x.x.w $vd, $vs$vm", @@ -607,14 +1013,14 @@ def : InstAlias<"vncvt.x.x.w $vd, $vs$vm", // Vector Integer Comparison Instructions let RVVConstraint = NoConstraint in { -defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>; -defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>; -defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>; -defm VMSLT_V : VALU_IV_V_X<"vmslt", 0b011011>; -defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>; -defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>; -defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>; -defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>; +defm VMSEQ_V : VCMP_IV_V_X_I<"vmseq", 0b011000>; +defm VMSNE_V : VCMP_IV_V_X_I<"vmsne", 0b011001>; +defm VMSLTU_V : VCMP_IV_V_X<"vmsltu", 0b011010>; +defm VMSLT_V : VCMP_IV_V_X<"vmslt", 0b011011>; +defm VMSLEU_V : VCMP_IV_V_X_I<"vmsleu", 0b011100>; +defm VMSLE_V : VCMP_IV_V_X_I<"vmsle", 0b011101>; +defm VMSGTU_V : VCMP_IV_X_I<"vmsgtu", 0b011110>; +defm VMSGT_V : VCMP_IV_X_I<"vmsgt", 0b011111>; } // RVVConstraint = NoConstraint def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm", @@ -672,84 +1078,87 @@ def PseudoVMSGE_VX_M_T : Pseudo<(outs VR:$vd, VRNoV0:$scratch), } // Vector Integer Min/Max Instructions -defm VMINU_V : VALU_IV_V_X<"vminu", 0b000100>; -defm VMIN_V : VALU_IV_V_X<"vmin", 0b000101>; -defm VMAXU_V : VALU_IV_V_X<"vmaxu", 0b000110>; -defm VMAX_V : VALU_IV_V_X<"vmax", 0b000111>; +defm VMINU_V : VCMP_IV_V_X<"vminu", 0b000100>; +defm VMIN_V : VCMP_IV_V_X<"vmin", 0b000101>; +defm VMAXU_V : VCMP_IV_V_X<"vmaxu", 0b000110>; +defm VMAX_V : VCMP_IV_V_X<"vmax", 0b000111>; // Vector Single-Width Integer Multiply Instructions -defm VMUL_V : VALU_MV_V_X<"vmul", 0b100101>; -defm VMULH_V : VALU_MV_V_X<"vmulh", 0b100111>; -defm VMULHU_V : VALU_MV_V_X<"vmulhu", 0b100100>; -defm VMULHSU_V : VALU_MV_V_X<"vmulhsu", 0b100110>; +defm VMUL_V : VMUL_MV_V_X<"vmul", 0b100101>; +defm VMULH_V : VMUL_MV_V_X<"vmulh", 0b100111>; +defm VMULHU_V : VMUL_MV_V_X<"vmulhu", 0b100100>; +defm VMULHSU_V : VMUL_MV_V_X<"vmulhsu", 0b100110>; // Vector Integer Divide Instructions -defm VDIVU_V : VALU_MV_V_X<"vdivu", 0b100000>; -defm VDIV_V : VALU_MV_V_X<"vdiv", 0b100001>; -defm VREMU_V : VALU_MV_V_X<"vremu", 0b100010>; -defm VREM_V : VALU_MV_V_X<"vrem", 0b100011>; +defm VDIVU_V : VDIV_MV_V_X<"vdivu", 0b100000>; +defm VDIV_V : VDIV_MV_V_X<"vdiv", 0b100001>; +defm VREMU_V : VDIV_MV_V_X<"vremu", 0b100010>; +defm VREM_V : VDIV_MV_V_X<"vrem", 0b100011>; // Vector Widening Integer Multiply Instructions let 
Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { -defm VWMUL_V : VALU_MV_V_X<"vwmul", 0b111011>; -defm VWMULU_V : VALU_MV_V_X<"vwmulu", 0b111000>; -defm VWMULSU_V : VALU_MV_V_X<"vwmulsu", 0b111010>; +defm VWMUL_V : VWMUL_MV_V_X<"vwmul", 0b111011>; +defm VWMULU_V : VWMUL_MV_V_X<"vwmulu", 0b111000>; +defm VWMULSU_V : VWMUL_MV_V_X<"vwmulsu", 0b111010>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV // Vector Single-Width Integer Multiply-Add Instructions -defm VMACC_V : VALUr_MV_V_X<"vmacc", 0b101101>; -defm VNMSAC_V : VALUr_MV_V_X<"vnmsac", 0b101111>; -defm VMADD_V : VALUr_MV_V_X<"vmadd", 0b101001>; -defm VNMSUB_V : VALUr_MV_V_X<"vnmsub", 0b101011>; +defm VMACC_V : VMAC_MV_V_X<"vmacc", 0b101101>; +defm VNMSAC_V : VMAC_MV_V_X<"vnmsac", 0b101111>; +defm VMADD_V : VMAC_MV_V_X<"vmadd", 0b101001>; +defm VNMSUB_V : VMAC_MV_V_X<"vnmsub", 0b101011>; // Vector Widening Integer Multiply-Add Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { -defm VWMACCU_V : VALUr_MV_V_X<"vwmaccu", 0b111100>; -defm VWMACC_V : VALUr_MV_V_X<"vwmacc", 0b111101>; -defm VWMACCSU_V : VALUr_MV_V_X<"vwmaccsu", 0b111111>; -defm VWMACCUS_V : VALUr_MV_X<"vwmaccus", 0b111110>; +defm VWMACCU_V : VWMAC_MV_V_X<"vwmaccu", 0b111100>; +defm VWMACC_V : VWMAC_MV_V_X<"vwmacc", 0b111101>; +defm VWMACCSU_V : VWMAC_MV_V_X<"vwmaccsu", 0b111111>; +defm VWMACCUS_V : VWMAC_MV_X<"vwmaccus", 0b111110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV // Vector Integer Merge Instructions -defm VMERGE_V : VALUm_IV_V_X_I<"vmerge", 0b010111>; +defm VMERGE_V : VMRG_IV_V_X_I<"vmerge", 0b010111>; // Vector Integer Move Instructions let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1, RVVConstraint = NoConstraint in { // op vd, vs1 def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VR:$vd), - (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">; + (ins VR:$vs1), "vmv.v.v", "$vd, $vs1">, + Sched<[WriteVIMovV, ReadVIMovV]>; // op vd, rs1 def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VR:$vd), - (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">; + (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">, + Sched<[WriteVIMovX, ReadVIMovX]>; // op vd, imm def VMV_V_I : RVInstIVI<0b010111, (outs VR:$vd), - (ins simm5:$imm), "vmv.v.i", "$vd, $imm">; + (ins simm5:$imm), "vmv.v.i", "$vd, $imm">, + Sched<[WriteVIMovI]>; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 // Vector Fixed-Point Arithmetic Instructions -defm VSADDU_V : VALU_IV_V_X_I<"vsaddu", 0b100000>; -defm VSADD_V : VALU_IV_V_X_I<"vsadd", 0b100001>; -defm VSSUBU_V : VALU_IV_V_X<"vssubu", 0b100010>; -defm VSSUB_V : VALU_IV_V_X<"vssub", 0b100011>; +defm VSADDU_V : VSALU_IV_V_X_I<"vsaddu", 0b100000>; +defm VSADD_V : VSALU_IV_V_X_I<"vsadd", 0b100001>; +defm VSSUBU_V : VSALU_IV_V_X<"vssubu", 0b100010>; +defm VSSUB_V : VSALU_IV_V_X<"vssub", 0b100011>; // Vector Single-Width Averaging Add and Subtract -defm VAADDU_V : VALU_MV_V_X<"vaaddu", 0b001000>; -defm VAADD_V : VALU_MV_V_X<"vaadd", 0b001001>; -defm VASUBU_V : VALU_MV_V_X<"vasubu", 0b001010>; -defm VASUB_V : VALU_MV_V_X<"vasub", 0b001011>; +defm VAADDU_V : VAALU_MV_V_X<"vaaddu", 0b001000>; +defm VAADD_V : VAALU_MV_V_X<"vaadd", 0b001001>; +defm VASUBU_V : VAALU_MV_V_X<"vasubu", 0b001010>; +defm VASUB_V : VAALU_MV_V_X<"vasub", 0b001011>; // Vector Single-Width Fractional Multiply with Rounding and Saturation -defm VSMUL_V : VALU_IV_V_X<"vsmul", 0b100111>; +defm VSMUL_V : VSMUL_IV_V_X<"vsmul", 0b100111>; // Vector Single-Width Scaling Shift Instructions -defm VSSRL_V : VALU_IV_V_X_I<"vssrl", 0b101010, uimm5>; -defm 
VSSRA_V : VALU_IV_V_X_I<"vssra", 0b101011, uimm5>; +defm VSSRL_V : VSSHF_IV_V_X_I<"vssrl", 0b101010, uimm5>; +defm VSSRA_V : VSSHF_IV_V_X_I<"vssra", 0b101011, uimm5>; // Vector Narrowing Fixed-Point Clip Instructions let Constraints = "@earlyclobber $vd" in { -defm VNCLIPU_W : VALU_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">; -defm VNCLIP_W : VALU_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">; +defm VNCLIPU_W : VNCLP_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">; +defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">; } // Constraints = "@earlyclobber $vd" } // Predicates = [HasStdExtV] @@ -762,60 +1171,60 @@ defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>; // Vector Widening Floating-Point Add/Subtract Instructions let Constraints = "@earlyclobber $vd" in { let RVVConstraint = WidenV in { -defm VFWADD_V : VALU_FV_V_F<"vfwadd", 0b110000>; -defm VFWSUB_V : VALU_FV_V_F<"vfwsub", 0b110010>; +defm VFWADD_V : VWALU_FV_V_F<"vfwadd", 0b110000>; +defm VFWSUB_V : VWALU_FV_V_F<"vfwsub", 0b110010>; } // RVVConstraint = WidenV // Set earlyclobber for following instructions for second and mask operands. // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. let RVVConstraint = WidenW in { -defm VFWADD_W : VALU_FV_V_F<"vfwadd", 0b110100, "w">; -defm VFWSUB_W : VALU_FV_V_F<"vfwsub", 0b110110, "w">; +defm VFWADD_W : VWALU_FV_V_F<"vfwadd", 0b110100, "w">; +defm VFWSUB_W : VWALU_FV_V_F<"vfwsub", 0b110110, "w">; } // RVVConstraint = WidenW } // Constraints = "@earlyclobber $vd" // Vector Single-Width Floating-Point Multiply/Divide Instructions -defm VFMUL_V : VALU_FV_V_F<"vfmul", 0b100100>; -defm VFDIV_V : VALU_FV_V_F<"vfdiv", 0b100000>; -defm VFRDIV_V : VALU_FV_F<"vfrdiv", 0b100001>; +defm VFMUL_V : VMUL_FV_V_F<"vfmul", 0b100100>; +defm VFDIV_V : VDIV_FV_V_F<"vfdiv", 0b100000>; +defm VFRDIV_V : VRDIV_FV_F<"vfrdiv", 0b100001>; // Vector Widening Floating-Point Multiply let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { -defm VFWMUL_V : VALU_FV_V_F<"vfwmul", 0b111000>; +defm VFWMUL_V : VWMUL_FV_V_F<"vfwmul", 0b111000>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV // Vector Single-Width Floating-Point Fused Multiply-Add Instructions -defm VFMACC_V : VALUr_FV_V_F<"vfmacc", 0b101100>; -defm VFNMACC_V : VALUr_FV_V_F<"vfnmacc", 0b101101>; -defm VFMSAC_V : VALUr_FV_V_F<"vfmsac", 0b101110>; -defm VFNMSAC_V : VALUr_FV_V_F<"vfnmsac", 0b101111>; -defm VFMADD_V : VALUr_FV_V_F<"vfmadd", 0b101000>; -defm VFNMADD_V : VALUr_FV_V_F<"vfnmadd", 0b101001>; -defm VFMSUB_V : VALUr_FV_V_F<"vfmsub", 0b101010>; -defm VFNMSUB_V : VALUr_FV_V_F<"vfnmsub", 0b101011>; +defm VFMACC_V : VMAC_FV_V_F<"vfmacc", 0b101100>; +defm VFNMACC_V : VMAC_FV_V_F<"vfnmacc", 0b101101>; +defm VFMSAC_V : VMAC_FV_V_F<"vfmsac", 0b101110>; +defm VFNMSAC_V : VMAC_FV_V_F<"vfnmsac", 0b101111>; +defm VFMADD_V : VMAC_FV_V_F<"vfmadd", 0b101000>; +defm VFNMADD_V : VMAC_FV_V_F<"vfnmadd", 0b101001>; +defm VFMSUB_V : VMAC_FV_V_F<"vfmsub", 0b101010>; +defm VFNMSUB_V : VMAC_FV_V_F<"vfnmsub", 0b101011>; // Vector Widening Floating-Point Fused Multiply-Add Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in { -defm VFWMACC_V : VALUr_FV_V_F<"vfwmacc", 0b111100>; -defm VFWNMACC_V : VALUr_FV_V_F<"vfwnmacc", 0b111101>; -defm VFWMSAC_V : VALUr_FV_V_F<"vfwmsac", 0b111110>; -defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>; +defm VFWMACC_V : VWMAC_FV_V_F<"vfwmacc", 0b111100>; +defm 
VFWNMACC_V : VWMAC_FV_V_F<"vfwnmacc", 0b111101>; +defm VFWMSAC_V : VWMAC_FV_V_F<"vfwmsac", 0b111110>; +defm VFWNMSAC_V : VWMAC_FV_V_F<"vfwnmsac", 0b111111>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV // Vector Floating-Point Square-Root Instruction -defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>; -defm VFRSQRT7_V : VALU_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>; -defm VFREC7_V : VALU_FV_VS2<"vfrec7.v", 0b010011, 0b00101>; +defm VFSQRT_V : VSQR_FV_VS2<"vfsqrt.v", 0b010011, 0b00000>; +defm VFRSQRT7_V : VRCP_FV_VS2<"vfrsqrt7.v", 0b010011, 0b00100>; +defm VFREC7_V : VRCP_FV_VS2<"vfrec7.v", 0b010011, 0b00101>; // Vector Floating-Point MIN/MAX Instructions -defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>; -defm VFMAX_V : VALU_FV_V_F<"vfmax", 0b000110>; +defm VFMIN_V : VCMP_FV_V_F<"vfmin", 0b000100>; +defm VFMAX_V : VCMP_FV_V_F<"vfmax", 0b000110>; // Vector Floating-Point Sign-Injection Instructions -defm VFSGNJ_V : VALU_FV_V_F<"vfsgnj", 0b001000>; -defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>; -defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>; +defm VFSGNJ_V : VSGNJ_FV_V_F<"vfsgnj", 0b001000>; +defm VFSGNJN_V : VSGNJ_FV_V_F<"vfsgnjn", 0b001001>; +defm VFSGNJX_V : VSGNJ_FV_V_F<"vfsgnjx", 0b001010>; def : InstAlias<"vfneg.v $vd, $vs$vm", (VFSGNJN_VV VR:$vd, VR:$vs, VR:$vs, VMaskOp:$vm)>; @@ -824,12 +1233,12 @@ def : InstAlias<"vfabs.v $vd, $vs$vm", // Vector Floating-Point Compare Instructions let RVVConstraint = NoConstraint in { -defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>; -defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>; -defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>; -defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>; -defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>; -defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>; +defm VMFEQ_V : VCMP_FV_V_F<"vmfeq", 0b011000>; +defm VMFNE_V : VCMP_FV_V_F<"vmfne", 0b011100>; +defm VMFLT_V : VCMP_FV_V_F<"vmflt", 0b011011>; +defm VMFLE_V : VCMP_FV_V_F<"vmfle", 0b011001>; +defm VMFGT_V : VCMP_FV_F<"vmfgt", 0b011101>; +defm VMFGE_V : VCMP_FV_F<"vmfge", 0b011111>; } // RVVConstraint = NoConstraint def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm", @@ -838,68 +1247,70 @@ def : InstAlias<"vmfge.vv $vd, $va, $vb$vm", (VMFLE_VV VR:$vd, VR:$vb, VR:$va, VMaskOp:$vm), 0>; // Vector Floating-Point Classify Instruction -defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b010011, 0b10000>; +defm VFCLASS_V : VCLS_FV_VS2<"vfclass.v", 0b010011, 0b10000>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { + // Vector Floating-Point Merge Instruction +let vm = 0 in def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd), (ins VR:$vs2, FPR32:$rs1, VMV0:$v0), - "vfmerge.vfm", "$vd, $vs2, $rs1, v0"> { - let vm = 0; -} + "vfmerge.vfm", "$vd, $vs2, $rs1, v0">, + Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>; // Vector Floating-Point Move Instruction let RVVConstraint = NoConstraint in +let vm = 1, vs2 = 0 in def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VR:$vd), - (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1"> { - let vs2 = 0; - let vm = 1; -} + (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1">, + Sched<[WriteVFMovV, ReadVFMovF]>; + } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 // Single-Width Floating-Point/Integer Type-Convert Instructions -defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>; -defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>; -defm VFCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; -defm VFCVT_RTZ_X_F_V : VALU_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; -defm VFCVT_F_XU_V : 
VALU_FV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>; -defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; +defm VFCVT_XU_F_V : VCVTI_FV_VS2<"vfcvt.xu.f.v", 0b010010, 0b00000>; +defm VFCVT_X_F_V : VCVTI_FV_VS2<"vfcvt.x.f.v", 0b010010, 0b00001>; +defm VFCVT_RTZ_XU_F_V : VCVTI_FV_VS2<"vfcvt.rtz.xu.f.v", 0b010010, 0b00110>; +defm VFCVT_RTZ_X_F_V : VCVTI_FV_VS2<"vfcvt.rtz.x.f.v", 0b010010, 0b00111>; +defm VFCVT_F_XU_V : VCVTF_IV_VS2<"vfcvt.f.xu.v", 0b010010, 0b00010>; +defm VFCVT_F_X_V : VCVTF_IV_VS2<"vfcvt.f.x.v", 0b010010, 0b00011>; // Widening Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in { -defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>; -defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>; -defm VFWCVT_RTZ_XU_F_V : VALU_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>; -defm VFWCVT_RTZ_X_F_V : VALU_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>; -defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>; -defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>; -defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>; +defm VFWCVT_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.xu.f.v", 0b010010, 0b01000>; +defm VFWCVT_X_F_V : VWCVTI_FV_VS2<"vfwcvt.x.f.v", 0b010010, 0b01001>; +defm VFWCVT_RTZ_XU_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.xu.f.v", 0b010010, 0b01110>; +defm VFWCVT_RTZ_X_F_V : VWCVTI_FV_VS2<"vfwcvt.rtz.x.f.v", 0b010010, 0b01111>; +defm VFWCVT_F_XU_V : VWCVTF_IV_VS2<"vfwcvt.f.xu.v", 0b010010, 0b01010>; +defm VFWCVT_F_X_V : VWCVTF_IV_VS2<"vfwcvt.f.x.v", 0b010010, 0b01011>; +defm VFWCVT_F_F_V : VWCVTF_FV_VS2<"vfwcvt.f.f.v", 0b010010, 0b01100>; } // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt // Narrowing Floating-Point/Integer Type-Convert Instructions let Constraints = "@earlyclobber $vd" in { -defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>; -defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>; -defm VFNCVT_RTZ_XU_F_W : VALU_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>; -defm VFNCVT_RTZ_X_F_W : VALU_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>; -defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>; -defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>; -defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>; -defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>; +defm VFNCVT_XU_F_W : VNCVTI_FV_VS2<"vfncvt.xu.f.w", 0b010010, 0b10000>; +defm VFNCVT_X_F_W : VNCVTI_FV_VS2<"vfncvt.x.f.w", 0b010010, 0b10001>; +defm VFNCVT_RTZ_XU_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.xu.f.w", 0b010010, 0b10110>; +defm VFNCVT_RTZ_X_F_W : VNCVTI_FV_VS2<"vfncvt.rtz.x.f.w", 0b010010, 0b10111>; +defm VFNCVT_F_XU_W : VNCVTF_IV_VS2<"vfncvt.f.xu.w", 0b010010, 0b10010>; +defm VFNCVT_F_X_W : VNCVTF_IV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>; +defm VFNCVT_F_F_W : VNCVTF_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>; +defm VFNCVT_ROD_F_F_W : VNCVTF_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>; } // Constraints = "@earlyclobber $vd" } // Predicates = [HasStdExtV, HasStdExtF] let Predicates = [HasStdExtV] in { + // Vector Single-Width Integer Reduction Instructions let RVVConstraint = NoConstraint in { -defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>; -defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>; -defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>; -defm VREDMINU : VALU_MV_V<"vredminu", 0b000100>; -defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>; -defm VREDAND : VALU_MV_V<"vredand", 0b000001>; 
-defm VREDOR : VALU_MV_V<"vredor", 0b000010>; -defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>; +defm VREDSUM : VRED_MV_V<"vredsum", 0b000000>; +defm VREDMAXU : VRED_MV_V<"vredmaxu", 0b000110>; +defm VREDMAX : VRED_MV_V<"vredmax", 0b000111>; +defm VREDMINU : VRED_MV_V<"vredminu", 0b000100>; +defm VREDMIN : VRED_MV_V<"vredmin", 0b000101>; +defm VREDAND : VRED_MV_V<"vredand", 0b000001>; +defm VREDOR : VRED_MV_V<"vredor", 0b000010>; +defm VREDXOR : VRED_MV_V<"vredxor", 0b000011>; } // RVVConstraint = NoConstraint // Vector Widening Integer Reduction Instructions @@ -908,18 +1319,19 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. -defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>; -defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>; +defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>; +defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>; } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint + } // Predicates = [HasStdExtV] let Predicates = [HasStdExtV, HasStdExtF] in { // Vector Single-Width Floating-Point Reduction Instructions let RVVConstraint = NoConstraint in { -defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>; -defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>; -defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>; -defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>; +defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>; +defm VFREDSUM : VRED_FV_V<"vfredsum", 0b000001>; +defm VFREDMAX : VRED_FV_V<"vfredmax", 0b000111>; +defm VFREDMIN : VRED_FV_V<"vfredmin", 0b000101>; } // RVVConstraint = NoConstraint // Vector Widening Floating-Point Reduction Instructions @@ -928,22 +1340,22 @@ let Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint in { // This has the downside that the earlyclobber constraint is too coarse and // will impose unnecessary restrictions by not allowing the destination to // overlap with the first (wide) operand. 
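// Concretely, as an illustrative reading of the note above: @earlyclobber
// marks the whole $vd group as potentially written before the sources are
// consumed, so the register allocator must keep $vd disjoint from every
// input operand. An assignment in which $vd aliases only the wide vs2
// group would be safe for these reductions, yet the blanket constraint
// refuses it as well; only overlap with the scalar vs1 operand or the v0
// mask is a genuine hazard here.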
-defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>; -defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>; +defm VFWREDOSUM : VWREDO_FV_V<"vfwredosum", 0b110011>; +defm VFWREDSUM : VWRED_FV_V<"vfwredsum", 0b110001>; } // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint } // Predicates = [HasStdExtV, HasStdExtF] let Predicates = [HasStdExtV] in { // Vector Mask-Register Logical Instructions let RVVConstraint = NoConstraint in { -defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">; -defm VMNAND_M : VALU_MV_Mask<"vmnand", 0b011101, "m">; -defm VMANDNOT_M : VALU_MV_Mask<"vmandnot", 0b011000, "m">; -defm VMXOR_M : VALU_MV_Mask<"vmxor", 0b011011, "m">; -defm VMOR_M : VALU_MV_Mask<"vmor", 0b011010, "m">; -defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">; -defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">; -defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">; +defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">; +defm VMNAND_M : VMALU_MV_Mask<"vmnand", 0b011101, "m">; +defm VMANDNOT_M : VMALU_MV_Mask<"vmandnot", 0b011000, "m">; +defm VMXOR_M : VMALU_MV_Mask<"vmxor", 0b011011, "m">; +defm VMOR_M : VMALU_MV_Mask<"vmor", 0b011010, "m">; +defm VMNOR_M : VMALU_MV_Mask<"vmnor", 0b011110, "m">; +defm VMORNOT_M : VMALU_MV_Mask<"vmornot", 0b011100, "m">; +defm VMXNOR_M : VMALU_MV_Mask<"vmxnor", 0b011111, "m">; } def : InstAlias<"vmmv.m $vd, $vs", @@ -957,98 +1369,113 @@ def : InstAlias<"vmnot.m $vd, $vs", let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { + // Vector mask population count vpopc def VPOPC_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd), - (ins VR:$vs2, VMaskOp:$vm), - "vpopc.m", "$vd, $vs2$vm">; + (ins VR:$vs2, VMaskOp:$vm), + "vpopc.m", "$vd, $vs2$vm">, + Sched<[WriteVMPopV, ReadVMPopV, ReadVMask]>; // vfirst find-first-set mask bit def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd), - (ins VR:$vs2, VMaskOp:$vm), - "vfirst.m", "$vd, $vs2$vm">; + (ins VR:$vs2, VMaskOp:$vm), + "vfirst.m", "$vd, $vs2$vm">, + Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMask]>; + } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in { + // vmsbf.m set-before-first mask bit -defm VMSBF_M : VALU_MV_VS2<"vmsbf.m", 0b010100, 0b00001>; +defm VMSBF_M : VMSFS_MV_V<"vmsbf.m", 0b010100, 0b00001>; // vmsif.m set-including-first mask bit -defm VMSIF_M : VALU_MV_VS2<"vmsif.m", 0b010100, 0b00011>; +defm VMSIF_M : VMSFS_MV_V<"vmsif.m", 0b010100, 0b00011>; // vmsof.m set-only-first mask bit -defm VMSOF_M : VALU_MV_VS2<"vmsof.m", 0b010100, 0b00010>; +defm VMSOF_M : VMSFS_MV_V<"vmsof.m", 0b010100, 0b00010>; // Vector Iota Instruction -defm VIOTA_M : VALU_MV_VS2<"viota.m", 0b010100, 0b10000>; +defm VIOTA_M : VMIOT_MV_V<"viota.m", 0b010100, 0b10000>; + } // Constraints = "@earlyclobber $vd", RVVConstraint = Iota // Vector Element Index Instruction let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { + +let vs2 = 0 in def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VR:$vd), - (ins VMaskOp:$vm), "vid.v", "$vd$vm"> { - let vs2 = 0; -} + (ins VMaskOp:$vm), "vid.v", "$vd$vm">, + Sched<[WriteVMIdxV, ReadVMask]>; // Integer Scalar Move Instructions let vm = 1, RVVConstraint = NoConstraint in { def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd), - (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">; + (ins VR:$vs2), "vmv.x.s", "$vd, $vs2">, + Sched<[WriteVIMovVX, ReadVIMovVX]>; let Constraints = "$vd = $vd_wb" in def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb), - (ins VR:$vd, GPR:$rs1), 
"vmv.s.x", "$vd, $rs1">; - + (ins VR:$vd, GPR:$rs1), "vmv.s.x", "$vd, $rs1">, + Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>; } + } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 + } // Predicates = [HasStdExtV] let Predicates = [HasStdExtV, HasStdExtF] in { + let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1, RVVConstraint = NoConstraint in { // Floating-Point Scalar Move Instructions def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd), - (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">; + (ins VR:$vs2), "vfmv.f.s", "$vd, $vs2">, + Sched<[WriteVFMovVF, ReadVFMovVF]>; let Constraints = "$vd = $vd_wb" in def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb), - (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">; + (ins VR:$vd, FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">, + Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1 + } // Predicates = [HasStdExtV, HasStdExtF] let Predicates = [HasStdExtV] in { // Vector Slide Instructions let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { -defm VSLIDEUP_V : VALU_IV_X_I<"vslideup", 0b001110, uimm5>; -defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>; +defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, uimm5>; +defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp -defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>; -defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>; +defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, uimm5>; +defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>; } // Predicates = [HasStdExtV] let Predicates = [HasStdExtV, HasStdExtF] in { let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in { -defm VFSLIDE1UP_V : VALU_FV_F<"vfslide1up", 0b001110>; +defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>; } // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp -defm VFSLIDE1DOWN_V : VALU_FV_F<"vfslide1down", 0b001111>; +defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>; } // Predicates = [HasStdExtV, HasStdExtF] let Predicates = [HasStdExtV] in { // Vector Register Gather Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in { -defm VRGATHER_V : VALU_IV_V_X_I<"vrgather", 0b001100, uimm5>; -def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">; +defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100, uimm5>; +def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV]>; } // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather // Vector Compress Instruction let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress in { -defm VCOMPRESS_V : VALU_MV_Mask<"vcompress", 0b010111>; +defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>; } // Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { -foreach nf = [1, 2, 4, 8] in { - def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VR:$vd), - (ins VR:$vs2), "vmv" # nf # "r.v", - "$vd, $vs2"> { - let Uses = []; - let vm = 1; - } +foreach n = [1, 2, 4, 8] in { + def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs VR:$vd), + (ins VR:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, + VMVRSched { + let Uses = []; + let vm = 1; +} } } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 } // Predicates = [HasStdExtV] diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td 
b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index ed26a5026114..14f59152ed42 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -231,6 +231,9 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +//===----------------------------------------------------------------------===// +// Unsupported extensions +defm : UnsupportedSchedV; defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; defm : UnsupportedSchedZfh; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 314af180aca1..75ca6ca861be 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -219,6 +219,9 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +//===----------------------------------------------------------------------===// +// Unsupported extensions +defm : UnsupportedSchedV; defm : UnsupportedSchedZba; defm : UnsupportedSchedZbb; defm : UnsupportedSchedZfh; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index f31e4af46c1b..4971ca1d4e3e 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -230,3 +230,4 @@ def : ReadAdvance; // Include the scheduler resources for other instruction extensions. include "RISCVScheduleB.td" +include "RISCVScheduleV.td" diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td new file mode 100644 index 000000000000..43af1802d706 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -0,0 +1,820 @@ +//===-- RISCVScheduleV.td - RISCV Scheduling Definitions V -*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +/// Define scheduler resources associated with def operands. + +// 7. Vector Loads and Stores +// 7.4. Vector Unit-Stride Instructions +def WriteVLDE8 : SchedWrite; +def WriteVLDE16 : SchedWrite; +def WriteVLDE32 : SchedWrite; +def WriteVLDE64 : SchedWrite; +def WriteVSTE8 : SchedWrite; +def WriteVSTE16 : SchedWrite; +def WriteVSTE32 : SchedWrite; +def WriteVSTE64 : SchedWrite; +// 7.4.1. Vector Unit-Strided Mask +def WriteVLDM : SchedWrite; +def WriteVSTM : SchedWrite; +// 7.5. Vector Strided Instructions +def WriteVLDS8 : SchedWrite; +def WriteVLDS16 : SchedWrite; +def WriteVLDS32 : SchedWrite; +def WriteVLDS64 : SchedWrite; +def WriteVSTS8 : SchedWrite; +def WriteVSTS16 : SchedWrite; +def WriteVSTS32 : SchedWrite; +def WriteVSTS64 : SchedWrite; +// 7.6. Vector Indexed Instructions +def WriteVLDUX8 : SchedWrite; +def WriteVLDUX16 : SchedWrite; +def WriteVLDUX32 : SchedWrite; +def WriteVLDUX64 : SchedWrite; +def WriteVLDOX8 : SchedWrite; +def WriteVLDOX16 : SchedWrite; +def WriteVLDOX32 : SchedWrite; +def WriteVLDOX64 : SchedWrite; +def WriteVSTUX8 : SchedWrite; +def WriteVSTUX16 : SchedWrite; +def WriteVSTUX32 : SchedWrite; +def WriteVSTUX64 : SchedWrite; +def WriteVSTOX8 : SchedWrite; +def WriteVSTOX16 : SchedWrite; +def WriteVSTOX32 : SchedWrite; +def WriteVSTOX64 : SchedWrite; +// 7.7. 
Vector Unit-stride Fault-Only-First Loads +def WriteVLDFF8 : SchedWrite; +def WriteVLDFF16 : SchedWrite; +def WriteVLDFF32 : SchedWrite; +def WriteVLDFF64 : SchedWrite; +// 7.9. Vector Whole Register Instructions +def WriteVLD1R8 : SchedWrite; +def WriteVLD1R16 : SchedWrite; +def WriteVLD1R32 : SchedWrite; +def WriteVLD1R64 : SchedWrite; +def WriteVLD2R8 : SchedWrite; +def WriteVLD2R16 : SchedWrite; +def WriteVLD2R32 : SchedWrite; +def WriteVLD2R64 : SchedWrite; +def WriteVLD4R8 : SchedWrite; +def WriteVLD4R16 : SchedWrite; +def WriteVLD4R32 : SchedWrite; +def WriteVLD4R64 : SchedWrite; +def WriteVLD8R8 : SchedWrite; +def WriteVLD8R16 : SchedWrite; +def WriteVLD8R32 : SchedWrite; +def WriteVLD8R64 : SchedWrite; +def WriteVST1R : SchedWrite; +def WriteVST2R : SchedWrite; +def WriteVST4R : SchedWrite; +def WriteVST8R : SchedWrite; + +// 11. Vector Integer Arithmetic Instructions +// 11.1. Vector Single-Width Integer Add and Subtract +// 11.5. Vector Bitwise Logical Instructions +def WriteVIALUV : SchedWrite; +def WriteVIALUX : SchedWrite; +def WriteVIALUI : SchedWrite; +// 11.2. Vector Widening Integer Add/Subtract +def WriteVIWALUV : SchedWrite; +def WriteVIWALUX : SchedWrite; +def WriteVIWALUI : SchedWrite; +// 11.3. Vector Integer Extension +def WriteVExtV : SchedWrite; +// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions +def WriteVICALUV : SchedWrite; +def WriteVICALUX : SchedWrite; +def WriteVICALUI : SchedWrite; +// 11.6. Vector Single-Width Bit Shift Instructions +def WriteVShiftV : SchedWrite; +def WriteVShiftX : SchedWrite; +def WriteVShiftI : SchedWrite; +// 11.7. Vector Narrowing Integer Right Shift Instructions +def WriteVNShiftV : SchedWrite; +def WriteVNShiftX : SchedWrite; +def WriteVNShiftI : SchedWrite; +// 11.8. Vector Integer Comparison Instructions +// 11.9. Vector Integer Min/Max Instructions +def WriteVICmpV : SchedWrite; +def WriteVICmpX : SchedWrite; +def WriteVICmpI : SchedWrite; +// 11.10. Vector Single-Width Integer Multiply Instructions +def WriteVIMulV : SchedWrite; +def WriteVIMulX : SchedWrite; +// 11.11. Vector Integer Divide Instructions +def WriteVIDivV : SchedWrite; +def WriteVIDivX : SchedWrite; +// 11.12. Vector Widening Integer Multiply Instructions +def WriteVIWMulV : SchedWrite; +def WriteVIWMulX : SchedWrite; +// 11.13. Vector Single-Width Integer Multiply-Add Instructions +def WriteVIMulAddV : SchedWrite; +def WriteVIMulAddX : SchedWrite; +// 11.14. Vector Widening Integer Multiply-Add Instructions +def WriteVIWMulAddV : SchedWrite; +def WriteVIWMulAddX : SchedWrite; +// 11.15. Vector Integer Merge Instructions +def WriteVIMergeV : SchedWrite; +def WriteVIMergeX : SchedWrite; +def WriteVIMergeI : SchedWrite; +// 11.16. Vector Integer Move Instructions +def WriteVIMovV : SchedWrite; +def WriteVIMovX : SchedWrite; +def WriteVIMovI : SchedWrite; + +// 12. Vector Fixed-Point Arithmetic Instructions +// 12.1. Vector Single-Width Saturating Add and Subtract +def WriteVSALUV : SchedWrite; +def WriteVSALUX : SchedWrite; +def WriteVSALUI : SchedWrite; +// 12.2. Vector Single-Width Averaging Add and Subtract +def WriteVAALUV : SchedWrite; +def WriteVAALUX : SchedWrite; +// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation +def WriteVSMulV : SchedWrite; +def WriteVSMulX : SchedWrite; +// 12.4. Vector Single-Width Scaling Shift Instructions +def WriteVSShiftV : SchedWrite; +def WriteVSShiftX : SchedWrite; +def WriteVSShiftI : SchedWrite; +// 12.5. 
Vector Narrowing Fixed-Point Clip Instructions +def WriteVNClipV : SchedWrite; +def WriteVNClipX : SchedWrite; +def WriteVNClipI : SchedWrite; + +// 13. Vector Floating-Point Instructions +// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions +def WriteVFALUV : SchedWrite; +def WriteVFALUF : SchedWrite; +// 13.3. Vector Widening Floating-Point Add/Subtract Instructions +def WriteVFWALUV : SchedWrite; +def WriteVFWALUF : SchedWrite; +// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions +def WriteVFMulV : SchedWrite; +def WriteVFMulF : SchedWrite; +def WriteVFDivV : SchedWrite; +def WriteVFDivF : SchedWrite; +// 13.5. Vector Widening Floating-Point Multiply +def WriteVFWMulV : SchedWrite; +def WriteVFWMulF : SchedWrite; +// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions +def WriteVFMulAddV : SchedWrite; +def WriteVFMulAddF : SchedWrite; +// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions +def WriteVFWMulAddV : SchedWrite; +def WriteVFWMulAddF : SchedWrite; +// 13.8. Vector Floating-Point Square-Root Instruction +def WriteVFSqrtV : SchedWrite; +// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction +// 13.10. Vector Floating-Point Reciprocal Estimate Instruction +def WriteVFRecpV : SchedWrite; +// 13.11. Vector Floating-Point MIN/MAX Instructions +// 13.13. Vector Floating-Point Compare Instructions +def WriteVFCmpV : SchedWrite; +def WriteVFCmpF : SchedWrite; +// 13.12. Vector Floating-Point Sign-Injection Instructions +def WriteVFSgnjV : SchedWrite; +def WriteVFSgnjF : SchedWrite; +// 13.14. Vector Floating-Point Classify Instruction +def WriteVFClassV : SchedWrite; +// 13.15. Vector Floating-Point Merge Instruction +def WriteVFMergeV : SchedWrite; +// 13.16. Vector Floating-Point Move Instruction +def WriteVFMovV : SchedWrite; +// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions +def WriteVFCvtIToFV : SchedWrite; +def WriteVFCvtFToIV : SchedWrite; +def WriteVFCvtFToFV : SchedWrite; +// 13.18. Widening Floating-Point/Integer Type-Convert Instructions +def WriteVFWCvtIToFV : SchedWrite; +def WriteVFWCvtFToIV : SchedWrite; +def WriteVFWCvtFToFV : SchedWrite; +// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions +def WriteVFNCvtIToFV : SchedWrite; +def WriteVFNCvtFToIV : SchedWrite; +def WriteVFNCvtFToFV : SchedWrite; + +// 14. Vector Reduction Operations +// 14.1. Vector Single-Width Integer Reduction Instructions +def WriteVIRedV : SchedWrite; +// 14.2. Vector Widening Integer Reduction Instructions +def WriteVIWRedV : SchedWrite; +// 14.3. Vector Single-Width Floating-Point Reduction Instructions +def WriteVFRedV : SchedWrite; +def WriteVFRedOV : SchedWrite; +// 14.4. Vector Widening Floating-Point Reduction Instructions +def WriteVFWRedV : SchedWrite; +def WriteVFWRedOV : SchedWrite; + +// 15. Vector Mask Instructions +// 15.1. Vector Mask-Register Logical Instructions +def WriteVMALUV : SchedWrite; +// 15.2. Vector Mask Population Count +def WriteVMPopV : SchedWrite; +// 15.3. Vector Find-First-Set Mask Bit +def WriteVMFFSV : SchedWrite; +// 15.4. Vector Set-Before-First Mask Bit +// 15.5. Vector Set-Including-First Mask Bit +// 15.6. Vector Set-only-First Mask Bit +def WriteVMSFSV : SchedWrite; +// 15.8. Vector Iota Instruction +def WriteVMIotV : SchedWrite; +// 15.9. Vector Element Index Instruction +def WriteVMIdxV : SchedWrite; + +// 16. Vector Permutation Instructions +// 16.1. 
Integer Scalar Move Instructions +def WriteVIMovVX : SchedWrite; +def WriteVIMovXV : SchedWrite; +// 16.2. Floating-Point Scalar Move Instructions +def WriteVFMovVF : SchedWrite; +def WriteVFMovFV : SchedWrite; +// 16.3. Vector Slide Instructions +def WriteVISlideX : SchedWrite; +def WriteVISlideI : SchedWrite; +def WriteVISlide1X : SchedWrite; +def WriteVFSlide1F : SchedWrite; +// 16.4. Vector Register Gather Instructions +def WriteVGatherV : SchedWrite; +def WriteVGatherX : SchedWrite; +def WriteVGatherI : SchedWrite; +// 16.5. Vector Compress Instruction +def WriteVCompressV : SchedWrite; +// 16.6. Whole Vector Register Move +def WriteVMov1V : SchedWrite; +def WriteVMov2V : SchedWrite; +def WriteVMov4V : SchedWrite; +def WriteVMov8V : SchedWrite; + +//===----------------------------------------------------------------------===// +/// Define scheduler resources associated with use operands. + +// 7. Vector Loads and Stores +def ReadVLDX : SchedRead; +def ReadVSTX : SchedRead; +// 7.4. Vector Unit-Stride Instructions +def ReadVSTE8V : SchedRead; +def ReadVSTE16V : SchedRead; +def ReadVSTE32V : SchedRead; +def ReadVSTE64V : SchedRead; +// 7.4.1. Vector Unit-Strided Mask +def ReadVSTM : SchedRead; +// 7.5. Vector Strided Instructions +def ReadVLDSX : SchedRead; +def ReadVSTSX : SchedRead; +def ReadVSTS8V : SchedRead; +def ReadVSTS16V : SchedRead; +def ReadVSTS32V : SchedRead; +def ReadVSTS64V : SchedRead; +// 7.6. Vector Indexed Instructions +def ReadVLDUXV : SchedRead; +def ReadVLDOXV : SchedRead; +def ReadVSTUX8 : SchedRead; +def ReadVSTUX16 : SchedRead; +def ReadVSTUX32 : SchedRead; +def ReadVSTUX64 : SchedRead; +def ReadVSTUXV : SchedRead; +def ReadVSTUX8V : SchedRead; +def ReadVSTUX16V : SchedRead; +def ReadVSTUX32V : SchedRead; +def ReadVSTUX64V : SchedRead; +def ReadVSTOX8 : SchedRead; +def ReadVSTOX16 : SchedRead; +def ReadVSTOX32 : SchedRead; +def ReadVSTOX64 : SchedRead; +def ReadVSTOXV : SchedRead; +def ReadVSTOX8V : SchedRead; +def ReadVSTOX16V : SchedRead; +def ReadVSTOX32V : SchedRead; +def ReadVSTOX64V : SchedRead; +// 7.9. Vector Whole Register Instructions +def ReadVST1R : SchedRead; +def ReadVST2R : SchedRead; +def ReadVST4R : SchedRead; +def ReadVST8R : SchedRead; + +// 11. Vector Integer Arithmetic Instructions +// 11.1. Vector Single-Width Integer Add and Subtract +// 11.5. Vector Bitwise Logical Instructions +def ReadVIALUV : SchedRead; +def ReadVIALUX : SchedRead; +// 11.2. Vector Widening Integer Add/Subtract +def ReadVIWALUV : SchedRead; +def ReadVIWALUX : SchedRead; +// 11.3. Vector Integer Extension +def ReadVExtV : SchedRead; +// 11.4. Vector Integer Arithmetic with Carry or Borrow Instructions +def ReadVIALUCV : SchedRead; +def ReadVIALUCX : SchedRead; +// 11.6. Vector Single-Width Bit Shift Instructions +def ReadVShiftV : SchedRead; +def ReadVShiftX : SchedRead; +// 11.7. Vector Narrowing Integer Right Shift Instructions +def ReadVNShiftV : SchedRead; +def ReadVNShiftX : SchedRead; +// 11.8. Vector Integer Comparison Instructions +// 11.9. Vector Integer Min/Max Instructions +def ReadVICmpV : SchedRead; +def ReadVICmpX : SchedRead; +// 11.10. Vector Single-Width Integer Multiply Instructions +def ReadVIMulV : SchedRead; +def ReadVIMulX : SchedRead; +// 11.11. Vector Integer Divide Instructions +def ReadVIDivV : SchedRead; +def ReadVIDivX : SchedRead; +// 11.12. Vector Widening Integer Multiply Instructions +def ReadVIWMulV : SchedRead; +def ReadVIWMulX : SchedRead; +// 11.13. 
Vector Single-Width Integer Multiply-Add Instructions
+def ReadVIMulAddV : SchedRead;
+def ReadVIMulAddX : SchedRead;
+// 11.14. Vector Widening Integer Multiply-Add Instructions
+def ReadVIWMulAddV : SchedRead;
+def ReadVIWMulAddX : SchedRead;
+// 11.15. Vector Integer Merge Instructions
+def ReadVIMergeV : SchedRead;
+def ReadVIMergeX : SchedRead;
+// 11.16. Vector Integer Move Instructions
+def ReadVIMovV : SchedRead;
+def ReadVIMovX : SchedRead;
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+// 12.1. Vector Single-Width Saturating Add and Subtract
+def ReadVSALUV : SchedRead;
+def ReadVSALUX : SchedRead;
+// 12.2. Vector Single-Width Averaging Add and Subtract
+def ReadVAALUV : SchedRead;
+def ReadVAALUX : SchedRead;
+// 12.3. Vector Single-Width Fractional Multiply with Rounding and Saturation
+def ReadVSMulV : SchedRead;
+def ReadVSMulX : SchedRead;
+// 12.4. Vector Single-Width Scaling Shift Instructions
+def ReadVSShiftV : SchedRead;
+def ReadVSShiftX : SchedRead;
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+def ReadVNClipV : SchedRead;
+def ReadVNClipX : SchedRead;
+
+// 13. Vector Floating-Point Instructions
+// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+def ReadVFALUV : SchedRead;
+def ReadVFALUF : SchedRead;
+// 13.3. Vector Widening Floating-Point Add/Subtract Instructions
+def ReadVFWALUV : SchedRead;
+def ReadVFWALUF : SchedRead;
+// 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+def ReadVFMulV : SchedRead;
+def ReadVFMulF : SchedRead;
+def ReadVFDivV : SchedRead;
+def ReadVFDivF : SchedRead;
+// 13.5. Vector Widening Floating-Point Multiply
+def ReadVFWMulV : SchedRead;
+def ReadVFWMulF : SchedRead;
+// 13.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+def ReadVFMulAddV : SchedRead;
+def ReadVFMulAddF : SchedRead;
+// 13.7. Vector Widening Floating-Point Fused Multiply-Add Instructions
+def ReadVFWMulAddV : SchedRead;
+def ReadVFWMulAddF : SchedRead;
+// 13.8. Vector Floating-Point Square-Root Instruction
+def ReadVFSqrtV : SchedRead;
+// 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction
+// 13.10. Vector Floating-Point Reciprocal Estimate Instruction
+def ReadVFRecpV : SchedRead;
+// 13.11. Vector Floating-Point MIN/MAX Instructions
+// 13.13. Vector Floating-Point Compare Instructions
+def ReadVFCmpV : SchedRead;
+def ReadVFCmpF : SchedRead;
+// 13.12. Vector Floating-Point Sign-Injection Instructions
+def ReadVFSgnjV : SchedRead;
+def ReadVFSgnjF : SchedRead;
+// 13.14. Vector Floating-Point Classify Instruction
+def ReadVFClassV : SchedRead;
+// 13.15. Vector Floating-Point Merge Instruction
+def ReadVFMergeV : SchedRead;
+def ReadVFMergeF : SchedRead;
+// 13.16. Vector Floating-Point Move Instruction
+def ReadVFMovF : SchedRead;
+// 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions
+def ReadVFCvtIToFV : SchedRead;
+def ReadVFCvtFToIV : SchedRead;
+// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
+def ReadVFWCvtIToFV : SchedRead;
+def ReadVFWCvtFToIV : SchedRead;
+def ReadVFWCvtFToFV : SchedRead;
+// 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions
+def ReadVFNCvtIToFV : SchedRead;
+def ReadVFNCvtFToIV : SchedRead;
+def ReadVFNCvtFToFV : SchedRead;
+
+// 14. Vector Reduction Operations
+// 14.1. Vector Single-Width Integer Reduction Instructions
+def ReadVIRedV : SchedRead;
+def ReadVIRedV0 : SchedRead;
+// 14.2. Vector Widening Integer Reduction Instructions
+def ReadVIWRedV : SchedRead;
+def ReadVIWRedV0 : SchedRead;
+// 14.3. Vector Single-Width Floating-Point Reduction Instructions
+def ReadVFRedV : SchedRead;
+def ReadVFRedV0 : SchedRead;
+def ReadVFRedOV : SchedRead;
+def ReadVFRedOV0 : SchedRead;
+// 14.4. Vector Widening Floating-Point Reduction Instructions
+def ReadVFWRedV : SchedRead;
+def ReadVFWRedV0 : SchedRead;
+def ReadVFWRedOV : SchedRead;
+def ReadVFWRedOV0 : SchedRead;
+
+// 15. Vector Mask Instructions
+// 15.1. Vector Mask-Register Logical Instructions
+def ReadVMALUV : SchedRead;
+// 15.2. Vector Mask Population Count
+def ReadVMPopV : SchedRead;
+// 15.3. Vector Find-First-Set Mask Bit
+def ReadVMFFSV : SchedRead;
+// 15.4. Vector Set-Before-First Mask Bit
+// 15.5. Vector Set-Including-First Mask Bit
+// 15.6. Vector Set-only-First Mask Bit
+def ReadVMSFSV : SchedRead;
+// 15.8. Vector Iota Instruction
+def ReadVMIotV : SchedRead;
+
+// 16. Vector Permutation Instructions
+// 16.1. Integer Scalar Move Instructions
+def ReadVIMovVX : SchedRead;
+def ReadVIMovXV : SchedRead;
+def ReadVIMovXX : SchedRead;
+// 16.2. Floating-Point Scalar Move Instructions
+def ReadVFMovVF : SchedRead;
+def ReadVFMovFV : SchedRead;
+def ReadVFMovFX : SchedRead;
+// 16.3. Vector Slide Instructions
+def ReadVISlideV : SchedRead;
+def ReadVISlideX : SchedRead;
+def ReadVFSlideV : SchedRead;
+def ReadVFSlideF : SchedRead;
+// 16.4. Vector Register Gather Instructions
+def ReadVGatherV : SchedRead;
+def ReadVGatherX : SchedRead;
+// 16.5. Vector Compress Instruction
+def ReadVCompressV : SchedRead;
+// 16.6. Whole Vector Register Move
+def ReadVMov1V : SchedRead;
+def ReadVMov2V : SchedRead;
+def ReadVMov4V : SchedRead;
+def ReadVMov8V : SchedRead;
+
+// Others
+def ReadVMask : SchedRead;
+
+//===----------------------------------------------------------------------===//
+/// Define default scheduler resources for V.
+
+multiclass UnsupportedSchedV {
+let Unsupported = true in {
+
+// 7. Vector Loads and Stores
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 12. Vector Integer Arithmetic Instructions
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 14. Vector Floating-Point Instructions
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 15. Vector Reduction Operations
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 16. Vector Mask Instructions
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 17. Vector Permutation Instructions
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 12. Vector Integer Arithmetic Instructions
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 14. Vector Floating-Point Instructions
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 16. Vector Mask Instructions
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// Others
+def : ReadAdvance;
+
+} // Unsupported
+} // UnsupportedSchedV
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3a64b3460030..a69850896436 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6704,17 +6704,21 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
 
   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
     SDValue Ptr = MemIntr->getBasePtr();
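The hunk below fixes the element bookkeeping for subvector broadcasts: the constant-pool entry may be wider than the broadcast subvector, so element counts must be derived from the subvector width rather than the constant width. A minimal sketch of that arithmetic, with assumed example widths (the names mirror the patch, the numbers are invented):

    #include <cassert>
    void subvectorBroadcastSizesSketch() {
      unsigned SizeInBits = 512;       // width of the value being analyzed
      unsigned CstSizeInBits = 256;    // width of the constant-pool entry
      unsigned SubVecSizeInBits = 128; // width actually loaded and broadcast
      unsigned CstEltSizeInBits = 32;  // scalar element width
      // The new guards: both widths must be whole multiples of the subvector.
      assert(CstSizeInBits % SubVecSizeInBits == 0 &&
             SizeInBits % SubVecSizeInBits == 0);
      unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits; // 4 elements
      unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;       // 4 repeats
      (void)NumSubElts;
      (void)NumSubVecs;
    }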
+    // The source constant may be larger than the subvector broadcast,
+    // ensure we extract the correct subvector constants.
     if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
       Type *CstTy = Cst->getType();
       unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
-      if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+      unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
+      if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
+          (SizeInBits % SubVecSizeInBits) != 0)
         return false;
-      unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
-      unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
-      unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+      unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
+      unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
+      unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
       APInt UndefSubElts(NumSubElts, 0);
       SmallVector SubEltBits(NumSubElts * NumSubVecs,
-                             APInt(SubEltSizeInBits, 0));
+                             APInt(CstEltSizeInBits, 0));
       for (unsigned i = 0; i != NumSubElts; ++i) {
         if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
                                  UndefSubElts, i))
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index e83e1e74ff52..ba00e7da81f9 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -708,6 +708,19 @@ class BinOpRM opcode, string mnemonic, X86TypeInfo typeinfo,
                mnemonic, "{$src2, $src1|$src1, $src2}", pattern>,
     Sched<[sched.Folded, sched.ReadAfterFold]>;
 
+// BinOpRM - Instructions like "adc reg, reg, [mem]".
+// There is an implicit register read at the end of the operand sequence.
+class BinOpRM_ImplicitUse opcode, string mnemonic, X86TypeInfo typeinfo,
+                          dag outlist, X86FoldableSchedWrite sched,
+                          list pattern>
+  : ITy,
+    Sched<[sched.Folded, sched.ReadAfterFold,
+           // base, scale, index, offset, segment.
+           ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+           // implicit register read.
+           sched.ReadAfterFold]>;
+
 // BinOpRM_F - Instructions like "cmp reg, [mem]".
 class BinOpRM_F opcode, string mnemonic, X86TypeInfo typeinfo,
                 SDNode opnode>
@@ -725,7 +738,7 @@ class BinOpRM_RF opcode, string mnemonic, X86TypeInfo typeinfo,
 // BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
 class BinOpRM_RFF opcode, string mnemonic, X86TypeInfo typeinfo,
                   SDNode opnode>
-  : BinOpRM;
@@ -805,7 +818,11 @@ class BinOpMR_RMW opcode, string mnemonic, X86TypeInfo typeinfo,
                   SDNode opnode>
   : BinOpMR,
-            Sched<[WriteALURMW]>;
+            (implicit EFLAGS)]>, Sched<[WriteALURMW,
+                                        // base, scale, index, offset, segment
+                                        ReadDefault, ReadDefault, ReadDefault,
+                                        ReadDefault, ReadDefault,
+                                        WriteALU.ReadAfterFold]>; // reg
 
 // BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
 class BinOpMR_RMW_FF opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -813,7 +830,12 @@ class BinOpMR_RMW_FF opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpMR,
-            Sched<[WriteADCRMW]>;
+            (implicit EFLAGS)]>, Sched<[WriteADCRMW,
+                                        // base, scale, index, offset, segment
+                                        ReadDefault, ReadDefault, ReadDefault,
+                                        ReadDefault, ReadDefault,
+                                        WriteALU.ReadAfterFold, // reg
+                                        WriteALU.ReadAfterFold]>; // EFLAGS
 
 // BinOpMR_F - Instructions like "cmp [mem], reg".
 class BinOpMR_F opcode, string mnemonic, X86TypeInfo typeinfo,
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 762317425026..91b16ec66ee3 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/ValueHandle.h"
@@ -250,10 +251,12 @@ Value *AA::getWithType(Value &V, Type &Ty) {
       return Constant::getNullValue(&Ty);
     if (C->getType()->isPointerTy() && Ty.isPointerTy())
       return ConstantExpr::getPointerCast(C, &Ty);
-    if (C->getType()->isIntegerTy() && Ty.isIntegerTy())
-      return ConstantExpr::getTrunc(C, &Ty, /* OnlyIfReduced */ true);
-    if (C->getType()->isFloatingPointTy() && Ty.isFloatingPointTy())
-      return ConstantExpr::getFPTrunc(C, &Ty, /* OnlyIfReduced */ true);
+    if (C->getType()->getPrimitiveSizeInBits() >= Ty.getPrimitiveSizeInBits()) {
+      if (C->getType()->isIntegerTy() && Ty.isIntegerTy())
+        return ConstantExpr::getTrunc(C, &Ty, /* OnlyIfReduced */ true);
+      if (C->getType()->isFloatingPointTy() && Ty.isFloatingPointTy())
+        return ConstantExpr::getFPTrunc(C, &Ty, /* OnlyIfReduced */ true);
+    }
   }
   return nullptr;
 }
@@ -1023,7 +1026,7 @@ bool Attributor::checkForAllUses(function_ref Pred,
   while (!Worklist.empty()) {
     const Use *U = Worklist.pop_back_val();
-    if (!Visited.insert(U).second)
+    if (isa<PHINode>(U->getUser()) && !Visited.insert(U).second)
       continue;
     LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
                       << *U->getUser() << "\n");
@@ -1925,49 +1928,85 @@ void Attributor::createShallowWrapper(Function &F) {
   NumFnShallowWrappersCreated++;
 }
 
+bool Attributor::isInternalizable(Function &F) {
+  if (F.isDeclaration() || F.hasLocalLinkage() ||
+      GlobalValue::isInterposableLinkage(F.getLinkage()))
+    return false;
+  return true;
+}
+
 Function *Attributor::internalizeFunction(Function &F, bool Force) {
   if (!AllowDeepWrapper && !Force)
     return nullptr;
-  if (F.isDeclaration() || F.hasLocalLinkage() ||
-      GlobalValue::isInterposableLinkage(F.getLinkage()))
+  if (!isInternalizable(F))
     return nullptr;
 
-  Module &M = *F.getParent();
-  FunctionType *FnTy = F.getFunctionType();
+  SmallPtrSet FnSet = {&F};
+  DenseMap InternalizedFns;
+  internalizeFunctions(FnSet, InternalizedFns);
 
-  // create a copy of the current function
-  Function *Copied = Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(),
-                                      F.getName() + ".internalized");
-  ValueToValueMapTy VMap;
-  auto *NewFArgIt = Copied->arg_begin();
-  for (auto &Arg : F.args()) {
-    auto ArgName = Arg.getName();
-    NewFArgIt->setName(ArgName);
-    VMap[&Arg] = &(*NewFArgIt++);
+  return InternalizedFns[&F];
+}
+
+bool Attributor::internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
+                                      DenseMap<Function *, Function *> &FnMap) {
+  for (Function *F : FnSet)
+    if (!Attributor::isInternalizable(*F))
+      return false;
+
+  FnMap.clear();
+  // Generate the internalized version of each function.
+  for (Function *F : FnSet) {
+    Module &M = *F->getParent();
+    FunctionType *FnTy = F->getFunctionType();
+
+    // Create a copy of the current function
+    Function *Copied =
+        Function::Create(FnTy, F->getLinkage(), F->getAddressSpace(),
+                         F->getName() + ".internalized");
+    ValueToValueMapTy VMap;
+    auto *NewFArgIt = Copied->arg_begin();
+    for (auto &Arg : F->args()) {
+      auto ArgName = Arg.getName();
+      NewFArgIt->setName(ArgName);
+      VMap[&Arg] = &(*NewFArgIt++);
+    }
+    SmallVector Returns;
+
+    // Copy the body of the original function to the new one
+    CloneFunctionInto(Copied, F, VMap,
+                      CloneFunctionChangeType::LocalChangesOnly, Returns);
+
+    // Set the linkage and visibility late as CloneFunctionInto has some
+    // implicit requirements.
+    Copied->setVisibility(GlobalValue::DefaultVisibility);
+    Copied->setLinkage(GlobalValue::PrivateLinkage);
+
+    // Copy metadata
+    SmallVector, 1> MDs;
+    F->getAllMetadata(MDs);
+    for (auto MDIt : MDs)
+      if (!Copied->hasMetadata())
+        Copied->addMetadata(MDIt.first, *MDIt.second);
+
+    M.getFunctionList().insert(F->getIterator(), Copied);
+    Copied->setDSOLocal(true);
+    FnMap[F] = Copied;
+  }
-  SmallVector Returns;
 
-  // Copy the body of the original function to the new one
-  CloneFunctionInto(Copied, &F, VMap, CloneFunctionChangeType::LocalChangesOnly,
-                    Returns);
+  // Replace all uses of the old function with the new internalized function
+  // unless the caller is a function that was just internalized.
+  for (Function *F : FnSet) {
+    auto &InternalizedFn = FnMap[F];
+    auto IsNotInternalized = [&](Use &U) -> bool {
+      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
+        return !FnMap.lookup(CB->getCaller());
+      return false;
+    };
+    F->replaceUsesWithIf(InternalizedFn, IsNotInternalized);
+  }
 
-  // Set the linakage and visibility late as CloneFunctionInto has some implicit
-  // requirements.
-  Copied->setVisibility(GlobalValue::DefaultVisibility);
-  Copied->setLinkage(GlobalValue::PrivateLinkage);
-
-  // Copy metadata
-  SmallVector, 1> MDs;
-  F.getAllMetadata(MDs);
-  for (auto MDIt : MDs)
-    if (!Copied->hasMetadata())
-      Copied->addMetadata(MDIt.first, *MDIt.second);
-
-  M.getFunctionList().insert(F.getIterator(), Copied);
-  F.replaceAllUsesWith(Copied);
-  Copied->setDSOLocal(true);
-
-  return Copied;
+  return true;
 }
 
 bool Attributor::isValidFunctionSignatureRewrite(
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 98ce286d5139..3529923a9082 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1149,19 +1149,23 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
     return true;
   };
 
+  /// Helper struct, will support ranges eventually.
+  struct OffsetInfo {
+    int64_t Offset = AA::PointerInfo::OffsetAndSize::Unknown;
+
+    bool operator==(const OffsetInfo &OI) const { return Offset == OI.Offset; }
+  };
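The Unknown sentinel above gives the PHI handling in the next hunk a simple merge rule: a known byte offset survives only while every incoming value agrees. A toy model of that rule (hypothetical helper, not part of the patch):

    #include <cstdint>
    constexpr int64_t UnknownOffset = -1; // stands in for OffsetAndSize::Unknown
    int64_t mergePhiOffset(int64_t Known, int64_t Incoming) {
      // Agreeing known offsets are kept; any disagreement or an
      // already-unknown input conservatively degrades to Unknown.
      if (Known == UnknownOffset || Incoming == UnknownOffset)
        return UnknownOffset;
      return Known == Incoming ? Known : UnknownOffset;
    }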
+
   /// See AbstractAttribute::updateImpl(...).
   ChangeStatus updateImpl(Attributor &A) override {
     using namespace AA::PointerInfo;
     State S = getState();
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
     Value &AssociatedValue = getAssociatedValue();
-    struct OffsetInfo {
-      int64_t Offset = 0;
-    };
 
     const DataLayout &DL = A.getDataLayout();
     DenseMap OffsetInfoMap;
-    OffsetInfoMap[&AssociatedValue] = {};
+    OffsetInfoMap[&AssociatedValue] = OffsetInfo{0};
 
     auto HandlePassthroughUser = [&](Value *Usr, OffsetInfo &PtrOI,
                                      bool &Follow) {
@@ -1219,8 +1223,48 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
       Follow = true;
       return true;
     }
-    if (isa(Usr) || isa(Usr) || isa(Usr))
+    if (isa(Usr) || isa(Usr))
       return HandlePassthroughUser(Usr, PtrOI, Follow);
+
+    // For PHIs we need to take care of the recurrence explicitly as the value
+    // might change while we iterate through a loop. For now, we give up if
+    // the PHI is not invariant.
+    if (isa<PHINode>(Usr)) {
+      // Check if the PHI is invariant (so far).
+      OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+      if (UsrOI == PtrOI)
+        return true;
+
+      // Check if the PHI operand already has an unknown offset as we can't
+      // improve on that anymore.
+      if (PtrOI.Offset == OffsetAndSize::Unknown) {
+        UsrOI = PtrOI;
+        Follow = true;
+        return true;
+      }
+
+      // Check if the PHI operand is not dependent on the PHI itself.
+      APInt Offset(DL.getIndexTypeSizeInBits(AssociatedValue.getType()), 0);
+      if (&AssociatedValue == CurPtr->stripAndAccumulateConstantOffsets(
+                                  DL, Offset, /* AllowNonInbounds */ true)) {
+        if (Offset != PtrOI.Offset) {
+          LLVM_DEBUG(dbgs()
+                     << "[AAPointerInfo] PHI operand pointer offset mismatch "
+                     << *CurPtr << " in " << *Usr << "\n");
+          return false;
+        }
+        return HandlePassthroughUser(Usr, PtrOI, Follow);
+      }
+
+      // TODO: Approximate in case we know the direction of the recurrence.
+      LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand is too complex "
+                        << *CurPtr << " in " << *Usr << "\n");
+      UsrOI = PtrOI;
+      UsrOI.Offset = OffsetAndSize::Unknown;
+      Follow = true;
+      return true;
+    }
+
     if (auto *LoadI = dyn_cast<LoadInst>(Usr))
       return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr,
                           AccessKind::AK_READ, PtrOI.Offset, Changed,
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index b80349352719..d6b97915ede6 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4176,28 +4176,32 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
     ORE.emit([&]() {
       OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
       return ORA << "Could not internalize function. "
-                 << "Some optimizations may not be possible.";
+                 << "Some optimizations may not be possible. [OMP140]";
     });
   };
 
   // Create internal copies of each function if this is a kernel Module. This
   // allows interprocedural passes to see every call edge.
-  DenseSet InternalizedFuncs;
-  if (isOpenMPDevice(M))
+  DenseMap<Function *, Function *> InternalizedMap;
+  if (isOpenMPDevice(M)) {
+    SmallPtrSet InternalizeFns;
     for (Function &F : M)
       if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
           !DisableInternalization) {
-        if (Attributor::internalizeFunction(F, /* Force */ true)) {
-          InternalizedFuncs.insert(&F);
+        if (Attributor::isInternalizable(F)) {
+          InternalizeFns.insert(&F);
         } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
           EmitRemark(F);
        }
      }
+    Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
+  }
+
+  // Look at every function in the Module unless it was internalized.
   SmallVector SCC;
   for (Function &F : M)
-    if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
+    if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
       SCC.push_back(&F);
 
   if (SCC.empty())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2b0ef0c5f2cc..c5e14ebf3ae3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5158,6 +5158,83 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
   if (!isa<Constant>(Op1) && Op1Min == Op1Max)
     return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
 
+  // Don't break up a clamp pattern -- (min(max X, Y), Z) -- by replacing a
+  // min/max canonical compare with some other compare. That could lead to
+  // conflict with select canonicalization and infinite looping.
+  // FIXME: This constraint may go away if min/max intrinsics are canonical.
+  auto isMinMaxCmp = [&](Instruction &Cmp) {
+    if (!Cmp.hasOneUse())
+      return false;
+    Value *A, *B;
+    SelectPatternFlavor SPF = matchSelectPattern(Cmp.user_back(), A, B).Flavor;
+    if (!SelectPatternResult::isMinOrMax(SPF))
+      return false;
+    return match(Op0, m_MaxOrMin(m_Value(), m_Value())) ||
+           match(Op1, m_MaxOrMin(m_Value(), m_Value()));
+  };
+  if (!isMinMaxCmp(I)) {
+    switch (Pred) {
+    default:
+      break;
+    case ICmpInst::ICMP_ULT: {
+      if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      const APInt *CmpC;
+      if (match(Op1, m_APInt(CmpC))) {
+        // A <u C -> A == C-1 if min(A)+1 == C
+        if (*CmpC == Op0Min + 1)
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                              ConstantInt::get(Op1->getType(), *CmpC - 1));
+        // X <u C --> X == 0, if the number of zero bits in the bottom of X
+        // exceeds the log2 of C.
+        if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                              Constant::getNullValue(Op1->getType()));
+      }
+      break;
+    }
+    case ICmpInst::ICMP_UGT: {
+      if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      const APInt *CmpC;
+      if (match(Op1, m_APInt(CmpC))) {
+        // A >u C -> A == C+1 if max(a)-1 == C
+        if (*CmpC == Op0Max - 1)
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                              ConstantInt::get(Op1->getType(), *CmpC + 1));
+        // X >u C --> X != 0, if the number of zero bits in the bottom of X
+        // exceeds the log2 of C.
+        if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
+          return new ICmpInst(ICmpInst::ICMP_NE, Op0,
+                              Constant::getNullValue(Op1->getType()));
+      }
+      break;
+    }
+    case ICmpInst::ICMP_SLT: {
+      if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      const APInt *CmpC;
+      if (match(Op1, m_APInt(CmpC))) {
+        if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                              ConstantInt::get(Op1->getType(), *CmpC - 1));
+      }
+      break;
+    }
+    case ICmpInst::ICMP_SGT: {
+      if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      const APInt *CmpC;
+      if (match(Op1, m_APInt(CmpC))) {
+        if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                              ConstantInt::get(Op1->getType(), *CmpC + 1));
+      }
+      break;
+    }
+    }
+  }
+
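Two hand-evaluated instances of the ULT folds added above, with assumed known-bits facts; each function body is the replacement compare the transform would emit:

    #include <cstdint>
    bool ultBecomesEq(uint32_t X) {
      // Assuming analysis proved min(X) == 7: (X < 8u) holds only at X == 7,
      // so the compare is rewritten to an equality (*CmpC == Op0Min + 1).
      return X == 7u;
    }
    bool ultOfMultipleOf16(uint32_t X) {
      // Assuming X has >= 4 known trailing zero bits: (X < 16u) holds only at
      // X == 0 (countMinTrailingZeros() >= ceilLogBase2(16)).
      return X == 0u;
    }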
   // Based on the range information we know about the LHS, see if we can
   // simplify this comparison. For example, (x&4) < 8 is always true.
   switch (Pred) {
@@ -5219,21 +5296,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
       return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
     if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
       return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-    if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
-      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
-    const APInt *CmpC;
-    if (match(Op1, m_APInt(CmpC))) {
-      // A <u C -> A == C-1 if min(A)+1 == C
-      if (*CmpC == Op0Min + 1)
-        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                            ConstantInt::get(Op1->getType(), *CmpC - 1));
-      // X <u C --> X == 0, if the number of zero bits in the bottom of X
-      // exceeds the log2 of C.
-      if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
-        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                            Constant::getNullValue(Op1->getType()));
-    }
     break;
   }
   case ICmpInst::ICMP_UGT: {
@@ -5241,21 +5303,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
       return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
     if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
       return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-    if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
-      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
-    const APInt *CmpC;
-    if (match(Op1, m_APInt(CmpC))) {
-      // A >u C -> A == C+1 if max(a)-1 == C
-      if (*CmpC == Op0Max - 1)
-        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                            ConstantInt::get(Op1->getType(), *CmpC + 1));
-      // X >u C --> X != 0, if the number of zero bits in the bottom of X
-      // exceeds the log2 of C.
-      if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
-        return new ICmpInst(ICmpInst::ICMP_NE, Op0,
-                            Constant::getNullValue(Op1->getType()));
-    }
     break;
   }
   case ICmpInst::ICMP_SLT: {
@@ -5263,14 +5310,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
       return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
     if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(C)
       return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-    if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
-      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-    const APInt *CmpC;
-    if (match(Op1, m_APInt(CmpC))) {
-      if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
-        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                            ConstantInt::get(Op1->getType(), *CmpC - 1));
-    }
     break;
   }
   case ICmpInst::ICMP_SGT: {
@@ -5278,14 +5317,6 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
       return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
     if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
       return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-    if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
-      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-    const APInt *CmpC;
-    if (match(Op1, m_APInt(CmpC))) {
-      if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
-        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                            ConstantInt::get(Op1->getType(), *CmpC + 1));
-    }
     break;
   }
   case ICmpInst::ICMP_SGE:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index a8474e27383d..80abc775299a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -261,8 +261,8 @@ private:
 
 bool PointerReplacer::collectUsers(Instruction &I) {
   for (auto U : I.users()) {
-    Instruction *Inst = cast<Instruction>(&*U);
-    if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
+    auto *Inst = cast<Instruction>(&*U);
+    if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
       Worklist.insert(Load);
@@ -270,7 +270,9 @@ bool PointerReplacer::collectUsers(Instruction &I) {
       Worklist.insert(Inst);
       if (!collectUsers(*Inst))
         return false;
-    } else if (isa<MemTransferInst>(Inst)) {
+    } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
+      if (MI->isVolatile())
+        return false;
       Worklist.insert(Inst);
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index ce2b913dba61..5bbc3c87ca4f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3230,7 +3230,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
     Value *Mask;
     if (match(TrueVal, m_Zero()) &&
         match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
-                                     m_CombineOr(m_Undef(), m_Zero())))) {
+                                     m_CombineOr(m_Undef(), m_Zero()))) &&
+        (CondVal->getType() == Mask->getType())) {
       // We can remove the select by ensuring the load zeros all lanes the
       // select would have. We determine this by proving there is no overlap
       // between the load and select masks.
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index b585818af595..404852f1dd4d 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1981,6 +1981,9 @@ class LSRInstance {
   /// IV users that belong to profitable IVChains.
   SmallPtrSet IVIncSet;
 
+  /// Induction variables that were generated and inserted by the SCEV Expander.
+  SmallVector ScalarEvolutionIVs;
+
   void OptimizeShadowIV();
   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
@@ -2085,6 +2088,9 @@ public:
                TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
 
   bool getChanged() const { return Changed; }
+  const SmallVectorImpl &getScalarEvolutionIVs() const {
+    return ScalarEvolutionIVs;
+  }
 
   void print_factors_and_types(raw_ostream &OS) const;
   void print_fixups(raw_ostream &OS) const;
@@ -5589,6 +5595,11 @@ void LSRInstance::ImplementSolution(
       GenerateIVChain(Chain, Rewriter, DeadInsts);
     Changed = true;
   }
+
+  for (const WeakVH &IV : Rewriter.getInsertedIVs())
+    if (IV && dyn_cast<Instruction>(&*IV)->getParent())
+      ScalarEvolutionIVs.push_back(IV);
+
   // Clean up after ourselves. This must be done before deleting any
   // instructions.
   Rewriter.clear();
@@ -5859,87 +5870,399 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved();
 }
 
-using EqualValues = SmallVector, 4>;
-using EqualValuesMap =
-    DenseMap>>;
-using LocationMap =
-    DenseMap>;
+struct SCEVDbgValueBuilder {
+  SCEVDbgValueBuilder() = default;
+  SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) {
+    Values = Base.Values;
+    Expr = Base.Expr;
+  }
 
-static void DbgGatherEqualValues(Loop *L, ScalarEvolution &SE,
-                                 EqualValuesMap &DbgValueToEqualSet,
-                                 LocationMap &DbgValueToLocation) {
+  /// The DIExpression as we translate the SCEV.
+  SmallVector Expr;
+  /// The location ops of the DIExpression.
+  SmallVector Values;
+
+  void pushOperator(uint64_t Op) { Expr.push_back(Op); }
+  void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
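pushValue, defined next, keeps Values free of duplicates and emits each value's position as the DW_OP_LLVM_arg operand. The same find-or-append indexing in miniature, using only standard types:

    #include <algorithm>
    #include <vector>
    unsigned argIndexFor(std::vector<const void *> &Values, const void *V) {
      auto It = std::find(Values.begin(), Values.end(), V);
      if (It != Values.end()) // already referenced: reuse the existing slot
        return static_cast<unsigned>(It - Values.begin());
      Values.push_back(V);    // first reference: append and use the new slot
      return static_cast<unsigned>(Values.size() - 1);
    }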
+
+  /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
+  /// in the set of values referenced by the expression.
+  void pushValue(llvm::Value *V) {
+    Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
+    auto *It =
+        std::find(Values.begin(), Values.end(), llvm::ValueAsMetadata::get(V));
+    unsigned ArgIndex = 0;
+    if (It != Values.end()) {
+      ArgIndex = std::distance(Values.begin(), It);
+    } else {
+      ArgIndex = Values.size();
+      Values.push_back(llvm::ValueAsMetadata::get(V));
+    }
+    Expr.push_back(ArgIndex);
+  }
+
+  void pushValue(const SCEVUnknown *U) {
+    llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
+    pushValue(V);
+  }
+
+  bool pushConst(const SCEVConstant *C) {
+    if (C->getAPInt().getMinSignedBits() > 64)
+      return false;
+    Expr.push_back(llvm::dwarf::DW_OP_consts);
+    Expr.push_back(C->getAPInt().getSExtValue());
+    return true;
+  }
+
+  /// Several SCEV types are sequences of the same arithmetic operator applied
+  /// to constants and values that may be extended or truncated.
+  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
+                          uint64_t DwarfOp) {
+    assert((isa<SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
+           "Expected arithmetic SCEV type");
+    bool Success = true;
+    unsigned EmitOperator = 0;
+    for (auto &Op : CommExpr->operands()) {
+      Success &= pushSCEV(Op);
+
+      if (EmitOperator >= 1)
+        pushOperator(DwarfOp);
+      ++EmitOperator;
+    }
+    return Success;
+  }
+
+  // TODO: Identify and omit noop casts.
+  bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
+    const llvm::SCEV *Inner = C->getOperand(0);
+    const llvm::Type *Type = C->getType();
+    uint64_t ToWidth = Type->getIntegerBitWidth();
+    bool Success = pushSCEV(Inner);
+    uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
+                          IsSigned ? llvm::dwarf::DW_ATE_signed
+                                   : llvm::dwarf::DW_ATE_unsigned};
+    for (const auto &Op : CastOps)
+      pushOperator(Op);
+    return Success;
+  }
+
+  // TODO: MinMax - although these haven't been encountered in the test suite.
+  bool pushSCEV(const llvm::SCEV *S) {
+    bool Success = true;
+    if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
+      Success &= pushConst(StartInt);
+
+    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+      if (!U->getValue())
+        return false;
+      pushValue(U->getValue());
+
+    } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
+      Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
+
+    } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
+      Success &= pushSCEV(UDiv->getLHS());
+      Success &= pushSCEV(UDiv->getRHS());
+      pushOperator(llvm::dwarf::DW_OP_div);
+
+    } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
+      // Assert if a new and unknown SCEVCastExpr type is encountered.
+      assert((isa<SCEVTruncateExpr>(Cast) || isa<SCEVZeroExtendExpr>(Cast) ||
+              isa<SCEVSignExtendExpr>(Cast) || isa<SCEVPtrToIntExpr>(Cast)) &&
+             "Unexpected cast type in SCEV.");
+      Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
+
+    } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
+      Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
+
+    } else if (isa<SCEVAddRecExpr>(S)) {
+      // Nested SCEVAddRecExpr are generated by nested loops and are currently
+      // unsupported.
+      return false;
+
+    } else {
+      return false;
+    }
+    return Success;
+  }
+
+  void setFinalExpression(llvm::DbgValueInst &DI, const DIExpression *OldExpr) {
+    // Re-state assumption that this dbg.value is not variadic. Any remaining
+    // opcodes in its expression operate on a single value already on the
+    // expression stack. Prepend our operations, which will re-compute and
+    // place that value on the expression stack.
+    assert(!DI.hasArgList());
+    auto *NewExpr =
+        DIExpression::prependOpcodes(OldExpr, Expr, /*StackValue*/ true);
+    DI.setExpression(NewExpr);
+
+    auto ValArrayRef = llvm::ArrayRef(Values);
+    DI.setRawLocation(llvm::DIArgList::get(DI.getContext(), ValArrayRef));
+  }
+
+  /// If a DVI can be emitted without a DIArgList, omit DW_OP_llvm_arg and the
+  /// location op index 0.
+  void setShortFinalExpression(llvm::DbgValueInst &DI,
+                               const DIExpression *OldExpr) {
+    assert((Expr[0] == llvm::dwarf::DW_OP_LLVM_arg && Expr[1] == 0) &&
+           "Expected DW_OP_llvm_arg and 0.");
+    DI.replaceVariableLocationOp(
+        0u, llvm::MetadataAsValue::get(DI.getContext(), Values[0]));
+
+    // See setFinalExpression: prepend our opcodes on the start of any old
+    // expression opcodes.
+    assert(!DI.hasArgList());
+    llvm::SmallVector FinalExpr(Expr.begin() + 2, Expr.end());
+    auto *NewExpr =
+        DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true);
+    DI.setExpression(NewExpr);
+  }
+
+  /// Once the IV and variable SCEV translation is complete, write it to the
+  /// source DVI.
+  void applyExprToDbgValue(llvm::DbgValueInst &DI,
+                           const DIExpression *OldExpr) {
+    assert(!Expr.empty() && "Unexpected empty expression.");
+    // Emit a simpler form if only a single location is referenced.
+    if (Values.size() == 1 && Expr[0] == llvm::dwarf::DW_OP_LLVM_arg &&
+        Expr[1] == 0) {
+      setShortFinalExpression(DI, OldExpr);
+    } else {
+      setFinalExpression(DI, OldExpr);
+    }
+  }
+
+  /// Return true if the combination of arithmetic operator and underlying
+  /// SCEV constant value is an identity function.
+  bool isIdentityFunction(uint64_t Op, const SCEV *S) {
+    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+      if (C->getAPInt().getMinSignedBits() > 64)
+        return false;
+      int64_t I = C->getAPInt().getSExtValue();
+      switch (Op) {
+      case llvm::dwarf::DW_OP_plus:
+      case llvm::dwarf::DW_OP_minus:
+        return I == 0;
+      case llvm::dwarf::DW_OP_mul:
+      case llvm::dwarf::DW_OP_div:
+        return I == 1;
+      }
+    }
+    return false;
+  }
+
+  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
+  /// builder's expression stack. The stack should already contain an
+  /// expression for the iteration count, so that it can be multiplied by
+  /// the stride and added to the start.
+  /// Components of the expression are omitted if they are an identity function.
+  /// Chain (non-affine) SCEVs are not supported.
+  bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
+    assert(SAR.isAffine() && "Expected affine SCEV");
+    // TODO: Is this check needed?
+    if (isa<SCEVAddRecExpr>(SAR.getStart()))
+      return false;
+
+    const SCEV *Start = SAR.getStart();
+    const SCEV *Stride = SAR.getStepRecurrence(SE);
+
+    // Skip pushing arithmetic noops.
+    if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
+      if (!pushSCEV(Stride))
+        return false;
+      pushOperator(llvm::dwarf::DW_OP_mul);
+    }
+    if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
+      if (!pushSCEV(Start))
+        return false;
+      pushOperator(llvm::dwarf::DW_OP_plus);
+    }
+    return true;
+  }
+
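SCEVToValueExpr above and the iteration-count variant defined next are two halves of one identity: for an affine IV {start,+,stride}, the iteration number is recovered from the surviving IV and the lost value is rebuilt from it. In plain signed arithmetic (a sketch, ignoring the DWARF encoding):

    #include <cstdint>
    int64_t salvagedValue(int64_t IV, int64_t Start, int64_t Stride) {
      int64_t Iter = (IV - Start) / Stride; // SCEVToIterCountExpr
      return Iter * Stride + Start;         // SCEVToValueExpr on top of Iter
    }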
+  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
+  /// builder's expression stack. The stack should already contain an
+  /// expression for the iteration count, so that it can be multiplied by
+  /// the stride and added to the start.
+  /// Components of the expression are omitted if they are an identity function.
+  bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
+                           ScalarEvolution &SE) {
+    assert(SAR.isAffine() && "Expected affine SCEV");
+    if (isa<SCEVAddRecExpr>(SAR.getStart())) {
+      LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
                        << SAR << '\n');
+      return false;
+    }
+    const SCEV *Start = SAR.getStart();
+    const SCEV *Stride = SAR.getStepRecurrence(SE);
+
+    // Skip pushing arithmetic noops.
+    if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
+      if (!pushSCEV(Start))
+        return false;
+      pushOperator(llvm::dwarf::DW_OP_minus);
+    }
+    if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
+      if (!pushSCEV(Stride))
+        return false;
+      pushOperator(llvm::dwarf::DW_OP_div);
+    }
+    return true;
+  }
+};
+
+struct DVIRecoveryRec {
+  DbgValueInst *DVI;
+  DIExpression *Expr;
+  Metadata *LocationOp;
+  const llvm::SCEV *SCEV;
+};
+
+static bool RewriteDVIUsingIterCount(DVIRecoveryRec CachedDVI,
+                                     const SCEVDbgValueBuilder &IterationCount,
+                                     ScalarEvolution &SE) {
+  // LSR may add locations to previously single location-op DVIs which
+  // are currently not supported.
+  if (CachedDVI.DVI->getNumVariableLocationOps() != 1)
+    return false;
+
+  // SCEVs for SSA values are most frequently of the form
+  // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
+  // This is because %a is a PHI node that is not the IV. However, these
+  // SCEVs have not been observed to result in debuginfo-lossy optimisations,
+  // so it's not expected this point will be reached.
+  if (!isa<SCEVAddRecExpr>(CachedDVI.SCEV))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "scev-salvage: Value to salvage SCEV: "
+                    << *CachedDVI.SCEV << '\n');
+
+  const auto *Rec = cast<SCEVAddRecExpr>(CachedDVI.SCEV);
+  if (!Rec->isAffine())
+    return false;
+
+  // Initialise a new builder with the iteration count expression. In
+  // combination with the value's SCEV this enables recovery.
+  SCEVDbgValueBuilder RecoverValue(IterationCount);
+  if (!RecoverValue.SCEVToValueExpr(*Rec, SE))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "scev-salvage: Updating: " << *CachedDVI.DVI << '\n');
+  RecoverValue.applyExprToDbgValue(*CachedDVI.DVI, CachedDVI.Expr);
+  LLVM_DEBUG(dbgs() << "scev-salvage: to: " << *CachedDVI.DVI << '\n');
+  return true;
+}
+
+static bool
+DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE,
+                          llvm::PHINode *LSRInductionVar,
+                          SmallVector &DVIToUpdate) {
+  if (DVIToUpdate.empty())
+    return false;
+
+  const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
+  assert(SCEVInductionVar &&
+         "Anticipated a SCEV for the post-LSR induction variable");
+
+  bool Changed = false;
+  if (const SCEVAddRecExpr *IVAddRec =
+          dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
+    if (!IVAddRec->isAffine())
+      return false;
+
+    SCEVDbgValueBuilder IterCountExpr;
+    IterCountExpr.pushValue(LSRInductionVar);
+    if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
+                      << '\n');
+
+    // Needn't salvage if the location op hasn't been undef'd by LSR.
+    for (auto &DVIRec : DVIToUpdate) {
+      if (!DVIRec.DVI->isUndef())
+        continue;
+
+      // Some DVIs that were single location-op when cached are now multi-op,
+      // due to LSR optimisations. However, multi-op salvaging is not yet
+      // supported by SCEV salvaging. But, we can attempt a salvage by restoring
+      // the pre-LSR single-op expression.
+      if (DVIRec.DVI->hasArgList()) {
+        if (!DVIRec.DVI->getVariableLocationOp(0))
+          continue;
+        llvm::Type *Ty = DVIRec.DVI->getVariableLocationOp(0)->getType();
+        DVIRec.DVI->setRawLocation(
+            llvm::ValueAsMetadata::get(UndefValue::get(Ty)));
+        DVIRec.DVI->setExpression(DVIRec.Expr);
+      }
+
+      Changed |= RewriteDVIUsingIterCount(DVIRec, IterCountExpr, SE);
+    }
+  }
+  return Changed;
+}
+
+/// Identify and cache salvageable DVI locations and expressions along with the
+/// corresponding SCEV(s). Also ensure that the DVI is not deleted before the
+/// salvaging attempt is made.
+static void
+DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE,
+                       SmallVector &SalvageableDVISCEVs,
+                       SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
   for (auto &B : L->getBlocks()) {
     for (auto &I : *B) {
       auto DVI = dyn_cast<DbgValueInst>(&I);
       if (!DVI)
         continue;
-      for (unsigned Idx = 0; Idx < DVI->getNumVariableLocationOps(); ++Idx) {
-        // TODO: We can duplicate results if the same arg appears more than
-        // once.
-        Value *V = DVI->getVariableLocationOp(Idx);
-        if (!V || !SE.isSCEVable(V->getType()))
-          continue;
-        auto DbgValueSCEV = SE.getSCEV(V);
-        EqualValues EqSet;
-        for (PHINode &Phi : L->getHeader()->phis()) {
-          if (V->getType() != Phi.getType())
-            continue;
-          if (!SE.isSCEVable(Phi.getType()))
-            continue;
-          auto PhiSCEV = SE.getSCEV(&Phi);
-          Optional Offset =
-              SE.computeConstantDifference(DbgValueSCEV, PhiSCEV);
-          if (Offset && Offset->getMinSignedBits() <= 64)
-            EqSet.emplace_back(
-                std::make_tuple(&Phi, Offset.getValue().getSExtValue()));
-        }
-        DbgValueToEqualSet[DVI].push_back({Idx, std::move(EqSet)});
-        // If we fall back to using this raw location, at least one location op
-        // must be dead. A DIArgList will automatically undef arguments when
-        // they become unavailable, but a ValueAsMetadata will not; since we
-        // know the value should be undef, we use the undef value directly here.
-        Metadata *RawLocation =
-            DVI->hasArgList() ? DVI->getRawLocation()
-                              : ValueAsMetadata::get(UndefValue::get(
-                                    DVI->getVariableLocationOp(0)->getType()));
-        DbgValueToLocation[DVI] = {DVI->getExpression(), RawLocation};
-      }
+
+      if (DVI->hasArgList())
+        continue;
+
+      if (!DVI->getVariableLocationOp(0) ||
+          !SE.isSCEVable(DVI->getVariableLocationOp(0)->getType()))
+        continue;
+
+      SalvageableDVISCEVs.push_back(
+          {DVI, DVI->getExpression(), DVI->getRawLocation(),
+           SE.getSCEV(DVI->getVariableLocationOp(0))});
+      DVIHandles.insert(DVI);
     }
   }
 }
 
-static void DbgApplyEqualValues(EqualValuesMap &DbgValueToEqualSet,
-                                LocationMap &DbgValueToLocation) {
-  for (auto A : DbgValueToEqualSet) {
-    auto *DVI = A.first;
-    // Only update those that are now undef.
-    if (!DVI->isUndef())
+/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
+/// any PHI from the loop header is usable, but may have less chance of
+/// surviving subsequent transforms.
+static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
+                                           const LSRInstance &LSR) {
+  // For now, just pick the first IV generated and inserted. Ideally pick an IV
+  // that is unlikely to be optimised away by subsequent transforms.
+  for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
+    if (!IV)
       continue;
-    // The dbg.value may have had its value or expression changed during LSR by
-    // a failed salvage attempt; refresh them from the map.
-    auto *DbgDIExpr = DbgValueToLocation[DVI].first;
-    DVI->setRawLocation(DbgValueToLocation[DVI].second);
-    DVI->setExpression(DbgDIExpr);
-    assert(DVI->isUndef() && "dbg.value with non-undef location should not "
-                             "have been modified by LSR.");
-    for (auto IdxEV : A.second) {
-      unsigned Idx = IdxEV.first;
-      for (auto EV : IdxEV.second) {
-        auto EVHandle = std::get(EV);
-        if (!EVHandle)
-          continue;
-        int64_t Offset = std::get(EV);
-        DVI->replaceVariableLocationOp(Idx, EVHandle);
-        if (Offset) {
-          SmallVector Ops;
-          DIExpression::appendOffset(Ops, Offset);
-          DbgDIExpr = DIExpression::appendOpsToArg(DbgDIExpr, Ops, Idx, true);
-        }
-        DVI->setExpression(DbgDIExpr);
-        break;
-      }
+
+    assert(isa<PHINode>(&*IV) && "Expected PHI node.");
+    if (SE.isSCEVable((*IV).getType())) {
+      PHINode *Phi = dyn_cast<PHINode>(&*IV);
+      LLVM_DEBUG(dbgs() << "scev-salvage: IV : " << *IV
+                        << "with SCEV: " << *SE.getSCEV(Phi) << "\n");
+      return Phi;
     }
   }
+
+  for (PHINode &Phi : L.getHeader()->phis()) {
+    if (!SE.isSCEVable(Phi.getType()))
+      continue;
+
+    const llvm::SCEV *PhiSCEV = SE.getSCEV(&Phi);
+    if (const llvm::SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(PhiSCEV))
+      if (!Rec->isAffine())
+        continue;
+
+    LLVM_DEBUG(dbgs() << "scev-salvage: Selected IV from loop header: " << Phi
+                      << " with SCEV: " << *PhiSCEV << "\n");
+    return &Phi;
+  }
+  return nullptr;
 }
 
 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
@@ -5948,20 +6271,21 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo &TLI,
                                MemorySSA *MSSA) {
 
+  // Debug preservation - before we start removing anything identify which DVIs
+  // meet the salvageable criteria and store their DIExpression and SCEVs.
+  SmallVector SalvageableDVI;
+  SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
+  DbgGatherSalvagableDVI(L, SE, SalvageableDVI, DVIHandles);
+
   bool Changed = false;
   std::unique_ptr<MemorySSAUpdater> MSSAU;
   if (MSSA)
     MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
 
   // Run the main LSR transformation.
-  Changed |=
-      LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
-
-  // Debug preservation - before we start removing anything create equivalence
-  // sets for the llvm.dbg.value intrinsics.
-  EqualValuesMap DbgValueToEqualSet;
-  LocationMap DbgValueToLocation;
-  DbgGatherEqualValues(L, SE, DbgValueToEqualSet, DbgValueToLocation);
+  const LSRInstance &Reducer =
+      LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
+  Changed |= Reducer.getChanged();
 
   // Remove any extra phis created by processing inner loops.
   Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
@@ -5981,8 +6305,22 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
     }
   }
 
-  DbgApplyEqualValues(DbgValueToEqualSet, DbgValueToLocation);
+  if (SalvageableDVI.empty())
+    return Changed;
 
+  // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
+  // expressions composed using the derived iteration count.
+  // TODO: Allow for multiple IV references for nested AddRecSCEVs
+  for (auto &L : LI) {
+    if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
+      DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVI);
+    else {
+      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
+                           "could not be identified.\n");
+    }
+  }
+
+  DVIHandles.clear();
   return Changed;
 }
 
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 5ec01454e5b2..fe160d5415bd 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2811,10 +2811,11 @@ private:
     if (BeginOffset > NewAllocaBeginOffset ||
         EndOffset < NewAllocaEndOffset)
       return false;
+    // Length must be in range for FixedVectorType.
     auto *C = cast<ConstantInt>(II.getLength());
-    if (C->getBitWidth() > 64)
+    const uint64_t Len = C->getLimitedValue();
+    if (Len > std::numeric_limits<unsigned>::max())
       return false;
-    const auto Len = C->getZExtValue();
     auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
     auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
     return canConvertValue(DL, SrcTy, AllocaTy) &&
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 91280762aaa7..bd2b6fafdf2e 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -23,6 +24,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -566,10 +568,18 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
   // to ensure we dominate all of our uses. Always insert right before the
   // relevant instruction (terminator, assume), so that we insert in proper
   // order in the case of multiple predicateinfo in the same block.
+  // The number of named values is used to detect if a new declaration was
+  // added. If so, that declaration is tracked so that it can be removed when
+  // the analysis is done. The corner case where a new declaration results in
+  // a name clash and the old name being renamed is not considered as that
+  // represents an invalid module.
   if (isa(ValInfo)) {
     IRBuilder<> B(getBranchTerminator(ValInfo));
+    auto NumDecls = F.getParent()->getNumNamedValues();
     Function *IF = Intrinsic::getDeclaration(
         F.getParent(), Intrinsic::ssa_copy, Op->getType());
+    if (NumDecls != F.getParent()->getNumNamedValues())
+      PI.CreatedDeclarations.insert(IF);
     CallInst *PIC =
         B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
     PI.PredicateMap.insert({PIC, ValInfo});
@@ -581,8 +591,11 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
     // Insert the predicate directly after the assume. While it also holds
     // directly before it, assume(i1 true) is not a useful fact.
     IRBuilder<> B(PAssume->AssumeInst->getNextNode());
+    auto NumDecls = F.getParent()->getNumNamedValues();
     Function *IF = Intrinsic::getDeclaration(
         F.getParent(), Intrinsic::ssa_copy, Op->getType());
+    if (NumDecls != F.getParent()->getNumNamedValues())
+      PI.CreatedDeclarations.insert(IF);
     CallInst *PIC = B.CreateCall(IF, Op);
     PI.PredicateMap.insert({PIC, ValInfo});
     Result.Def = PIC;
@@ -761,6 +774,23 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
   Builder.buildPredicateInfo();
 }
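The destructor added next makes the ownership contract explicit: consumers must strip every ssa_copy call before the PredicateInfo object dies, after which the object erases any declarations it created. A usage sketch under that assumption (F, DT and AC supplied by the caller):

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Transforms/Utils/PredicateInfo.h"
    using namespace llvm;
    void usePredicateInfo(Function &F, DominatorTree &DT, AssumptionCache &AC) {
      PredicateInfo PI(F, DT, AC);
      // ... query PI.getPredicateInfoFor(...) and rewrite uses ...
      // Before PI is destroyed, every ssa_copy call must be replaced with its
      // operand and erased, or the assertion in ~PredicateInfo will fire.
    }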
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+  // Collect function pointers in set first, as SmallSet uses a SmallVector
+  // internally and we have to remove the asserting value handles first.
+  SmallPtrSet FunctionPtrs;
+  for (auto &F : CreatedDeclarations)
+    FunctionPtrs.insert(&*F);
+  CreatedDeclarations.clear();
+
+  for (Function *F : FunctionPtrs) {
+    assert(F->user_begin() == F->user_end() &&
+           "PredicateInfo consumer did not remove all SSA copies.");
+    F->eraseFromParent();
+  }
+}
+
 Optional PredicateBase::getConstraint() const {
   switch (Type) {
   case PT_Assume:
@@ -827,6 +857,19 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired();
 }
 
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+  for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) {
+    const auto *PI = PredInfo.getPredicateInfoFor(&Inst);
+    auto *II = dyn_cast<IntrinsicInst>(&Inst);
+    if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+      continue;
+
+    Inst.replaceAllUsesWith(II->getOperand(0));
+    Inst.eraseFromParent();
+  }
+}
+
 bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   PredInfo->print(dbgs());
   if (VerifyPredicateInfo)
     PredInfo->verifyPredicateInfo();
+
+  replaceCreatedSSACopys(*PredInfo, F);
   return false;
 }
 
@@ -845,6 +890,7 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
   auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
   PredInfo->print(OS);
 
+  replaceCreatedSSACopys(*PredInfo, F);
   return PreservedAnalyses::all();
 }
 
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 5af1c37e6197..3978e1e29825 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1393,9 +1393,10 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
   // can ensure that IVIncrement dominates the current uses.
   PostIncLoops = SavedPostIncLoops;
 
-  // Remember this PHI, even in post-inc mode.
+  // Remember this PHI, even in post-inc mode. LSR SCEV-based salvaging is most
+  // effective when we are able to use an IV inserted here, so record it.
   InsertedValues.insert(PN);
-
+  InsertedIVs.push_back(PN);
   return PN;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f24ae6b100d5..671bc6b5212b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5433,6 +5433,21 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::sideeffect:
+        case Intrinsic::experimental_noalias_scope_decl:
+        case Intrinsic::assume:
+        case Intrinsic::lifetime_start:
+        case Intrinsic::lifetime_end:
+          if (TheLoop->hasLoopInvariantOperands(&I))
+            addToWorklistIfAllowed(&I);
+          break;
+        default:
+          break;
+        }
+      }
+
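For intuition, the kind of source loop the switch above targets (example assumed; __builtin_assume is a clang builtin): the assume's operands are loop-invariant, so one scalar call per vector iteration preserves the fact without scalarizing it per lane:

    void saxpy(float *a, const float *x, float k, int n) {
      for (int i = 0; i < n; ++i) {
        __builtin_assume(n % 8 == 0); // operands uniform across all lanes
        a[i] = k * x[i] + a[i];
      }
    }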
       // If there's no pointer operand, there's nothing to do.
       auto *Ptr = getLoadStorePointerOperand(&I);
       if (!Ptr)
@@ -8916,6 +8931,37 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
+  // Even if the instruction is not marked as uniform, there are certain
+  // intrinsic calls that can be effectively treated as such, so we check for
+  // them here. Conservatively, we only do this for scalable vectors, since
+  // for fixed-width VFs we can always fall back on full scalarization.
+  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
+    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    case Intrinsic::assume:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // For scalable vectors if one of the operands is variant then we still
+      // want to mark as uniform, which will generate one instruction for just
+      // the first lane of the vector. We can't scalarize the call in the same
+      // way as for fixed-width vectors because we don't know how many lanes
+      // there are.
+      //
+      // The reasons for doing it this way for scalable vectors are:
+      //   1. For the assume intrinsic generating the instruction for the first
+      //      lane is still better than not generating any at all. For
+      //      example, the input may be a splat across all lanes.
+      //   2. For the lifetime start/end intrinsics the pointer operand only
+      //      does anything useful when the input comes from a stack object,
+      //      which suggests it should always be uniform. For non-stack objects
+      //      the effect is to poison the object, which still allows us to
+      //      remove the call.
+      IsUniform = true;
+      break;
+    default:
+      break;
+    }
+  }
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
   setRecipe(I, Recipe);
diff --git a/llvm/tools/llvm-mca/Views/TimelineView.cpp b/llvm/tools/llvm-mca/Views/TimelineView.cpp
index 9a949761bb75..4ecc3015529c 100644
--- a/llvm/tools/llvm-mca/Views/TimelineView.cpp
+++ b/llvm/tools/llvm-mca/Views/TimelineView.cpp
@@ -145,10 +145,11 @@ void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
 
   double AverageTime1, AverageTime2, AverageTime3;
   AverageTime1 =
-      (double)Entry.CyclesSpentInSchedulerQueue / CumulativeExecutions;
-  AverageTime2 = (double)Entry.CyclesSpentInSQWhileReady / CumulativeExecutions;
-  AverageTime3 =
-      (double)Entry.CyclesSpentAfterWBAndBeforeRetire / CumulativeExecutions;
+      (double)(Entry.CyclesSpentInSchedulerQueue * 10) / CumulativeExecutions;
+  AverageTime2 =
+      (double)(Entry.CyclesSpentInSQWhileReady * 10) / CumulativeExecutions;
+  AverageTime3 = (double)(Entry.CyclesSpentAfterWBAndBeforeRetire * 10) /
+                 CumulativeExecutions;
 
   OS << Executions;
   OS.PadToColumn(13);
@@ -157,18 +158,18 @@ void TimelineView::printWaitTimeEntry(formatted_raw_ostream &OS,
   if (!PrintingTotals)
     tryChangeColor(OS, Entry.CyclesSpentInSchedulerQueue, CumulativeExecutions,
                    BufferSize);
-  OS << format("%.1f", floor((AverageTime1 * 10) + 0.5) / 10);
+  OS << format("%.1f", floor(AverageTime1 + 0.5) / 10);
   OS.PadToColumn(20);
   if (!PrintingTotals)
     tryChangeColor(OS, Entry.CyclesSpentInSQWhileReady, CumulativeExecutions,
                    BufferSize);
-  OS << format("%.1f", floor((AverageTime2 * 10) + 0.5) / 10);
+  OS << format("%.1f", floor(AverageTime2 + 0.5) / 10);
   OS.PadToColumn(27);
   if (!PrintingTotals)
     tryChangeColor(OS, Entry.CyclesSpentAfterWBAndBeforeRetire,
                    CumulativeExecutions,
                    getSubTargetInfo().getSchedModel().MicroOpBufferSize);
-  OS << format("%.1f", floor((AverageTime3 * 10) + 0.5) / 10);
+  OS << format("%.1f", floor(AverageTime3 + 0.5) / 10);
 
   if (OS.has_colors())
     OS.resetColor();
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 162fb38e1eed..dd3e7688d33f 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -344,6 +344,13 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
       // link node as successor of all nodes in the prev_set if any
       npredecessors +=
           __kmp_depnode_link_successor(gtid, thread, task, node, prev_set);
+      if (dep_barrier) {
+        // clean last_out and prev_set if any; don't touch last_set
+        __kmp_node_deref(thread, last_out);
+        info->last_out = NULL;
+        __kmp_depnode_list_free(thread, prev_set);
+        info->prev_set = NULL;
+      }
     } else { // last_set is of different dep kind, make it prev_set
       // link node as successor of all nodes in the last_set
       npredecessors +=
@@ -353,13 +360,21 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
       info->last_out = NULL;
       // clean prev_set if any
       __kmp_depnode_list_free(thread, prev_set);
-      // move last_set to prev_set, new last_set will be allocated
-      info->prev_set = last_set;
+      if (!dep_barrier) {
+        // move last_set to prev_set, new last_set will be allocated
+        info->prev_set = last_set;
+      } else {
+        info->prev_set = NULL;
+        info->last_flag = 0;
+      }
       info->last_set = NULL;
     }
-    info->last_flag = dep->flag; // store dep kind of the last_set
-    info->last_set = __kmp_add_node(thread, info->last_set, node);
-
+    // for dep_barrier last_flag value should remain:
+    // 0 if last_set is empty, unchanged otherwise
+    if (!dep_barrier) {
+      info->last_flag = dep->flag; // store dep kind of the last_set
+      info->last_set = __kmp_add_node(thread, info->last_set, node);
+    }
     // check if we are processing MTX dependency
     if (dep->flag == KMP_DEP_MTX) {
       if (info->mtx_lock == NULL) {
@@ -756,8 +771,6 @@ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
 
   kmp_depnode_t node = {0};
   __kmp_init_node(&node);
-  // the stack owns the node
-  __kmp_node_ref(&node);
 
   if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
                         DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index d1576dd5b791..73abf07018f3 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -23,8 +23,7 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
     return;
 
   kmp_int32 n = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
-  // TODO: temporarily disable assertion until the bug with dependences is fixed
-  // KMP_DEBUG_ASSERT(n >= 0);
+  KMP_DEBUG_ASSERT(n >= 0);
   if (n == 0) {
     KMP_ASSERT(node->dn.nrefs == 0);
 #if USE_FAST_MEMORY
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6c3e2c95cb5a..55e9c307638a 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1441,6 +1441,7 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (__kmp_enable_hidden_helper) {
     auto &input_flags = reinterpret_cast(flags);
     input_flags.hidden_helper = TRUE;
+    input_flags.tiedness = TASK_UNTIED;
   }
 
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,