From 947eb1449054880e65af3acb5a5c77c590aba9ce Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 4 Jun 2022 14:20:35 +0200 Subject: [PATCH] Vendor import of llvm-project branch release/14.x llvmorg-14.0.4-0-g29f1039a7285. --- clang/include/clang/AST/ASTContext.h | 4 +- clang/lib/AST/ASTContext.cpp | 37 +- clang/lib/AST/ItaniumMangle.cpp | 280 ++-- clang/lib/AST/TypePrinter.cpp | 3 +- clang/lib/Basic/TargetInfo.cpp | 6 +- clang/lib/CodeGen/CGCUDANV.cpp | 2 +- clang/lib/CodeGen/CGExprCXX.cpp | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 17 +- clang/lib/CodeGen/CodeGenModule.h | 7 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 2 +- clang/lib/Driver/ToolChains/Ananas.cpp | 14 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 + clang/lib/Driver/ToolChains/CommonArgs.h | 3 + clang/lib/Driver/ToolChains/Linux.cpp | 16 +- clang/lib/Driver/ToolChains/Solaris.cpp | 12 +- compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S | 2 + libcxx/include/__ranges/concepts.h | 4 - libcxx/include/__ranges/data.h | 4 +- libcxx/include/__ranges/size.h | 4 +- lld/ELF/Arch/AArch64.cpp | 8 +- lld/ELF/Arch/ARM.cpp | 8 +- lld/ELF/InputSection.cpp | 4 +- lld/ELF/Options.td | 1 + .../llvm/Support/AArch64TargetParser.def | 2 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 7 +- llvm/lib/MC/ELFObjectWriter.cpp | 20 +- llvm/lib/Support/Host.cpp | 6 + llvm/lib/Target/AArch64/AArch64.td | 18 + .../lib/Target/AArch64/AArch64SchedAmpere1.td | 1136 +++++++++++++++++ .../Target/AArch64/AArch64SchedPredAmpere.td | 25 + .../Target/AArch64/AArch64SchedPredicates.td | 2 +- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 6 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 1 + llvm/lib/Target/AVR/AVRCallingConv.td | 2 + llvm/lib/Target/AVR/AVRInstrInfo.td | 28 +- .../MCTargetDesc/SystemZMCCodeEmitter.cpp | 9 +- .../Target/SystemZ/SystemZISelLowering.cpp | 16 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +- .../InstCombine/InstCombineAndOrXor.cpp | 6 +- llvm/lib/Transforms/Scalar/SCCP.cpp | 23 +- 40 files changed, 1540 
insertions(+), 225 deletions(-) create mode 100644 llvm/lib/Target/AArch64/AArch64SchedAmpere1.td create mode 100644 llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 63c11e237d6c..1bd5d7a6c1d7 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -3279,10 +3279,10 @@ OPT_LIST(V) /// Return a new OMPTraitInfo object owned by this context. OMPTraitInfo &getNewOMPTraitInfo(); - /// Whether a C++ static variable may be externalized. + /// Whether a C++ static variable or CUDA/HIP kernel may be externalized. bool mayExternalizeStaticVar(const Decl *D) const; - /// Whether a C++ static variable should be externalized. + /// Whether a C++ static variable or CUDA/HIP kernel should be externalized. bool shouldExternalizeStaticVar(const Decl *D) const; StringRef getCUIDHash() const; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 5fa2d46de89b..e4b3827b8714 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -8547,21 +8547,18 @@ static TypedefDecl *CreateVoidPtrBuiltinVaListDecl(const ASTContext *Context) { static TypedefDecl * CreateAArch64ABIBuiltinVaListDecl(const ASTContext *Context) { + // struct __va_list RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list"); - // namespace std { struct __va_list { - // Note that we create the namespace even in C. This is intentional so that - // the type is consistent between C and C++, which is important in cases where - // the types need to match between translation units (e.g. with - // -fsanitize=cfi-icall). Ideally we wouldn't have created this namespace at - // all, but it's now part of the ABI (e.g. in mangled names), so we can't - // change it. 
- auto *NS = NamespaceDecl::Create( - const_cast(*Context), Context->getTranslationUnitDecl(), - /*Inline*/ false, SourceLocation(), SourceLocation(), - &Context->Idents.get("std"), - /*PrevDecl*/ nullptr); - NS->setImplicit(); - VaListTagDecl->setDeclContext(NS); + if (Context->getLangOpts().CPlusPlus) { + // namespace std { struct __va_list { + auto *NS = NamespaceDecl::Create( + const_cast(*Context), Context->getTranslationUnitDecl(), + /*Inline*/ false, SourceLocation(), SourceLocation(), + &Context->Idents.get("std"), + /*PrevDecl*/ nullptr); + NS->setImplicit(); + VaListTagDecl->setDeclContext(NS); + } VaListTagDecl->startDefinition(); @@ -12266,14 +12263,18 @@ bool ASTContext::mayExternalizeStaticVar(const Decl *D) const { (D->hasAttr() && !D->getAttr()->isImplicit()); // CUDA/HIP: static managed variables need to be externalized since it is - // a declaration in IR, therefore cannot have internal linkage. - return IsStaticVar && - (D->hasAttr() || IsExplicitDeviceVar); + // a declaration in IR, therefore cannot have internal linkage. Kernels in + // anonymous name space needs to be externalized to avoid duplicate symbols. + return (IsStaticVar && + (D->hasAttr() || IsExplicitDeviceVar)) || + (D->hasAttr() && + basicGVALinkageForFunction(*this, cast(D)) == + GVA_Internal); } bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const { return mayExternalizeStaticVar(D) && - (D->hasAttr() || + (D->hasAttr() || D->hasAttr() || CUDADeviceVarODRUsedByHost.count(cast(D))); } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 2e734e2b28cd..68d4d1271cdb 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -40,65 +40,10 @@ using namespace clang; namespace { -/// Retrieve the declaration context that should be used when mangling the given -/// declaration. 
-static const DeclContext *getEffectiveDeclContext(const Decl *D) { - // The ABI assumes that lambda closure types that occur within - // default arguments live in the context of the function. However, due to - // the way in which Clang parses and creates function declarations, this is - // not the case: the lambda closure type ends up living in the context - // where the function itself resides, because the function declaration itself - // had not yet been created. Fix the context here. - if (const CXXRecordDecl *RD = dyn_cast(D)) { - if (RD->isLambda()) - if (ParmVarDecl *ContextParam - = dyn_cast_or_null(RD->getLambdaContextDecl())) - return ContextParam->getDeclContext(); - } - - // Perform the same check for block literals. - if (const BlockDecl *BD = dyn_cast(D)) { - if (ParmVarDecl *ContextParam - = dyn_cast_or_null(BD->getBlockManglingContextDecl())) - return ContextParam->getDeclContext(); - } - - const DeclContext *DC = D->getDeclContext(); - if (isa(DC) || isa(DC) || - isa(DC)) { - return getEffectiveDeclContext(cast(DC)); - } - - if (const auto *VD = dyn_cast(D)) - if (VD->isExternC()) - return VD->getASTContext().getTranslationUnitDecl(); - - if (const auto *FD = dyn_cast(D)) - if (FD->isExternC()) - return FD->getASTContext().getTranslationUnitDecl(); - - return DC->getRedeclContext(); -} - -static const DeclContext *getEffectiveParentContext(const DeclContext *DC) { - return getEffectiveDeclContext(cast(DC)); -} - static bool isLocalContainerContext(const DeclContext *DC) { return isa(DC) || isa(DC) || isa(DC); } -static const RecordDecl *GetLocalClassDecl(const Decl *D) { - const DeclContext *DC = getEffectiveDeclContext(D); - while (!DC->isNamespace() && !DC->isTranslationUnit()) { - if (isLocalContainerContext(DC)) - return dyn_cast(D); - D = cast(DC); - DC = getEffectiveDeclContext(D); - } - return nullptr; -} - static const FunctionDecl *getStructor(const FunctionDecl *fn) { if (const FunctionTemplateDecl *ftd = fn->getPrimaryTemplate()) return 
ftd->getTemplatedDecl(); @@ -126,6 +71,7 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { llvm::DenseMap Discriminator; llvm::DenseMap Uniquifier; const DiscriminatorOverrideTy DiscriminatorOverride = nullptr; + NamespaceDecl *StdNamespace = nullptr; bool NeedsUniqueInternalLinkageNames = false; @@ -249,6 +195,16 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { return DiscriminatorOverride; } + NamespaceDecl *getStdNamespace(); + + const DeclContext *getEffectiveDeclContext(const Decl *D); + const DeclContext *getEffectiveParentContext(const DeclContext *DC) { + return getEffectiveDeclContext(cast(DC)); + } + + bool isInternalLinkageDecl(const NamedDecl *ND); + const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC); + /// @} }; @@ -427,6 +383,15 @@ class CXXNameMangler { ASTContext &getASTContext() const { return Context.getASTContext(); } + bool isStd(const NamespaceDecl *NS); + bool isStdNamespace(const DeclContext *DC); + + const RecordDecl *GetLocalClassDecl(const Decl *D); + const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC); + bool isSpecializedAs(QualType S, llvm::StringRef Name, QualType A); + bool isStdCharSpecialization(const ClassTemplateSpecializationDecl *SD, + llvm::StringRef Name, bool HasAllocator); + public: CXXNameMangler(ItaniumMangleContextImpl &C, raw_ostream &Out_, const NamedDecl *D = nullptr, bool NullOut_ = false) @@ -628,7 +593,71 @@ class CXXNameMangler { } -static bool isInternalLinkageDecl(const NamedDecl *ND) { +NamespaceDecl *ItaniumMangleContextImpl::getStdNamespace() { + if (!StdNamespace) { + StdNamespace = NamespaceDecl::Create( + getASTContext(), getASTContext().getTranslationUnitDecl(), + /*Inline*/ false, SourceLocation(), SourceLocation(), + &getASTContext().Idents.get("std"), + /*PrevDecl*/ nullptr); + StdNamespace->setImplicit(); + } + return StdNamespace; +} + +/// Retrieve the declaration context that should be used when mangling the given +/// declaration. 
+const DeclContext * +ItaniumMangleContextImpl::getEffectiveDeclContext(const Decl *D) { + // The ABI assumes that lambda closure types that occur within + // default arguments live in the context of the function. However, due to + // the way in which Clang parses and creates function declarations, this is + // not the case: the lambda closure type ends up living in the context + // where the function itself resides, because the function declaration itself + // had not yet been created. Fix the context here. + if (const CXXRecordDecl *RD = dyn_cast(D)) { + if (RD->isLambda()) + if (ParmVarDecl *ContextParam = + dyn_cast_or_null(RD->getLambdaContextDecl())) + return ContextParam->getDeclContext(); + } + + // Perform the same check for block literals. + if (const BlockDecl *BD = dyn_cast(D)) { + if (ParmVarDecl *ContextParam = + dyn_cast_or_null(BD->getBlockManglingContextDecl())) + return ContextParam->getDeclContext(); + } + + // On ARM and AArch64, the va_list tag is always mangled as if in the std + // namespace. We do not represent va_list as actually being in the std + // namespace in C because this would result in incorrect debug info in C, + // among other things. It is important for both languages to have the same + // mangling in order for -fsanitize=cfi-icall to work. 
+ if (D == getASTContext().getVaListTagDecl()) { + const llvm::Triple &T = getASTContext().getTargetInfo().getTriple(); + if (T.isARM() || T.isThumb() || T.isAArch64()) + return getStdNamespace(); + } + + const DeclContext *DC = D->getDeclContext(); + if (isa(DC) || isa(DC) || + isa(DC)) { + return getEffectiveDeclContext(cast(DC)); + } + + if (const auto *VD = dyn_cast(D)) + if (VD->isExternC()) + return getASTContext().getTranslationUnitDecl(); + + if (const auto *FD = dyn_cast(D)) + if (FD->isExternC()) + return getASTContext().getTranslationUnitDecl(); + + return DC->getRedeclContext(); +} + +bool ItaniumMangleContextImpl::isInternalLinkageDecl(const NamedDecl *ND) { if (ND && ND->getFormalLinkage() == InternalLinkage && !ND->isExternallyVisible() && getEffectiveDeclContext(ND)->isFileContext() && @@ -862,18 +891,9 @@ void CXXNameMangler::mangleFunctionEncodingBareType(const FunctionDecl *FD) { MangleReturnType, FD); } -static const DeclContext *IgnoreLinkageSpecDecls(const DeclContext *DC) { - while (isa(DC)) { - DC = getEffectiveParentContext(DC); - } - - return DC; -} - /// Return whether a given namespace is the 'std' namespace. -static bool isStd(const NamespaceDecl *NS) { - if (!IgnoreLinkageSpecDecls(getEffectiveParentContext(NS)) - ->isTranslationUnit()) +bool CXXNameMangler::isStd(const NamespaceDecl *NS) { + if (!Context.getEffectiveParentContext(NS)->isTranslationUnit()) return false; const IdentifierInfo *II = NS->getOriginalNamespace()->getIdentifier(); @@ -882,7 +902,7 @@ static bool isStd(const NamespaceDecl *NS) { // isStdNamespace - Return whether a given decl context is a toplevel 'std' // namespace. 
-static bool isStdNamespace(const DeclContext *DC) { +bool CXXNameMangler::isStdNamespace(const DeclContext *DC) { if (!DC->isNamespace()) return false; @@ -956,6 +976,17 @@ void CXXNameMangler::mangleName(GlobalDecl GD) { } } +const RecordDecl *CXXNameMangler::GetLocalClassDecl(const Decl *D) { + const DeclContext *DC = Context.getEffectiveDeclContext(D); + while (!DC->isNamespace() && !DC->isTranslationUnit()) { + if (isLocalContainerContext(DC)) + return dyn_cast(D); + D = cast(DC); + DC = Context.getEffectiveDeclContext(D); + } + return nullptr; +} + void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, const AbiTagList *AdditionalAbiTags) { const NamedDecl *ND = cast(GD.getDecl()); @@ -964,7 +995,7 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, // ::= [] // ::= // - const DeclContext *DC = getEffectiveDeclContext(ND); + const DeclContext *DC = Context.getEffectiveDeclContext(ND); // If this is an extern variable declared locally, the relevant DeclContext // is that of the containing namespace, or the translation unit. @@ -972,13 +1003,13 @@ void CXXNameMangler::mangleNameWithAbiTags(GlobalDecl GD, // a proper semantic declaration context! 
if (isLocalContainerContext(DC) && ND->hasLinkage() && !isLambda(ND)) while (!DC->isNamespace() && !DC->isTranslationUnit()) - DC = getEffectiveParentContext(DC); + DC = Context.getEffectiveParentContext(DC); else if (GetLocalClassDecl(ND)) { mangleLocalName(GD, AdditionalAbiTags); return; } - DC = IgnoreLinkageSpecDecls(DC); + assert(!isa(DC) && "context cannot be LinkageSpecDecl"); if (isLocalContainerContext(DC)) { mangleLocalName(GD, AdditionalAbiTags); @@ -1054,7 +1085,7 @@ void CXXNameMangler::mangleModuleNamePrefix(StringRef Name) { void CXXNameMangler::mangleTemplateName(const TemplateDecl *TD, const TemplateArgument *TemplateArgs, unsigned NumTemplateArgs) { - const DeclContext *DC = IgnoreLinkageSpecDecls(getEffectiveDeclContext(TD)); + const DeclContext *DC = Context.getEffectiveDeclContext(TD); if (DC->isTranslationUnit() || isStdNamespace(DC)) { mangleUnscopedTemplateName(TD, nullptr); @@ -1070,7 +1101,7 @@ void CXXNameMangler::mangleUnscopedName(GlobalDecl GD, // ::= // ::= St # ::std:: - if (isStdNamespace(IgnoreLinkageSpecDecls(getEffectiveDeclContext(ND)))) + if (isStdNamespace(Context.getEffectiveDeclContext(ND))) Out << "St"; mangleUnqualifiedName(GD, AdditionalAbiTags); @@ -1430,7 +1461,7 @@ void CXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, // 12_GLOBAL__N_1 mangling is quite sufficient there, and this better // matches GCC anyway, because GCC does not treat anonymous namespaces as // implying internal linkage. - if (isInternalLinkageDecl(ND)) + if (Context.isInternalLinkageDecl(ND)) Out << 'L'; auto *FD = dyn_cast(ND); @@ -1745,7 +1776,7 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD, // := _ assert(isa(D) || isa(D)); const RecordDecl *RD = GetLocalClassDecl(D); - const DeclContext *DC = getEffectiveDeclContext(RD ? RD : D); + const DeclContext *DC = Context.getEffectiveDeclContext(RD ? 
RD : D); Out << 'Z'; @@ -1798,13 +1829,13 @@ void CXXNameMangler::mangleLocalName(GlobalDecl GD, if (const NamedDecl *PrefixND = getClosurePrefix(BD)) mangleClosurePrefix(PrefixND, true /*NoFunction*/); else - manglePrefix(getEffectiveDeclContext(BD), true /*NoFunction*/); + manglePrefix(Context.getEffectiveDeclContext(BD), true /*NoFunction*/); assert(!AdditionalAbiTags && "Block cannot have additional abi tags"); mangleUnqualifiedBlock(BD); } else { const NamedDecl *ND = cast(D); - mangleNestedName(GD, getEffectiveDeclContext(ND), AdditionalAbiTags, - true /*NoFunction*/); + mangleNestedName(GD, Context.getEffectiveDeclContext(ND), + AdditionalAbiTags, true /*NoFunction*/); } } else if (const BlockDecl *BD = dyn_cast(D)) { // Mangle a block in a default parameter; see above explanation for @@ -1843,7 +1874,7 @@ void CXXNameMangler::mangleBlockForPrefix(const BlockDecl *Block) { mangleLocalName(Block, /* AdditionalAbiTags */ nullptr); return; } - const DeclContext *DC = getEffectiveDeclContext(Block); + const DeclContext *DC = Context.getEffectiveDeclContext(Block); if (isLocalContainerContext(DC)) { mangleLocalName(Block, /* AdditionalAbiTags */ nullptr); return; @@ -2030,7 +2061,7 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) { // ::= # empty // ::= - DC = IgnoreLinkageSpecDecls(DC); + assert(!isa(DC) && "prefix cannot be LinkageSpecDecl"); if (DC->isTranslationUnit()) return; @@ -2053,7 +2084,7 @@ void CXXNameMangler::manglePrefix(const DeclContext *DC, bool NoFunction) { mangleClosurePrefix(PrefixND, NoFunction); mangleUnqualifiedName(ND, nullptr); } else { - manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); mangleUnqualifiedName(ND, nullptr); } @@ -2107,7 +2138,7 @@ void CXXNameMangler::mangleTemplatePrefix(GlobalDecl GD, if (const auto *TTP = dyn_cast(ND)) { mangleTemplateParameter(TTP->getDepth(), TTP->getIndex()); } else { - 
manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); if (isa(ND) || isa(ND)) mangleUnqualifiedName(GD, nullptr); else @@ -2152,7 +2183,7 @@ void CXXNameMangler::mangleClosurePrefix(const NamedDecl *ND, bool NoFunction) { mangleTemplatePrefix(TD, NoFunction); mangleTemplateArgs(asTemplateName(TD), *TemplateArgs); } else { - manglePrefix(getEffectiveDeclContext(ND), NoFunction); + manglePrefix(Context.getEffectiveDeclContext(ND), NoFunction); mangleUnqualifiedName(ND, nullptr); } @@ -5969,56 +6000,61 @@ bool CXXNameMangler::mangleSubstitution(uintptr_t Ptr) { return true; } -static bool isCharType(QualType T) { - if (T.isNull()) +/// Returns whether S is a template specialization of std::Name with a single +/// argument of type A. +bool CXXNameMangler::isSpecializedAs(QualType S, llvm::StringRef Name, + QualType A) { + if (S.isNull()) return false; - return T->isSpecificBuiltinType(BuiltinType::Char_S) || - T->isSpecificBuiltinType(BuiltinType::Char_U); -} - -/// Returns whether a given type is a template specialization of a given name -/// with a single argument of type char. 
-static bool isCharSpecialization(QualType T, const char *Name) { - if (T.isNull()) - return false; - - const RecordType *RT = T->getAs(); + const RecordType *RT = S->getAs(); if (!RT) return false; const ClassTemplateSpecializationDecl *SD = dyn_cast(RT->getDecl()); - if (!SD) + if (!SD || !SD->getIdentifier()->isStr(Name)) return false; - if (!isStdNamespace(getEffectiveDeclContext(SD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(SD))) return false; const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs(); if (TemplateArgs.size() != 1) return false; - if (!isCharType(TemplateArgs[0].getAsType())) + if (TemplateArgs[0].getAsType() != A) return false; - return SD->getIdentifier()->getName() == Name; + return true; } -template -static bool isStreamCharSpecialization(const ClassTemplateSpecializationDecl*SD, - const char (&Str)[StrLen]) { - if (!SD->getIdentifier()->isStr(Str)) +/// Returns whether SD is a template specialization std::Name [, std::allocator]> +/// HasAllocator controls whether the 3rd template argument is needed. +bool CXXNameMangler::isStdCharSpecialization( + const ClassTemplateSpecializationDecl *SD, llvm::StringRef Name, + bool HasAllocator) { + if (!SD->getIdentifier()->isStr(Name)) return false; const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs(); - if (TemplateArgs.size() != 2) + if (TemplateArgs.size() != (HasAllocator ? 3 : 2)) return false; - if (!isCharType(TemplateArgs[0].getAsType())) + QualType A = TemplateArgs[0].getAsType(); + if (A.isNull()) + return false; + // Plain 'char' is named Char_S or Char_U depending on the target ABI. 
+ if (!A->isSpecificBuiltinType(BuiltinType::Char_S) && + !A->isSpecificBuiltinType(BuiltinType::Char_U)) return false; - if (!isCharSpecialization(TemplateArgs[1].getAsType(), "char_traits")) + if (!isSpecializedAs(TemplateArgs[1].getAsType(), "char_traits", A)) + return false; + + if (HasAllocator && + !isSpecializedAs(TemplateArgs[2].getAsType(), "allocator", A)) return false; return true; @@ -6031,10 +6067,11 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) { Out << "St"; return true; } + return false; } if (const ClassTemplateDecl *TD = dyn_cast(ND)) { - if (!isStdNamespace(getEffectiveDeclContext(TD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(TD))) return false; // ::= Sa # ::std::allocator @@ -6048,56 +6085,45 @@ bool CXXNameMangler::mangleStandardSubstitution(const NamedDecl *ND) { Out << "Sb"; return true; } + return false; } if (const ClassTemplateSpecializationDecl *SD = dyn_cast(ND)) { - if (!isStdNamespace(getEffectiveDeclContext(SD))) + if (!isStdNamespace(Context.getEffectiveDeclContext(SD))) return false; // ::= Ss # ::std::basic_string, // ::std::allocator > - if (SD->getIdentifier()->isStr("basic_string")) { - const TemplateArgumentList &TemplateArgs = SD->getTemplateArgs(); - - if (TemplateArgs.size() != 3) - return false; - - if (!isCharType(TemplateArgs[0].getAsType())) - return false; - - if (!isCharSpecialization(TemplateArgs[1].getAsType(), "char_traits")) - return false; - - if (!isCharSpecialization(TemplateArgs[2].getAsType(), "allocator")) - return false; - + if (isStdCharSpecialization(SD, "basic_string", /*HasAllocator=*/true)) { Out << "Ss"; return true; } // ::= Si # ::std::basic_istream > - if (isStreamCharSpecialization(SD, "basic_istream")) { + if (isStdCharSpecialization(SD, "basic_istream", /*HasAllocator=*/false)) { Out << "Si"; return true; } // ::= So # ::std::basic_ostream > - if (isStreamCharSpecialization(SD, "basic_ostream")) { + if (isStdCharSpecialization(SD, "basic_ostream", 
/*HasAllocator=*/false)) { Out << "So"; return true; } // ::= Sd # ::std::basic_iostream > - if (isStreamCharSpecialization(SD, "basic_iostream")) { + if (isStdCharSpecialization(SD, "basic_iostream", /*HasAllocator=*/false)) { Out << "Sd"; return true; } + return false; } + return false; } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index bba323f651aa..6e827530f41b 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1466,8 +1466,7 @@ void TypePrinter::printTemplateId(const TemplateSpecializationType *T, if (!Policy.SuppressScope) AppendScope(TD->getDeclContext(), OS, TD->getDeclName()); - IdentifierInfo *II = TD->getIdentifier(); - OS << II->getName(); + OS << TD->getName(); } else { T->getTemplateName().print(OS, Policy); } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index e3a2f30febe7..188ffb5f2f78 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -69,11 +69,11 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { // From the glibc documentation, on GNU systems, malloc guarantees 16-byte // alignment on 64-bit systems and 8-byte alignment on 32-bit systems. See // https://www.gnu.org/software/libc/manual/html_node/Malloc-Examples.html. - // This alignment guarantee also applies to Windows and Android. On Darwin, - // the alignment is 16 bytes on both 64-bit and 32-bit systems. + // This alignment guarantee also applies to Windows and Android. On Darwin + // and OpenBSD, the alignment is 16 bytes on both 64-bit and 32-bit systems. if (T.isGNUEnvironment() || T.isWindowsMSVCEnvironment() || T.isAndroid()) NewAlign = Triple.isArch64Bit() ? 128 : Triple.isArch32Bit() ? 64 : 0; - else if (T.isOSDarwin()) + else if (T.isOSDarwin() || T.isOSOpenBSD()) NewAlign = 128; else NewAlign = 0; // Infer from basic type alignment. 
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index c4e3f7f54f4f..414e61f25fb3 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -287,7 +287,7 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { SmallString<256> Buffer; llvm::raw_svector_ostream Out(Buffer); Out << DeviceSideName; - CGM.printPostfixForExternalizedStaticVar(Out); + CGM.printPostfixForExternalizedDecl(Out, ND); DeviceSideName = std::string(Out.str()); } return DeviceSideName; diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index f06d21861740..3a3acf98f309 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1572,7 +1572,7 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { llvm::Value *allocSize = EmitCXXNewAllocSize(*this, E, minElements, numElements, allocSizeWithoutCookie); - CharUnits allocAlign = getContext().getPreferredTypeAlignInChars(allocType); + CharUnits allocAlign = getContext().getTypeAlignInChars(allocType); // Emit the allocation call. If the allocator is a global placement // operator, just "inline" it directly. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 29806b65e984..2777fc22600d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1367,7 +1367,7 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, if (CGM.getContext().shouldExternalizeStaticVar(ND) && CGM.getLangOpts().GPURelocatableDeviceCode && CGM.getLangOpts().CUDAIsDevice && !CGM.getLangOpts().CUID.empty()) - CGM.printPostfixForExternalizedStaticVar(Out); + CGM.printPostfixForExternalizedDecl(Out, ND); return std::string(Out.str()); } @@ -1455,7 +1455,7 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) { // directly between host- and device-compilations, the host- and // device-mangling in host compilation could help catching certain ones. 
assert(!isa(ND) || !ND->hasAttr() || - getLangOpts().CUDAIsDevice || + getContext().shouldExternalizeStaticVar(ND) || getLangOpts().CUDAIsDevice || (getContext().getAuxTargetInfo() && (getContext().getAuxTargetInfo()->getCXXABI() != getContext().getTargetInfo().getCXXABI())) || @@ -6645,7 +6645,14 @@ bool CodeGenModule::stopAutoInit() { return false; } -void CodeGenModule::printPostfixForExternalizedStaticVar( - llvm::raw_ostream &OS) const { - OS << "__static__" << getContext().getCUIDHash(); +void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS, + const Decl *D) const { + StringRef Tag; + // ptxas does not allow '.' in symbol names. On the other hand, HIP prefers + // postfix beginning with '.' since the symbol name can be demangled. + if (LangOpts.HIP) + Tag = (isa(D) ? ".static." : ".intern."); + else + Tag = (isa(D) ? "__static__" : "__intern__"); + OS << Tag << getContext().getCUIDHash(); } diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 1fcd5d4d808a..a8a63c8da57f 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1447,9 +1447,10 @@ class CodeGenModule : public CodeGenTypeCache { TBAAAccessInfo *TBAAInfo = nullptr); bool stopAutoInit(); - /// Print the postfix for externalized static variable for single source - /// offloading languages CUDA and HIP. - void printPostfixForExternalizedStaticVar(llvm::raw_ostream &OS) const; + /// Print the postfix for externalized static variable or kernels for single + /// source offloading languages CUDA and HIP. 
+ void printPostfixForExternalizedDecl(llvm::raw_ostream &OS, + const Decl *D) const; private: llvm::Constant *GetOrCreateLLVMFunction( diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 43ce33750eba..9638fa2ecfca 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -510,7 +510,7 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs, return; } - CC1Args.push_back("-internal-isystem"); + CC1Args.push_back("-idirafter"); CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); if (UsesRuntimeWrapper) CC1Args.append({"-include", "__clang_hip_runtime_wrapper.h"}); diff --git a/clang/lib/Driver/ToolChains/Ananas.cpp b/clang/lib/Driver/ToolChains/Ananas.cpp index be1476a7636c..40f9e56b38e9 100644 --- a/clang/lib/Driver/ToolChains/Ananas.cpp +++ b/clang/lib/Driver/ToolChains/Ananas.cpp @@ -85,7 +85,8 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, assert(Output.isNothing() && "Invalid output."); } - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + options::OPT_r)) { if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crt0.o"))); } @@ -111,12 +112,15 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs, JA); - if (ToolChain.ShouldLinkCXXStdlib(Args)) - ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, + options::OPT_r)) { + if (ToolChain.ShouldLinkCXXStdlib(Args)) + ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lc"); + } - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + 
options::OPT_r)) { if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie)) CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtendS.o"))); else diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index dfcef2304040..8f9244cae8db 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -661,6 +661,17 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC, } } +void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, + const ArgList &Args, + ArgStringList &CmdArgs) { + // Default to clang lib / lib64 folder, i.e. the same location as device + // runtime. + SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(TC.getDriver().Dir); + llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX); + CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath)); +} + void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { // Enable -frtlib-add-rpath by default for the case of VE. @@ -720,6 +731,7 @@ bool tools::addOpenMPRuntime(ArgStringList &CmdArgs, const ToolChain &TC, if (RTKind == Driver::OMPRT_OMP) addOpenMPRuntimeSpecificRPath(TC, Args, CmdArgs); + addOpenMPRuntimeLibraryPath(TC, Args, CmdArgs); return true; } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 23012dc247e4..2bba1ee285e6 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -111,6 +111,9 @@ void addOpenMPRuntimeSpecificRPath(const ToolChain &TC, llvm::opt::ArgStringList &CmdArgs); void addArchSpecificRPath(const ToolChain &TC, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs); +void addOpenMPRuntimeLibraryPath(const ToolChain &TC, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); /// Returns true, if an OpenMP runtime has been added. 
bool addOpenMPRuntime(llvm::opt::ArgStringList &CmdArgs, const ToolChain &TC, const llvm::opt::ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index f85c04df4f6c..83cb41159de7 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -301,18 +301,12 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths); - // Similar to the logic for GCC above, if we are currently running Clang - // inside of the requested system root, add its parent library path to those - // searched. - // FIXME: It's not clear whether we should use the driver's installed - // directory ('Dir' below) or the ResourceDir. - if (StringRef(D.Dir).startswith(SysRoot)) { - // Even if OSLibDir != "lib", this is needed for Clang in the build - // directory (not installed) to find libc++. + // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs + // libc++.so in D.Dir+"/../lib/". Detect this path. + // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported. 
+ if (StringRef(D.Dir).startswith(SysRoot) && + D.getVFS().exists(D.Dir + "/../lib/libc++.so")) addPathIfExists(D, D.Dir + "/../lib", Paths); - if (OSLibDir != "lib") - addPathIfExists(D, D.Dir + "/../" + OSLibDir, Paths); - } addPathIfExists(D, SysRoot + "/lib", Paths); addPathIfExists(D, SysRoot + "/usr/lib", Paths); diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 24f18b92dd66..cb3898575d3a 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -82,7 +82,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, assert(Output.isNothing() && "Invalid output."); } - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + options::OPT_r)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crt1.o"))); @@ -122,7 +123,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, bool NeedsSanitizerDeps = addSanitizerRuntimes(getToolChain(), Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs, + options::OPT_r)) { if (getToolChain().ShouldLinkCXXStdlib(Args)) getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); if (Args.hasArg(options::OPT_fstack_protector) || @@ -149,11 +151,13 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, linkSanitizerRuntimeDeps(getToolChain(), CmdArgs); } - if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) { + if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles, + options::OPT_r)) { CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); + CmdArgs.push_back( + Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); } - 
CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); getToolChain().addProfileRTLibs(Args, CmdArgs); diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S index fcff35fbc7e0..2f445e8f1b20 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_s390x.S @@ -45,3 +45,5 @@ intercept setjmp, _ZN14__interception11real_setjmpE intercept _setjmp, _ZN14__interception12real__setjmpE intercept sigsetjmp, _ZN14__interception14real_sigsetjmpE intercept __sigsetjmp, _ZN14__interception16real___sigsetjmpE + +NO_EXEC_STACK_DIRECTIVE diff --git a/libcxx/include/__ranges/concepts.h b/libcxx/include/__ranges/concepts.h index 5f1fa834d409..e16343591cda 100644 --- a/libcxx/include/__ranges/concepts.h +++ b/libcxx/include/__ranges/concepts.h @@ -68,8 +68,6 @@ namespace ranges { template using range_rvalue_reference_t = iter_rvalue_reference_t>; -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) - // [range.sized] template concept sized_range = range<_Tp> && requires(_Tp& __t) { ranges::size(__t); }; @@ -135,8 +133,6 @@ namespace ranges { (is_lvalue_reference_v<_Tp> || (movable> && !__is_std_initializer_list>)))); -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) - } // namespace ranges #endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) diff --git a/libcxx/include/__ranges/data.h b/libcxx/include/__ranges/data.h index f8d92cbc7520..f97ec8033297 100644 --- a/libcxx/include/__ranges/data.h +++ b/libcxx/include/__ranges/data.h @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) +#if !defined(_LIBCPP_HAS_NO_CONCEPTS) // [range.prim.data] @@ -99,7 +99,7 @@ inline namespace __cpo { } // namespace __cpo } // namespace ranges -#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) +#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) _LIBCPP_END_NAMESPACE_STD diff --git 
a/libcxx/include/__ranges/size.h b/libcxx/include/__ranges/size.h index 2b71c03fb399..e1aaf7eba898 100644 --- a/libcxx/include/__ranges/size.h +++ b/libcxx/include/__ranges/size.h @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) +#if !defined(_LIBCPP_HAS_NO_CONCEPTS) namespace ranges { template @@ -128,7 +128,7 @@ inline namespace __cpo { } // namespace __cpo } // namespace ranges -#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) && !defined(_LIBCPP_HAS_NO_INCOMPLETE_RANGES) +#endif // !defined(_LIBCPP_HAS_NO_CONCEPTS) _LIBCPP_END_NAMESPACE_STD diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index add819ff5a80..193956fa9991 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -256,9 +256,11 @@ void AArch64::writePlt(uint8_t *buf, const Symbol &sym, bool AArch64::needsThunk(RelExpr expr, RelType type, const InputFile *file, uint64_t branchAddr, const Symbol &s, int64_t a) const { - // If s is an undefined weak symbol and does not have a PLT entry then it - // will be resolved as a branch to the next instruction. - if (s.isUndefWeak() && !s.isInPlt()) + // If s is an undefined weak symbol and does not have a PLT entry then it will + // be resolved as a branch to the next instruction. If it is hidden, its + // binding has been converted to local, so we just check isUndefined() here. An + // undefined non-weak symbol will have been errored.
+ if (s.isUndefined() && !s.isInPlt()) return false; // ELF for the ARM 64-bit architecture, section Call and Jump relocations // only permits range extension thunks for R_AARCH64_CALL26 and diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index b7c2eb74757c..b6b32f0500a4 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -296,9 +296,11 @@ void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, uint64_t branchAddr, const Symbol &s, int64_t a) const { - // If S is an undefined weak symbol and does not have a PLT entry then it - // will be resolved as a branch to the next instruction. - if (s.isUndefWeak() && !s.isInPlt()) + // If s is an undefined weak symbol and does not have a PLT entry then it will + // be resolved as a branch to the next instruction. If it is hidden, its + // binding has been converted to local, so we just check isUndefined() here. An + // undefined non-weak symbol will have been errored. + if (s.isUndefined() && !s.isInPlt()) return false; // A state change from ARM to Thumb and vice versa must go through an // interworking thunk if the relocation type is not R_ARM_CALL or diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 4b047f75ad69..490254c995c8 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -733,11 +733,13 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, if (expr == R_ARM_PCA) // Some PC relative ARM (Thumb) relocations align down the place. p = p & 0xfffffffc; - if (sym.isUndefWeak()) { + if (sym.isUndefined()) { // On ARM and AArch64 a branch to an undefined weak resolves to the next // instruction, otherwise the place. On RISCV, resolve an undefined weak // to the same instruction to cause an infinite loop (making the user // aware of the issue) while ensuring no overflow.
+ // Note: if the symbol is hidden, its binding has been converted to local, + // so we just check isUndefined() here. if (config->emachine == EM_ARM) dest = getARMUndefinedRelativeWeakVA(type, a, p); else if (config->emachine == EM_AARCH64) diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index ca9fdcde791f..cf2013a5f820 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -695,6 +695,7 @@ def plugin_opt_eq : J<"plugin-opt=">; def: F<"detect-odr-violations">; def: Flag<["-"], "g">; def: F<"long-plt">; +def: FF<"no-add-needed">; def: F<"no-copy-dt-needed-entries">; def: F<"no-ctors-in-init-array">; def: F<"no-keep-memory">; diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def index a953e9439db4..c45c149c6084 100644 --- a/llvm/include/llvm/Support/AArch64TargetParser.def +++ b/llvm/include/llvm/Support/AArch64TargetParser.def @@ -290,6 +290,8 @@ AARCH64_CPU_NAME("a64fx", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_FP16 | AArch64::AEK_SVE)) AARCH64_CPU_NAME("carmel", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AArch64::AEK_FP16) +AARCH64_CPU_NAME("ampere1", ARMV8_6A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_FP16 | AArch64::AEK_SB | AArch64::AEK_SSBS)) // Invalid CPU AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID) #undef AARCH64_CPU_NAME diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 6d415c9c7f90..847df84afba6 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3469,8 +3469,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { return false; } - if (!finalizeBasicBlock(*BB, MBB)) + if (!finalizeBasicBlock(*BB, MBB)) { + OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", + BB->getTerminator()->getDebugLoc(), BB); + R << "unable to translate basic block"; + reportTranslationError(*MF, *TPC, *ORE, R); 
return false; + } } #ifndef NDEBUG WrapperObserver.removeObserver(&Verifier); diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 6fd2f7e7a718..e6d28141dd06 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -549,9 +549,27 @@ void ELFWriter::writeSymbol(SymbolTableWriter &Writer, uint32_t StringIndex, uint64_t Size = 0; const MCExpr *ESize = MSD.Symbol->getSize(); - if (!ESize && Base) + if (!ESize && Base) { + // For expressions like .set y, x+1, if y's size is unset, inherit from x. ESize = Base->getSize(); + // For `.size x, 2; y = x; .size y, 1; z = y; z1 = z; .symver y, y@v1`, z, + // z1, and y@v1's st_size equals y's. However, `Base` is `x` which will give + // us 2. Follow the MCSymbolRefExpr assignment chain, which covers most + // needs. MCBinaryExpr is not handled. + const MCSymbolELF *Sym = &Symbol; + while (Sym->isVariable()) { + if (auto *Expr = + dyn_cast(Sym->getVariableValue(false))) { + Sym = cast(&Expr->getSymbol()); + if (!Sym->getSize()) + continue; + ESize = Sym->getSize(); + } + break; + } + } + if (ESize) { int64_t Res; if (!ESize->evaluateKnownAbsolute(Res, Layout)) diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index f6003b783245..a82a4d451c8a 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -296,6 +296,12 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { } } + if (Implementer == "0xc0") { // Ampere Computing + return StringSwitch(Part) + .Case("0xac3", "ampere1") + .Default("generic"); + } + return "generic"; } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 70c7b7b3f5dc..80e574b7b886 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -557,6 +557,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" 
include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -626,6 +627,7 @@ include "AArch64SchedThunderX2T99.td" include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -939,6 +941,16 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -1047,6 +1059,8 @@ def ProcessorFeatures { list TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1184,6 +1198,10 @@ def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td new file mode 100644 index 000000000000..32f7299fbf87 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1136 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). 
+ +def Ampere1Model : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 174; // micro-op re-order buffer size + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F); +} + +let SchedModel = Ampere1Model in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. + +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1.
+ +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let 
Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let 
Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency 
= 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; 
+ let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : 
SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : 
SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. 
+ +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 18; +} // 32-bit Divide +def : WriteRes { + let Latency = 34; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 64-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 4; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 5; +} // Floating-point compare. +def : WriteRes { + let Latency = 6; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 5; +} // Floating-point multiply. +def : WriteRes { + let Latency = 34; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 5; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores.
+ +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 4; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1. + +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector 
load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element 
structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register 
+def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : 
InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex 
"^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex 
"^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : 
InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFUMi, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S], 
(instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal, 4H/4S +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal, 8B/8H +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// -- 
arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex 
"^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 000000000000..8552c07bda56 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ 
+//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td index fc13b23b4cf8..19a3780c7381 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,7 +53,7 @@ let FunctionMapper = "AArch64_AM::getShiftType" in { } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8a7e20237271..9f2753584db8 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -221,6 +221,12 @@ void AArch64Subtarget::initializeProperties() { // FIXME: remove this to enable 64-bit SLP if performance looks good. 
MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 061db926ee2b..1c14e0a55049 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -40,6 +40,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { enum ARMProcFamilyEnum : uint8_t { Others, A64FX, + Ampere1, AppleA7, AppleA10, AppleA11, diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td index b4bc35e191c0..0fae61fb55c5 100644 --- a/llvm/lib/Target/AVR/AVRCallingConv.td +++ b/llvm/lib/Target/AVR/AVRCallingConv.td @@ -27,6 +27,8 @@ def RetCC_AVR_BUILTIN : CallingConv<[ // Calling convention for variadic functions. def ArgCC_AVR_Vararg : CallingConv<[ + // i8 are always passed through the stack with a byte slot and byte alignment. + CCIfType<[i8], CCAssignToStack<1, 1>>, // i16 are always passed through the stack with an alignment of 1. CCAssignToStack<2, 1> ]>; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 2b96dc0b833a..7e027369f096 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -194,6 +194,11 @@ def brtarget_13 : Operand { let EncoderMethod = "encodeRelCondBrTarget"; } +def rcalltarget_13 : Operand { + let PrintMethod = "printPCRelImm"; + let EncoderMethod = "encodeRelCondBrTarget"; +} + // The target of a 22 or 16-bit call/jmp instruction. def call_target : Operand { let EncoderMethod = "encodeCallTarget"; @@ -965,10 +970,8 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in { let isCall = 1 in { // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. 
- let Uses = [SP] in def RCALLk : FBRk<1, (outs), - (ins brtarget_13 - : $target), - "rcall\t$target", []>; + let Uses = [SP] in def RCALLk : FBRk<1, (outs), (ins rcalltarget_13:$k), + "rcall\t$k", [(AVRcall imm:$k)]>; // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. @@ -985,13 +988,10 @@ let isCall = 1 in { // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. // - //: TODO: the imm field can be either 16 or 22 bits in devices with more + // TODO: the imm field can be either 16 or 22 bits in devices with more // than 64k of ROM, fix it once we support the largest devices. - let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), - (ins call_target - : $k), - "call\t$k", [(AVRcall imm - : $k)]>, + let Uses = [SP] in def CALLk : F32BRk<0b111, (outs), (ins call_target:$k), + "call\t$k", [(AVRcall imm:$k)]>, Requires<[HasJMPCALL]>; } @@ -2457,8 +2457,12 @@ def : Pat<(adde i8 : $src2))>; // Calls. 
-def : Pat<(AVRcall(i16 tglobaladdr : $dst)), (CALLk tglobaladdr : $dst)>; -def : Pat<(AVRcall(i16 texternalsym : $dst)), (CALLk texternalsym : $dst)>; +let Predicates = [HasJMPCALL] in { + def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (CALLk tglobaladdr:$dst)>; + def : Pat<(AVRcall(i16 texternalsym:$dst)), (CALLk texternalsym:$dst)>; +} +def : Pat<(AVRcall(i16 tglobaladdr:$dst)), (RCALLk tglobaladdr:$dst)>; +def : Pat<(AVRcall(i16 texternalsym:$dst)), (RCALLk texternalsym:$dst)>; // `anyext` def : Pat<(i16(anyext i8 diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index c83796b8579b..9eb546d1b5dc 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -37,6 +37,8 @@ class SystemZMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; MCContext &Ctx; + mutable unsigned MemOpsEmitted; + public: SystemZMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : MCII(mcii), Ctx(ctx) { @@ -165,6 +167,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, verifyInstructionPredicates(MI, computeAvailableFeatures(STI.getFeatureBits())); + MemOpsEmitted = 0; uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); unsigned Size = MCII.get(MI.getOpcode()).getSize(); // Big-endian insertion of Size bytes. @@ -191,12 +194,14 @@ getDispOpValue(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, SystemZ::FixupKind Kind) const { const MCOperand &MO = MI.getOperand(OpNum); - if (MO.isImm()) + if (MO.isImm()) { + ++MemOpsEmitted; return static_cast(MO.getImm()); + } if (MO.isExpr()) { // All instructions follow the pattern where the first displacement has a // 2 bytes offset, and the second one 4 bytes. - unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4; + unsigned ByteOffs = MemOpsEmitted++ == 0 ? 
2 : 4; Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind, MI.getLoc())); assert(Fixups.size() <= 2 && "More than two memory operands in MI?"); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f10651d5c5d7..d795697b3ed6 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -6442,22 +6442,26 @@ SDValue SystemZTargetLowering::combineINT_TO_FP( SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.Level != BeforeLegalizeTypes) return SDValue(); + SelectionDAG &DAG = DCI.DAG; + LLVMContext &Ctx = *DAG.getContext(); unsigned Opcode = N->getOpcode(); EVT OutVT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; + Type *OutLLVMTy = OutVT.getTypeForEVT(Ctx); SDValue Op = N->getOperand(0); - unsigned OutScalarBits = OutVT.getScalarSizeInBits(); + unsigned OutScalarBits = OutLLVMTy->getScalarSizeInBits(); unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits(); // Insert an extension before type-legalization to avoid scalarization, e.g.: // v2f64 = uint_to_fp v2i16 // => // v2f64 = uint_to_fp (v2i64 zero_extend v2i16) - if (OutVT.isVector() && OutScalarBits > InScalarBits) { - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()), - OutVT.getVectorNumElements()); + if (OutLLVMTy->isVectorTy() && OutScalarBits > InScalarBits && + OutScalarBits <= 64) { + unsigned NumElts = cast(OutLLVMTy)->getNumElements(); + EVT ExtVT = EVT::getVectorVT( + Ctx, EVT::getIntegerVT(Ctx, OutLLVMTy->getScalarSizeInBits()), NumElts); unsigned ExtOpcode = - (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); + (Opcode == ISD::UINT_TO_FP ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op); return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 682932b8f3e6..8bb7e81e19bb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37558,7 +37558,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, (RootVT.is128BitVector() && Subtarget.hasVLX())) && (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { - if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + // Bail if this was already a truncation or PACK node. + // We sometimes fail to match PACK if we demand known undef elements. + if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || + Root.getOpcode() == X86ISD::PACKSS || + Root.getOpcode() == X86ISD::PACKUS)) return SDValue(); // Nothing to do! ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2aab79e89078..7eaa28bd1320 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2488,8 +2488,12 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B, // not create unnecessary casts if the types already match. Type *SelTy = A->getType(); if (auto *VecTy = dyn_cast(Cond->getType())) { + // For a fixed or scalable vector get N from <{vscale x} N x iM> unsigned Elts = VecTy->getElementCount().getKnownMinValue(); - Type *EltTy = Builder.getIntNTy(SelTy->getPrimitiveSizeInBits() / Elts); + // For a fixed or scalable vector, get the size in bits of N x iM; for a + // scalar this is just M. 
+ unsigned SelEltSize = SelTy->getPrimitiveSizeInBits().getKnownMinSize(); + Type *EltTy = Builder.getIntNTy(SelEltSize / Elts); SelTy = VectorType::get(EltTy, VecTy->getElementCount()); } Value *BitcastC = Builder.CreateBitCast(C, SelTy); diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index c34da51e6dc1..fa1cfc84e4fd 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -342,7 +342,8 @@ static void findReturnsToZap(Function &F, } static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, - DomTreeUpdater &DTU) { + DomTreeUpdater &DTU, + BasicBlock *&NewUnreachableBB) { SmallPtrSet FeasibleSuccessors; bool HasNonFeasibleEdges = false; for (BasicBlock *Succ : successors(BB)) { @@ -385,6 +386,23 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, } else if (FeasibleSuccessors.size() > 1) { SwitchInstProfUpdateWrapper SI(*cast(TI)); SmallVector Updates; + + // If the default destination is unfeasible it will never be taken. Replace + // it with a new block with a single Unreachable instruction. 
+ BasicBlock *DefaultDest = SI->getDefaultDest(); + if (!FeasibleSuccessors.contains(DefaultDest)) { + if (!NewUnreachableBB) { + NewUnreachableBB = + BasicBlock::Create(DefaultDest->getContext(), "default.unreachable", + DefaultDest->getParent(), DefaultDest); + new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + } + + SI->setDefaultDest(NewUnreachableBB); + Updates.push_back({DominatorTree::Delete, BB, DefaultDest}); + Updates.push_back({DominatorTree::Insert, BB, NewUnreachableBB}); + } + for (auto CI = SI->case_begin(); CI != SI->case_end();) { if (FeasibleSuccessors.contains(CI->getCaseSuccessor())) { ++CI; @@ -532,8 +550,9 @@ bool llvm::runIPSCCP( NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(), /*PreserveLCSSA=*/false, &DTU); + BasicBlock *NewUnreachableBB = nullptr; for (BasicBlock &BB : F) - MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU, NewUnreachableBB); for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB);