From 8746d127c04f5bbaf6c6e88cef8606ca5a6a54e9 Mon Sep 17 00:00:00 2001
From: Dimitry Andric
Date: Thu, 13 Jul 2017 19:25:38 +0000
Subject: [PATCH 1/4] Vendor import of clang trunk r307894:
 https://llvm.org/svn/llvm-project/cfe/trunk@307894

---
 bindings/python/clang/cindex.py | 11 +-
 bindings/python/tests/cindex/test_cursor.py | 16 +
 docs/ControlFlowIntegrityDesign.rst | 38 +-
 docs/LibASTMatchersReference.html | 32 +
 docs/ReleaseNotes.rst | 13 +
 docs/UsersManual.rst | 13 +
 include/clang-c/Index.h | 5 +
 include/clang/AST/DeclBase.h | 2 +-
 include/clang/AST/DeclCXX.h | 13 +
 include/clang/AST/ExternalASTMerger.h | 4 -
 include/clang/AST/RecursiveASTVisitor.h | 1 +
 include/clang/ASTMatchers/ASTMatchers.h | 8 +
 include/clang/Analysis/AnalysisContext.h | 1 +
 include/clang/Analysis/CFG.h | 45 +-
 include/clang/Analysis/CloneDetection.h | 188 ++
 include/clang/Basic/DiagnosticDriverKinds.td | 2 +
 .../clang/Basic/DiagnosticFrontendKinds.td | 2 +
 include/clang/Basic/DiagnosticSemaKinds.td | 34 +-
 .../Basic/DiagnosticSerializationKinds.td | 14 +-
 include/clang/Basic/Linkage.h | 24 +-
 include/clang/Basic/PartialDiagnostic.h | 9 +
 include/clang/Basic/TargetInfo.h | 9 +
 include/clang/Basic/arm_neon.td | 185 --
 include/clang/CodeGen/CodeGenABITypes.h | 8 +
 include/clang/Driver/Options.td | 14 +
 include/clang/Driver/ToolChain.h | 3 +-
 include/clang/Format/Format.h | 7 +-
 include/clang/Frontend/FrontendActions.h | 2 +
 include/clang/Lex/Preprocessor.h | 9 +-
 include/clang/Sema/DelayedDiagnostic.h | 14 +-
 include/clang/Sema/Sema.h | 32 +-
 include/clang/Sema/TemplateDeduction.h | 6 +
 .../StaticAnalyzer/Core/AnalyzerOptions.h | 22 +-
 .../StaticAnalyzer/Core/CheckerManager.h | 2 +-
 .../Core/PathSensitive/BasicValueFactory.h | 2 +-
 include/clang/Tooling/ArgumentsAdjusters.h | 4 +
 include/clang/Tooling/Core/Diagnostic.h | 6 +-
 include/clang/Tooling/Tooling.h | 5 +-
 lib/AST/ASTContext.cpp | 9 +-
 lib/AST/Decl.cpp | 81 +-
 lib/AST/DeclBase.cpp | 6 +-
 lib/AST/DeclCXX.cpp | 78 +
 lib/AST/ExprConstant.cpp | 20 +-
 lib/AST/ExternalASTMerger.cpp | 5 -
 lib/AST/ODRHash.cpp | 18 +
 lib/Analysis/AnalysisDeclContext.cpp | 2 +
 lib/Analysis/CFG.cpp | 246 ++-
 lib/Analysis/CloneDetection.cpp | 217 +--
 lib/Basic/Targets.cpp | 121 +-
 lib/CodeGen/BackendUtil.cpp | 2 +-
 lib/CodeGen/CGBlocks.cpp | 19 +-
 lib/CodeGen/CGBuiltin.cpp | 212 +--
 lib/CodeGen/CGCall.cpp | 2 +-
 lib/CodeGen/CGClass.cpp | 82 -
 lib/CodeGen/CGDecl.cpp | 22 +-
 lib/CodeGen/CGExpr.cpp | 38 +-
 lib/CodeGen/CGExprCXX.cpp | 3 +-
 lib/CodeGen/CGOpenMPRuntime.cpp | 19 +-
 lib/CodeGen/CGStmtOpenMP.cpp | 13 +-
 lib/CodeGen/CodeGenABITypes.cpp | 16 +
 lib/CodeGen/CodeGenFunction.h | 12 +-
 lib/CodeGen/CodeGenModule.cpp | 73 +-
 lib/CodeGen/CodeGenModule.h | 10 +-
 lib/CodeGen/CodeGenTypeCache.h | 2 +-
 lib/CodeGen/ItaniumCXXABI.cpp | 12 +-
 lib/CodeGen/MicrosoftCXXABI.cpp | 2 +
 lib/CodeGen/TargetInfo.cpp | 47 +
 lib/CodeGen/TargetInfo.h | 16 +
 lib/Driver/Driver.cpp | 29 +-
 lib/Driver/ToolChain.cpp | 6 +-
 lib/Driver/ToolChains/Arch/ARM.cpp | 12 +
 lib/Driver/ToolChains/Arch/Mips.cpp | 17 +-
 lib/Driver/ToolChains/BareMetal.cpp | 3 +-
 lib/Driver/ToolChains/BareMetal.h | 3 +-
 lib/Driver/ToolChains/Clang.cpp | 9 +-
 lib/Driver/ToolChains/CommonArgs.cpp | 1 +
 lib/Driver/ToolChains/Cuda.cpp | 55 +-
 lib/Driver/ToolChains/Cuda.h | 3 +-
 lib/Driver/ToolChains/Darwin.cpp | 26 +-
 lib/Driver/ToolChains/Darwin.h | 3 +-
 lib/Driver/ToolChains/Fuchsia.cpp | 86 +-
 lib/Driver/ToolChains/Fuchsia.h | 23 +-
 lib/Driver/ToolChains/Gnu.cpp | 3 +-
 lib/Driver/ToolChains/Gnu.h | 3 +-
 lib/Driver/ToolChains/Hexagon.cpp | 3 +-
 lib/Driver/ToolChains/Hexagon.h | 3 +-
 lib/Driver/ToolChains/NetBSD.cpp | 13 +
 lib/Driver/ToolChains/NetBSD.h | 1 +
 lib/Driver/ToolChains/WebAssembly.cpp | 3 +-
 lib/Driver/ToolChains/WebAssembly.h | 3 +-
 lib/Driver/ToolChains/XCore.cpp | 3 +-
 lib/Driver/ToolChains/XCore.h | 3 +-
 lib/Format/ContinuationIndenter.cpp | 42 +-
 lib/Format/Format.cpp | 7 +
 lib/Format/FormatToken.h | 27 +-
 lib/Format/TokenAnnotator.cpp | 65 +-
 lib/Format/UnwrappedLineParser.cpp | 33 +-
 lib/Frontend/FrontendActions.cpp | 10 +
 lib/Frontend/Rewrite/RewriteModernObjC.cpp | 11 +-
 lib/Frontend/Rewrite/RewriteObjC.cpp | 2 +-
 lib/Frontend/SerializedDiagnosticReader.cpp | 3 +
 lib/Frontend/TextDiagnostic.cpp | 11 +-
 lib/Headers/bmiintrin.h | 2 +-
 lib/Headers/cpuid.h | 118 +-
 lib/Headers/immintrin.h | 18 +-
 lib/Headers/mmintrin.h | 2 +-
 lib/Index/IndexBody.cpp | 26 +-
 lib/Index/IndexDecl.cpp | 2 +
 lib/Index/IndexSymbol.cpp | 4 +-
 lib/Index/IndexingContext.cpp | 14 +-
 lib/Lex/Lexer.cpp | 2 +-
 lib/Lex/PPLexerChange.cpp | 6 -
 lib/Lex/Preprocessor.cpp | 2 +
 lib/Parse/ParseCXXInlineMethods.cpp | 2 +-
 lib/Parse/ParseDecl.cpp | 2 +-
 lib/Parse/ParseExpr.cpp | 4 +-
 lib/Parse/ParseObjc.cpp | 2 +-
 lib/Parse/Parser.cpp | 4 +
 lib/Sema/AnalysisBasedWarnings.cpp | 19 +-
 lib/Sema/DelayedDiagnostic.cpp | 6 +-
 lib/Sema/Sema.cpp | 12 +
 lib/Sema/SemaCast.cpp | 92 +-
 lib/Sema/SemaChecking.cpp | 35 +-
 lib/Sema/SemaCoroutine.cpp | 56 +-
 lib/Sema/SemaDecl.cpp | 40 +-
 lib/Sema/SemaDeclAttr.cpp | 115 +-
 lib/Sema/SemaDeclObjC.cpp | 5 +-
 lib/Sema/SemaExpr.cpp | 134 +-
 lib/Sema/SemaExprMember.cpp | 9 +-
 lib/Sema/SemaLambda.cpp | 2 +-
 lib/Sema/SemaLookup.cpp | 106 +-
 lib/Sema/SemaObjCProperty.cpp | 14 +-
 lib/Sema/SemaOpenMP.cpp | 43 +
 lib/Sema/SemaOverload.cpp | 31 +-
 lib/Sema/SemaPseudoObject.cpp | 9 +-
 lib/Sema/SemaStmt.cpp | 67 +-
 lib/Sema/SemaTemplate.cpp | 252 ++-
 lib/Serialization/ASTReader.cpp | 53 +
 lib/Serialization/ASTReaderDecl.cpp | 2 +
 lib/Serialization/ASTWriter.cpp | 2 +-
 lib/Serialization/ASTWriterDecl.cpp | 14 +-
 lib/StaticAnalyzer/Core/AnalysisManager.cpp | 3 +-
 lib/StaticAnalyzer/Core/AnalyzerOptions.cpp | 11 +
 lib/StaticAnalyzer/Core/ExprEngine.cpp | 2 +
 lib/StaticAnalyzer/Core/PathDiagnostic.cpp | 1 +
 lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp | 7 +-
 lib/Tooling/ArgumentsAdjusters.cpp | 23 +-
 lib/Tooling/Core/Diagnostic.cpp | 6 +-
 lib/Tooling/Tooling.cpp | 18 +-
 test/Analysis/analyzer-config.c | 5 +-
 test/Analysis/analyzer-config.cpp | 4 +-
 test/Analysis/enum.cpp | 30 +
 test/Analysis/lifetime-cfg-output.cpp | 783 ++++++++
 test/CXX/except/except.spec/p11.cpp | 4 +-
 .../basic/basic.def.odr/p4/module.cpp | 46 +
 .../basic/basic.def.odr/p4/module.cppm | 118 ++
 .../basic/basic.def.odr/p4/user.cpp | 25 +
 .../modules-ts/basic/basic.link/p2/module.cpp | 17 +
 .../basic/basic.link/p2/module.cppm | 29 +
 .../modules-ts/basic/basic.link/p2/other.cpp | 16 +
 .../dcl.module/dcl.module.import/p1.cpp | 4 +-
 test/CXX/modules-ts/dcl.dcl/dcl.module/p5.cpp | 33 +
 test/CodeGen/aarch64-neon-intrinsics.c | 230 +--
 test/CodeGen/aarch64-neon-ldst-one.c | 228 +--
 test/CodeGen/aarch64-v8.2a-neon-intrinsics.c | 1633 -----------------
 test/CodeGen/address-space.c | 10 +-
 test/CodeGen/arm_neon_intrinsics.c | 240 +--
 test/CodeGen/bitscan-builtins.c | 3 +
 test/CodeGen/default-address-space.c | 30 +-
 test/CodeGen/mcount.c | 4 +
 test/CodeGen/ms-barriers-intrinsics.c | 6 +-
 test/CodeGen/no-devirt.cpp | 4 +-
 test/CodeGen/pgo-sample-thinlto-summary.c | 4 +-
 test/CodeGenCXX/amdgcn-automatic-variable.cpp | 10 +-
 .../cxx0x-initializer-stdinitializerlist.cpp | 143 +-
 ...irtualize-virtual-function-calls-final.cpp | 50 +
 test/CodeGenCXX/dllimport-memptr-global.cpp | 58 +
 .../vtable-available-externally.cpp | 5 +-
 test/CodeGenCXX/windows-itanium-type-info.cpp | 10 +-
 test/CodeGenOpenCL/address-spaces.cl | 2 +
 .../amdgcn-automatic-variable.cl | 8 +-
 test/CodeGenOpenCL/amdgpu-nullptr.cl | 48 +-
 test/CodeGenOpenCL/amdgpu-sizeof-alignof.cl | 70 +
 test/Driver/autocomplete.c | 4 +
 test/Driver/clang_f_opts.c | 4 +
 ... report spaces.c => crash-report-spaces.c} | 3 +-
 test/Driver/darwin-sdk-vs-os-version.c | 10 +
 test/Driver/fuchsia.c | 2 +-
 test/Driver/fuchsia.cpp | 5 +-
 test/Driver/mips-features.c | 23 +
 test/FixIt/fixit-add-synthesize-to-property.m | 14 +
 test/Import/direct/Inputs/S.c | 3 +
 test/Import/direct/test.c | 5 +
 test/Import/enum/Inputs/S.cpp | 4 +
 test/Import/enum/test.cpp | 4 +
 .../import-overrides/Inputs/Hierarchy.cpp | 9 +
 test/Import/import-overrides/test.cpp | 7 +
 test/Index/Core/index-source-invalid-name.cpp | 13 +
 test/Index/Core/index-source.cpp | 16 +
 test/Index/Core/index-source.m | 25 +
 test/Index/Inputs/empty.dia | 0
 test/Index/pipe-size.cl | 2 +-
 test/Index/print-type-declaration.cpp | 7 +
 test/Index/read-empty-diags.test | 2 +
 test/Index/usrs.m | 2 +-
 test/Misc/find-diagnostic-id.c | 5 +
 test/Modules/missing-flag.cpp | 4 +
 test/Modules/odr_hash.cpp | 78 +
 test/Modules/preprocess-build.cpp | 2 +-
 test/Modules/relative-dep-gen.cpp | 21 +-
 test/OpenMP/taskloop_ast_print.cpp | 8 +-
 test/OpenMP/taskloop_codegen.cpp | 6 +-
 test/OpenMP/taskloop_reduction_messages.cpp | 331 ++++
 test/OpenMP/taskloop_simd_ast_print.cpp | 8 +-
 test/OpenMP/taskloop_simd_codegen.cpp | 6 +-
 .../taskloop_simd_reduction_messages.cpp | 331 ++++
 test/Preprocessor/init.c | 12 +-
 .../objc-modern-metadata-visibility2.mm | 45 +
 test/Sema/address-packed.c | 9 +
 test/Sema/attr-availability.c | 25 +-
 test/Sema/attr-deprecated.c | 4 +-
 test/Sema/attr-unavailable-message.c | 8 +-
 test/Sema/loop-control.c | 48 +
 test/Sema/warn-cast-qual.c | 31 +
 test/Sema/warn-documentation.cpp | 2 +-
 test/SemaCXX/amdgpu-sizeof-alignof.cpp | 47 +
 test/SemaCXX/attr-deprecated.cpp | 8 +-
 test/SemaCXX/coroutines.cpp | 265 ++-
 test/SemaCXX/dllimport-memptr.cpp | 7 +
 test/SemaCXX/modules-ts.cppm | 8 +-
 test/SemaCXX/warn-cast-qual.cpp | 140 ++
 test/SemaCXX/warn-loop-analysis.cpp | 12 +
 test/SemaCXX/warn-throw-out-noexcept-func.cpp | 83 +-
 test/SemaObjC/attr-availability.m | 30 +-
 test/SemaObjC/default-synthesize-3.m | 4 +-
 test/SemaObjC/default-synthesize.m | 2 +-
 .../forward-protocol-incomplete-impl-warn.m | 2 +-
 test/SemaObjC/objc-container-subscripting-1.m | 5 +-
 test/SemaObjC/objc-container-subscripting-2.m | 19 +
 test/SemaObjC/unguarded-availability-new.m | 8 +-
 test/SemaObjC/unguarded-availability.m | 59 +-
 test/SemaOpenCL/cl20-device-side-enqueue.cl | 16 +-
 test/SemaOpenCL/images.cl | 33 +-
 test/SemaTemplate/constexpr-instantiate.cpp | 2 +-
 test/SemaTemplate/overload-candidates.cpp | 34 +-
 test/Unit/lit.cfg | 5 +-
 test/lit.cfg | 4 +-
 tools/c-index-test/c-index-test.c | 2 +
 tools/clang-import-test/clang-import-test.cpp | 18 +-
 tools/diagtool/CMakeLists.txt | 1 +
 tools/diagtool/FindDiagnosticID.cpp | 58 +
 tools/libclang/CIndex.cpp | 11 +
 tools/libclang/CXIndexDataConsumer.cpp | 4 +-
 tools/libclang/libclang.exports | 1 +
 unittests/ASTMatchers/ASTMatchersNodeTest.cpp | 6 +
 unittests/Format/CMakeLists.txt | 1 +
 unittests/Format/FormatTest.cpp | 25 +-
 unittests/Format/FormatTestComments.cpp | 64 +
 unittests/Format/FormatTestJS.cpp | 9 +
 unittests/Format/FormatTestProto.cpp | 20 +-
unittests/Format/FormatTestTextProto.cpp | 250 +++ unittests/Tooling/RecursiveASTVisitorTest.cpp | 58 + utils/TableGen/NeonEmitter.cpp | 6 +- utils/bash-autocomplete.sh | 54 +- utils/perf-training/lit.cfg | 3 +- utils/perf-training/order-files.lit.cfg | 3 +- www/analyzer/checker_dev_manual.html | 4 +- www/analyzer/scripts/expandcollapse.js | 2 +- www/cxx_status.html | 7 +- 269 files changed, 6853 insertions(+), 3739 deletions(-) create mode 100644 test/Analysis/lifetime-cfg-output.cpp create mode 100644 test/CXX/modules-ts/basic/basic.def.odr/p4/module.cpp create mode 100644 test/CXX/modules-ts/basic/basic.def.odr/p4/module.cppm create mode 100644 test/CXX/modules-ts/basic/basic.def.odr/p4/user.cpp create mode 100644 test/CXX/modules-ts/basic/basic.link/p2/module.cpp create mode 100644 test/CXX/modules-ts/basic/basic.link/p2/module.cppm create mode 100644 test/CXX/modules-ts/basic/basic.link/p2/other.cpp create mode 100644 test/CXX/modules-ts/dcl.dcl/dcl.module/p5.cpp delete mode 100644 test/CodeGen/aarch64-v8.2a-neon-intrinsics.c create mode 100644 test/CodeGenCXX/dllimport-memptr-global.cpp create mode 100644 test/CodeGenOpenCL/amdgpu-sizeof-alignof.cl rename test/Driver/{crash report spaces.c => crash-report-spaces.c} (84%) create mode 100644 test/Driver/darwin-sdk-vs-os-version.c create mode 100644 test/FixIt/fixit-add-synthesize-to-property.m create mode 100644 test/Import/direct/Inputs/S.c create mode 100644 test/Import/direct/test.c create mode 100644 test/Import/enum/Inputs/S.cpp create mode 100644 test/Import/enum/test.cpp create mode 100644 test/Import/import-overrides/Inputs/Hierarchy.cpp create mode 100644 test/Import/import-overrides/test.cpp create mode 100644 test/Index/Core/index-source-invalid-name.cpp create mode 100644 test/Index/Inputs/empty.dia create mode 100644 test/Index/read-empty-diags.test create mode 100644 test/Misc/find-diagnostic-id.c create mode 100644 test/Modules/missing-flag.cpp create mode 100644 test/OpenMP/taskloop_reduction_messages.cpp create mode 100644 test/OpenMP/taskloop_simd_reduction_messages.cpp create mode 100644 test/Rewriter/objc-modern-metadata-visibility2.mm create mode 100644 test/SemaCXX/amdgpu-sizeof-alignof.cpp create mode 100644 test/SemaCXX/dllimport-memptr.cpp create mode 100644 test/SemaCXX/warn-cast-qual.cpp create mode 100644 tools/diagtool/FindDiagnosticID.cpp create mode 100644 unittests/Format/FormatTestTextProto.cpp diff --git a/bindings/python/clang/cindex.py b/bindings/python/clang/cindex.py index 5e70bde770f8..236803a9ab9b 100644 --- a/bindings/python/clang/cindex.py +++ b/bindings/python/clang/cindex.py @@ -782,7 +782,7 @@ def __repr__(self): # A C++ template type parameter CursorKind.TEMPLATE_TYPE_PARAMETER = CursorKind(27) -# A C++ non-type template paramater. +# A C++ non-type template parameter. CursorKind.TEMPLATE_NON_TYPE_PARAMETER = CursorKind(28) # A C++ template template parameter. @@ -1478,6 +1478,11 @@ def is_virtual_method(self): """ return conf.lib.clang_CXXMethod_isVirtual(self) + def is_scoped_enum(self): + """Returns True if the cursor refers to a scoped enum declaration. 
+ """ + return conf.lib.clang_EnumDecl_isScoped(self) + def get_definition(self): """ If the cursor is a reference to a declaration or a declaration of @@ -3314,6 +3319,10 @@ def cursor(self): [Cursor], bool), + ("clang_EnumDecl_isScoped", + [Cursor], + bool), + ("clang_defaultDiagnosticDisplayOptions", [], c_uint), diff --git a/bindings/python/tests/cindex/test_cursor.py b/bindings/python/tests/cindex/test_cursor.py index 8103e96df4f9..4787ea931e13 100644 --- a/bindings/python/tests/cindex/test_cursor.py +++ b/bindings/python/tests/cindex/test_cursor.py @@ -255,6 +255,22 @@ def test_is_virtual_method(): assert foo.is_virtual_method() assert not bar.is_virtual_method() +def test_is_scoped_enum(): + """Ensure Cursor.is_scoped_enum works.""" + source = 'class X {}; enum RegularEnum {}; enum class ScopedEnum {};' + tu = get_tu(source, lang='cpp') + + cls = get_cursor(tu, 'X') + regular_enum = get_cursor(tu, 'RegularEnum') + scoped_enum = get_cursor(tu, 'ScopedEnum') + assert cls is not None + assert regular_enum is not None + assert scoped_enum is not None + + assert not cls.is_scoped_enum() + assert not regular_enum.is_scoped_enum() + assert scoped_enum.is_scoped_enum() + def test_underlying_type(): tu = get_tu('typedef int foo;') typedef = get_cursor(tu, 'foo') diff --git a/docs/ControlFlowIntegrityDesign.rst b/docs/ControlFlowIntegrityDesign.rst index 69b72f9ea5b2..e4225b35476a 100644 --- a/docs/ControlFlowIntegrityDesign.rst +++ b/docs/ControlFlowIntegrityDesign.rst @@ -437,12 +437,17 @@ export this information, every DSO implements .. code-block:: none - void __cfi_check(uint64 CallSiteTypeId, void *TargetAddr) + void __cfi_check(uint64 CallSiteTypeId, void *TargetAddr, void *DiagData) -This function provides external modules with access to CFI checks for the -targets inside this DSO. For each known ``CallSiteTypeId``, this function -performs an ``llvm.type.test`` with the corresponding type identifier. It -aborts if the type is unknown, or if the check fails. +This function provides external modules with access to CFI checks for +the targets inside this DSO. For each known ``CallSiteTypeId``, this +function performs an ``llvm.type.test`` with the corresponding type +identifier. It reports an error if the type is unknown, or if the +check fails. Depending on the values of compiler flags +``-fsanitize-trap`` and ``-fsanitize-recover``, this function may +print an error, abort and/or return to the caller. ``DiagData`` is an +opaque pointer to the diagnostic information about the error, or +``null`` if the caller does not provide this information. The basic implementation is a large switch statement over all values of CallSiteTypeId supported by this DSO, and each case is similar to @@ -452,11 +457,10 @@ CFI Shadow ---------- To route CFI checks to the target DSO's __cfi_check function, a -mapping from possible virtual / indirect call targets to -the corresponding __cfi_check functions is maintained. This mapping is +mapping from possible virtual / indirect call targets to the +corresponding __cfi_check functions is maintained. This mapping is implemented as a sparse array of 2 bytes for every possible page (4096 -bytes) of memory. The table is kept readonly (FIXME: not yet) most of -the time. +bytes) of memory. The table is kept readonly most of the time. There are 3 types of shadow values: @@ -481,14 +485,24 @@ them. CFI_SlowPath ------------ -The slow path check is implemented in compiler-rt library as +The slow path check is implemented in a runtime support library as .. 
code-block:: none void __cfi_slowpath(uint64 CallSiteTypeId, void *TargetAddr) + void __cfi_slowpath_diag(uint64 CallSiteTypeId, void *TargetAddr, void *DiagData) -This functions loads a shadow value for ``TargetAddr``, finds the -address of __cfi_check as described above and calls that. +These functions loads a shadow value for ``TargetAddr``, finds the +address of ``__cfi_check`` as described above and calls +that. ``DiagData`` is an opaque pointer to diagnostic data which is +passed verbatim to ``__cfi_check``, and ``__cfi_slowpath`` passes +``nullptr`` instead. + +Compiler-RT library contains reference implementations of slowpath +functions, but they have unresolvable issues with correctness and +performance in the handling of dlopen(). It is recommended that +platforms provide their own implementations, usually as part of libc +or libdl. Position-independent executable requirement ------------------------------------------- diff --git a/docs/LibASTMatchersReference.html b/docs/LibASTMatchersReference.html index a0403a5edf65..cb5020af49c6 100644 --- a/docs/LibASTMatchersReference.html +++ b/docs/LibASTMatchersReference.html @@ -1872,6 +1872,14 @@ floatLiteral(equals(3.14)) and floatLiteral(equals(314e-2)) integerLiteral(equals(42)) matches 42 +Note that you cannot directly match a negative numeric literal because the +minus sign is not part of the literal: It is a unary operator whose operand +is the positive numeric literal. Instead, you must use a unaryOperator() +matcher to match the minus sign: + +unaryOperator(hasOperatorName("-"), + hasUnaryOperand(integerLiteral(equals(13)))) + Usable as: Matcher<CharacterLiteral>, Matcher<CXXBoolLiteralExpr>, Matcher<FloatingLiteral>, Matcher<IntegerLiteral> @@ -2327,6 +2335,14 @@ floatLiteral(equals(3.14)) and floatLiteral(equals(314e-2)) integerLiteral(equals(42)) matches 42 +Note that you cannot directly match a negative numeric literal because the +minus sign is not part of the literal: It is a unary operator whose operand +is the positive numeric literal. Instead, you must use a unaryOperator() +matcher to match the minus sign: + +unaryOperator(hasOperatorName("-"), + hasUnaryOperand(integerLiteral(equals(13)))) + Usable as: Matcher<CharacterLiteral>, Matcher<CXXBoolLiteralExpr>, Matcher<FloatingLiteral>, Matcher<IntegerLiteral> @@ -2583,6 +2599,14 @@ floatLiteral(equals(3.14)) and floatLiteral(equals(314e-2)) integerLiteral(equals(42)) matches 42 +Note that you cannot directly match a negative numeric literal because the +minus sign is not part of the literal: It is a unary operator whose operand +is the positive numeric literal. Instead, you must use a unaryOperator() +matcher to match the minus sign: + +unaryOperator(hasOperatorName("-"), + hasUnaryOperand(integerLiteral(equals(13)))) + Usable as: Matcher<CharacterLiteral>, Matcher<CXXBoolLiteralExpr>, Matcher<FloatingLiteral>, Matcher<IntegerLiteral> @@ -2866,6 +2890,14 @@ floatLiteral(equals(3.14)) and floatLiteral(equals(314e-2)) integerLiteral(equals(42)) matches 42 +Note that you cannot directly match a negative numeric literal because the +minus sign is not part of the literal: It is a unary operator whose operand +is the positive numeric literal. 
Instead, you must use a unaryOperator() +matcher to match the minus sign: + +unaryOperator(hasOperatorName("-"), + hasUnaryOperand(integerLiteral(equals(13)))) + Usable as: Matcher<CharacterLiteral>, Matcher<CXXBoolLiteralExpr>, Matcher<FloatingLiteral>, Matcher<IntegerLiteral> diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index f9a3317811eb..8f1515dafd97 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -52,6 +52,9 @@ Major New Features Improvements to Clang's diagnostics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- -Wcast-qual was implemented for C++. C-style casts are now properly + diagnosed. + - -Wunused-lambda-capture warns when a variable explicitly captured by a lambda is not used in the body of the lambda. @@ -60,6 +63,16 @@ New Compiler Flags The option .... +Deprecated Compiler Flags +------------------------- + +The following options are deprecated and ignored. They will be removed in +future versions of Clang. + +- -fslp-vectorize-aggressive used to enable the BB vectorizing pass. They have been superseeded + by the normal SLP vectorizer. +- -fno-slp-vectorize-aggressive used to be the default behavior of clang. + New Pragmas in Clang ----------------------- diff --git a/docs/UsersManual.rst b/docs/UsersManual.rst index df6344af9102..387ec63e9c3d 100644 --- a/docs/UsersManual.rst +++ b/docs/UsersManual.rst @@ -332,6 +332,19 @@ output format of the diagnostics that it generates. using a structured YAML format, users can parse or sort the remarks in a convenient way. +.. _opt_foptimization-record-file: + +**-foptimization-record-file** + Control the file to which optimization reports are written. + + When optimization reports are being output (see + :ref:`-fsave-optimization-record `), this + option controls the file to which those reports are written. + + If this option is not used, optimization records are output to a file named + after the primary file being compiled. If that's "foo.c", for example, + optimization records are output to "foo.opt.yaml". + .. _opt_fdiagnostics-show-hotness: **-f[no-]diagnostics-show-hotness** diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h index f404e6d72ec9..09f4403556c8 100644 --- a/include/clang-c/Index.h +++ b/include/clang-c/Index.h @@ -4416,6 +4416,11 @@ CINDEX_LINKAGE unsigned clang_CXXMethod_isStatic(CXCursor C); */ CINDEX_LINKAGE unsigned clang_CXXMethod_isVirtual(CXCursor C); +/** + * \brief Determine if an enum declaration refers to a scoped enum. + */ +CINDEX_LINKAGE unsigned clang_EnumDecl_isScoped(CXCursor C); + /** * \brief Determine if a C++ member function or member function template is * declared 'const'. diff --git a/include/clang/AST/DeclBase.h b/include/clang/AST/DeclBase.h index 0f1f481ae49b..041f0fd484d4 100644 --- a/include/clang/AST/DeclBase.h +++ b/include/clang/AST/DeclBase.h @@ -749,7 +749,7 @@ class LLVM_ALIGNAS(/*alignof(uint64_t)*/ 8) Decl { /// Set that this declaration is globally visible, even if it came from a /// module that is not visible. 
void setVisibleDespiteOwningModule() { - if (hasOwningModule()) + if (isHidden()) setModuleOwnershipKind(ModuleOwnershipKind::Visible); } diff --git a/include/clang/AST/DeclCXX.h b/include/clang/AST/DeclCXX.h index 6965e8143ff6..9d64f0244ec3 100644 --- a/include/clang/AST/DeclCXX.h +++ b/include/clang/AST/DeclCXX.h @@ -1886,6 +1886,19 @@ class CXXMethodDecl : public FunctionDecl { return (CD->begin_overridden_methods() != CD->end_overridden_methods()); } + /// If it's possible to devirtualize a call to this method, return the called + /// function. Otherwise, return null. + + /// \param Base The object on which this virtual function is called. + /// \param IsAppleKext True if we are compiling for Apple kext. + CXXMethodDecl *getDevirtualizedMethod(const Expr *Base, bool IsAppleKext); + + const CXXMethodDecl *getDevirtualizedMethod(const Expr *Base, + bool IsAppleKext) const { + return const_cast(this)->getDevirtualizedMethod( + Base, IsAppleKext); + } + /// \brief Determine whether this is a usual deallocation function /// (C++ [basic.stc.dynamic.deallocation]p2), which is an overloaded /// delete or delete[] operator with a particular signature. diff --git a/include/clang/AST/ExternalASTMerger.h b/include/clang/AST/ExternalASTMerger.h index 92d7b39c48d2..51d0c30ad23b 100644 --- a/include/clang/AST/ExternalASTMerger.h +++ b/include/clang/AST/ExternalASTMerger.h @@ -44,10 +44,6 @@ class ExternalASTMerger : public ExternalASTSource { FindExternalLexicalDecls(const DeclContext *DC, llvm::function_ref IsKindWeWant, SmallVectorImpl &Result) override; - - using ExternalASTSource::CompleteType; - - void CompleteType(TagDecl *Tag) override; }; } // end namespace clang diff --git a/include/clang/AST/RecursiveASTVisitor.h b/include/clang/AST/RecursiveASTVisitor.h index 152e05bca740..917b240428e7 100644 --- a/include/clang/AST/RecursiveASTVisitor.h +++ b/include/clang/AST/RecursiveASTVisitor.h @@ -1799,6 +1799,7 @@ DEF_TRAVERSE_DECL(CXXRecordDecl, { TRY_TO(TraverseCXXRecordHelper(D)); }) if (TypeSourceInfo *TSI = D->getTypeAsWritten()) \ TRY_TO(TraverseTypeLoc(TSI->getTypeLoc())); \ \ + TRY_TO(TraverseNestedNameSpecifierLoc(D->getQualifierLoc())); \ if (!getDerived().shouldVisitTemplateInstantiations() && \ D->getTemplateSpecializationKind() != TSK_ExplicitSpecialization) \ /* Returning from here skips traversing the \ diff --git a/include/clang/ASTMatchers/ASTMatchers.h b/include/clang/ASTMatchers/ASTMatchers.h index cba4c99be959..c9b496df33f7 100644 --- a/include/clang/ASTMatchers/ASTMatchers.h +++ b/include/clang/ASTMatchers/ASTMatchers.h @@ -3821,6 +3821,14 @@ AST_MATCHER_P(CompoundStmt, statementCountIs, unsigned, N) { /// integerLiteral(equals(42)) /// matches 42 /// +/// Note that you cannot directly match a negative numeric literal because the +/// minus sign is not part of the literal: It is a unary operator whose operand +/// is the positive numeric literal. 
Instead, you must use a unaryOperator() +/// matcher to match the minus sign: +/// +/// unaryOperator(hasOperatorName("-"), +/// hasUnaryOperand(integerLiteral(equals(13)))) +/// /// Usable as: Matcher, Matcher, /// Matcher, Matcher template diff --git a/include/clang/Analysis/AnalysisContext.h b/include/clang/Analysis/AnalysisContext.h index f6a47d646d1d..ec7549d4535c 100644 --- a/include/clang/Analysis/AnalysisContext.h +++ b/include/clang/Analysis/AnalysisContext.h @@ -426,6 +426,7 @@ class AnalysisDeclContextManager { bool addImplicitDtors = false, bool addInitializers = false, bool addTemporaryDtors = false, + bool addLifetime = false, bool synthesizeBodies = false, bool addStaticInitBranches = false, bool addCXXNewAllocator = true, diff --git a/include/clang/Analysis/CFG.h b/include/clang/Analysis/CFG.h index d23ed77ded13..97639bbfade2 100644 --- a/include/clang/Analysis/CFG.h +++ b/include/clang/Analysis/CFG.h @@ -58,6 +58,7 @@ class CFGElement { Statement, Initializer, NewAllocator, + LifetimeEnds, // dtor kind AutomaticObjectDtor, DeleteDtor, @@ -167,6 +168,28 @@ class CFGNewAllocator : public CFGElement { } }; +/// Represents the point where the lifetime of an automatic object ends +class CFGLifetimeEnds : public CFGElement { +public: + explicit CFGLifetimeEnds(const VarDecl *var, const Stmt *stmt) + : CFGElement(LifetimeEnds, var, stmt) {} + + const VarDecl *getVarDecl() const { + return static_cast(Data1.getPointer()); + } + + const Stmt *getTriggerStmt() const { + return static_cast(Data2.getPointer()); + } + +private: + friend class CFGElement; + CFGLifetimeEnds() {} + static bool isKind(const CFGElement &elem) { + return elem.getKind() == LifetimeEnds; + } +}; + /// CFGImplicitDtor - Represents C++ object destructor implicitly generated /// by compiler on various occasions. class CFGImplicitDtor : public CFGElement { @@ -701,6 +724,10 @@ class CFGBlock { Elements.push_back(CFGAutomaticObjDtor(VD, S), C); } + void appendLifetimeEnds(VarDecl *VD, Stmt *S, BumpVectorContext &C) { + Elements.push_back(CFGLifetimeEnds(VD, S), C); + } + void appendDeleteDtor(CXXRecordDecl *RD, CXXDeleteExpr *DE, BumpVectorContext &C) { Elements.push_back(CFGDeleteDtor(RD, DE), C); } @@ -717,6 +744,19 @@ class CFGBlock { *I = CFGAutomaticObjDtor(VD, S); return ++I; } + + // Scope leaving must be performed in reversed order. So insertion is in two + // steps. First we prepare space for some number of elements, then we insert + // the elements beginning at the last position in prepared space. 
+ iterator beginLifetimeEndsInsert(iterator I, size_t Cnt, + BumpVectorContext &C) { + return iterator( + Elements.insert(I.base(), Cnt, CFGLifetimeEnds(nullptr, nullptr), C)); + } + iterator insertLifetimeEnds(iterator I, VarDecl *VD, Stmt *S) { + *I = CFGLifetimeEnds(VD, S); + return ++I; + } }; /// \brief CFGCallback defines methods that should be called when a logical @@ -753,6 +793,7 @@ class CFG { bool AddEHEdges; bool AddInitializers; bool AddImplicitDtors; + bool AddLifetime; bool AddTemporaryDtors; bool AddStaticInitBranches; bool AddCXXNewAllocator; @@ -774,8 +815,10 @@ class CFG { BuildOptions() : forcedBlkExprs(nullptr), Observer(nullptr), - PruneTriviallyFalseEdges(true), AddEHEdges(false), + PruneTriviallyFalseEdges(true), + AddEHEdges(false), AddInitializers(false), AddImplicitDtors(false), + AddLifetime(false), AddTemporaryDtors(false), AddStaticInitBranches(false), AddCXXNewAllocator(false), AddCXXDefaultInitExprInCtors(false) {} }; diff --git a/include/clang/Analysis/CloneDetection.h b/include/clang/Analysis/CloneDetection.h index 1ca3514e69b0..6339deef41bd 100644 --- a/include/clang/Analysis/CloneDetection.h +++ b/include/clang/Analysis/CloneDetection.h @@ -15,6 +15,8 @@ #ifndef LLVM_CLANG_AST_CLONEDETECTION_H #define LLVM_CLANG_AST_CLONEDETECTION_H +#include "clang/AST/DeclTemplate.h" +#include "clang/AST/StmtVisitor.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -29,6 +31,192 @@ class VarDecl; class ASTContext; class CompoundStmt; +namespace clone_detection { + +/// Returns a string that represents all macro expansions that expanded into the +/// given SourceLocation. +/// +/// If 'getMacroStack(A) == getMacroStack(B)' is true, then the SourceLocations +/// A and B are expanded from the same macros in the same order. +std::string getMacroStack(SourceLocation Loc, ASTContext &Context); + +/// Collects the data of a single Stmt. +/// +/// This class defines what a code clone is: If it collects for two statements +/// the same data, then those two statements are considered to be clones of each +/// other. +/// +/// All collected data is forwarded to the given data consumer of the type T. +/// The data consumer class needs to provide a member method with the signature: +/// update(StringRef Str) +template +class StmtDataCollector : public ConstStmtVisitor> { + + ASTContext &Context; + /// The data sink to which all data is forwarded. + T &DataConsumer; + +public: + /// Collects data of the given Stmt. + /// \param S The given statement. + /// \param Context The ASTContext of S. + /// \param DataConsumer The data sink to which all data is forwarded. + StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer) + : Context(Context), DataConsumer(DataConsumer) { + this->Visit(S); + } + + typedef unsigned DataPiece; + + // Below are utility methods for appending different data to the vector. + + void addData(DataPiece Integer) { + DataConsumer.update( + StringRef(reinterpret_cast(&Integer), sizeof(Integer))); + } + + void addData(llvm::StringRef Str) { DataConsumer.update(Str); } + + void addData(const QualType &QT) { addData(QT.getAsString()); } + +// The functions below collect the class specific data of each Stmt subclass. + +// Utility macro for defining a visit method for a given class. This method +// calls back to the ConstStmtVisitor to visit all parent classes. 
+#define DEF_ADD_DATA(CLASS, CODE) \ + void Visit##CLASS(const CLASS *S) { \ + CODE; \ + ConstStmtVisitor::Visit##CLASS(S); \ + } + + DEF_ADD_DATA(Stmt, { + addData(S->getStmtClass()); + // This ensures that macro generated code isn't identical to macro-generated + // code. + addData(getMacroStack(S->getLocStart(), Context)); + addData(getMacroStack(S->getLocEnd(), Context)); + }) + DEF_ADD_DATA(Expr, { addData(S->getType()); }) + + //--- Builtin functionality ----------------------------------------------// + DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); }) + DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); }) + DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); }) + DEF_ADD_DATA(TypeTraitExpr, { + addData(S->getTrait()); + for (unsigned i = 0; i < S->getNumArgs(); ++i) + addData(S->getArg(i)->getType()); + }) + + //--- Calls --------------------------------------------------------------// + DEF_ADD_DATA(CallExpr, { + // Function pointers don't have a callee and we just skip hashing it. + if (const FunctionDecl *D = S->getDirectCallee()) { + // If the function is a template specialization, we also need to handle + // the template arguments as they are not included in the qualified name. + if (auto Args = D->getTemplateSpecializationArgs()) { + std::string ArgString; + + // Print all template arguments into ArgString + llvm::raw_string_ostream OS(ArgString); + for (unsigned i = 0; i < Args->size(); ++i) { + Args->get(i).print(Context.getLangOpts(), OS); + // Add a padding character so that 'foo()' != 'foo()'. + OS << '\n'; + } + OS.flush(); + + addData(ArgString); + } + addData(D->getQualifiedNameAsString()); + } + }) + + //--- Exceptions ---------------------------------------------------------// + DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); }) + + //--- C++ OOP Stmts ------------------------------------------------------// + DEF_ADD_DATA(CXXDeleteExpr, { + addData(S->isArrayFormAsWritten()); + addData(S->isGlobalDelete()); + }) + + //--- Casts --------------------------------------------------------------// + DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); }) + + //--- Miscellaneous Exprs ------------------------------------------------// + DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); }) + DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); }) + + //--- Control flow -------------------------------------------------------// + DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); }) + DEF_ADD_DATA(IndirectGotoStmt, { + if (S->getConstantTarget()) + addData(S->getConstantTarget()->getName()); + }) + DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); }) + DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); }) + DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); }) + + //--- Objective-C --------------------------------------------------------// + DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); }) + DEF_ADD_DATA(ObjCPropertyRefExpr, { + addData(S->isSuperReceiver()); + addData(S->isImplicitProperty()); + }) + DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); }) + + //--- Miscellaneous Stmts ------------------------------------------------// + DEF_ADD_DATA(CXXFoldExpr, { + addData(S->isRightFold()); + addData(S->getOperator()); + }) + DEF_ADD_DATA(GenericSelectionExpr, { + for (unsigned i = 0; i < S->getNumAssocs(); ++i) { + addData(S->getAssocType(i)); + } + }) + DEF_ADD_DATA(LambdaExpr, { + for (const LambdaCapture &C : S->captures()) { + 
addData(C.isPackExpansion()); + addData(C.getCaptureKind()); + if (C.capturesVariable()) + addData(C.getCapturedVar()->getType()); + } + addData(S->isGenericLambda()); + addData(S->isMutable()); + }) + DEF_ADD_DATA(DeclStmt, { + auto numDecls = std::distance(S->decl_begin(), S->decl_end()); + addData(static_cast(numDecls)); + for (const Decl *D : S->decls()) { + if (const VarDecl *VD = dyn_cast(D)) { + addData(VD->getType()); + } + } + }) + DEF_ADD_DATA(AsmStmt, { + addData(S->isSimple()); + addData(S->isVolatile()); + addData(S->generateAsmString(Context)); + for (unsigned i = 0; i < S->getNumInputs(); ++i) { + addData(S->getInputConstraint(i)); + } + for (unsigned i = 0; i < S->getNumOutputs(); ++i) { + addData(S->getOutputConstraint(i)); + } + for (unsigned i = 0; i < S->getNumClobbers(); ++i) { + addData(S->getClobber(i)); + } + }) + DEF_ADD_DATA(AttributedStmt, { + for (const Attr *A : S->getAttrs()) { + addData(std::string(A->getSpelling())); + } + }) +}; +} // namespace clone_detection + /// Identifies a list of statements. /// /// Can either identify a single arbitrary Stmt object, a continuous sequence of diff --git a/include/clang/Basic/DiagnosticDriverKinds.td b/include/clang/Basic/DiagnosticDriverKinds.td index 42e1e5edaf9e..a28d63182749 100644 --- a/include/clang/Basic/DiagnosticDriverKinds.td +++ b/include/clang/Basic/DiagnosticDriverKinds.td @@ -178,6 +178,8 @@ def warn_drv_optimization_value : Warning<"optimization level '%0' is not suppor InGroup; def warn_ignored_gcc_optimization : Warning<"optimization flag '%0' is not supported">, InGroup; +def warn_ignored_clang_option : Warning<"the flag '%0' has been deprecated and will be ignored">, + InGroup; def warn_drv_unsupported_opt_for_target : Warning< "optimization flag '%0' is not supported for target '%1'">, InGroup; diff --git a/include/clang/Basic/DiagnosticFrontendKinds.td b/include/clang/Basic/DiagnosticFrontendKinds.td index 8b4cb47e545d..57c24e9be73a 100644 --- a/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/include/clang/Basic/DiagnosticFrontendKinds.td @@ -179,6 +179,8 @@ def warn_incompatible_analyzer_plugin_api : Warning< def note_incompatible_analyzer_plugin_api : Note< "current API version is '%0', but plugin was compiled with version '%1'">; +def err_module_build_requires_fmodules : Error< + "module compilation requires '-fmodules'">; def err_module_interface_requires_modules_ts : Error< "module interface compilation requires '-fmodules-ts'">; def warn_module_config_mismatch : Warning< diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td index 136e48ab5e54..5a8750e4dab6 100644 --- a/include/clang/Basic/DiagnosticSemaKinds.td +++ b/include/clang/Basic/DiagnosticSemaKinds.td @@ -1029,6 +1029,8 @@ def warn_auto_synthesizing_protocol_property :Warning< "auto property synthesis will not synthesize property %0" " declared in protocol %1">, InGroup>; +def note_add_synthesize_directive : Note< + "add a '@synthesize' directive">; def warn_no_autosynthesis_shared_ivar_property : Warning < "auto property synthesis will not synthesize property " "%0 because it cannot share an ivar with another synthesized property">, @@ -2878,7 +2880,7 @@ def warn_partial_availability : Warning<"%0 is only available conditionally">, def warn_partial_availability_new : Warning, InGroup; def note_partial_availability_silence : Note< - "explicitly redeclare %0 to silence this warning">; + "annotate %select{%1|anonymous %1}0 with an availability attribute to silence">; def 
note_unguarded_available_silence : Note< "enclose %0 in %select{an @available|a __builtin_available}1 check to silence" " this warning">; @@ -3516,6 +3518,8 @@ def note_ovl_candidate_substitution_failure : Note< "candidate template ignored: substitution failure%0%1">; def note_ovl_candidate_disabled_by_enable_if : Note< "candidate template ignored: disabled by %0%1">; +def note_ovl_candidate_disabled_by_requirement : Note< + "candidate template ignored: requirement '%0' was not satisfied%1">; def note_ovl_candidate_has_pass_object_size_params: Note< "candidate address cannot be taken because parameter %0 has " "pass_object_size attribute">; @@ -4429,6 +4433,9 @@ def err_typename_nested_not_found : Error<"no type named %0 in %1">; def err_typename_nested_not_found_enable_if : Error< "no type named 'type' in %0; 'enable_if' cannot be used to disable " "this declaration">; +def err_typename_nested_not_found_requirement : Error< + "failed requirement '%0'; 'enable_if' cannot be used to disable this " + "declaration">; def err_typename_nested_not_type : Error< "typename specifier refers to non-type member %0 in %1">; def note_typename_refers_here : Note< @@ -6353,15 +6360,13 @@ def err_exceptions_disabled : Error< "cannot use '%0' with exceptions disabled">; def err_objc_exceptions_disabled : Error< "cannot use '%0' with Objective-C exceptions disabled">; -def warn_throw_in_noexcept_func - : Warning<"%0 has a non-throwing exception specification but can still " - "throw, resulting in unexpected program termination">, - InGroup; -def note_throw_in_dtor - : Note<"destructor or deallocator has a (possibly implicit) non-throwing " - "excepton specification">; -def note_throw_in_function - : Note<"non-throwing function declare here">; +def warn_throw_in_noexcept_func : Warning< + "%0 has a non-throwing exception specification but can still throw">, + InGroup; +def note_throw_in_dtor : Note< + "%select{destructor|deallocator}0 has a %select{non-throwing|implicit " + "non-throwing}1 exception specification">; +def note_throw_in_function : Note<"function declared non-throwing here">; def err_seh_try_outside_functions : Error< "cannot use SEH '__try' in blocks, captured regions, or Obj-C method decls">; def err_mixing_cxx_try_seh_try : Error< @@ -8455,8 +8460,6 @@ def err_opencl_builtin_to_addr_invalid_arg : Error< // OpenCL v2.0 s6.13.17 Enqueue kernel restrictions. 
def err_opencl_enqueue_kernel_incorrect_args : Error< "illegal call to enqueue_kernel, incorrect argument types">; -def err_opencl_enqueue_kernel_expected_type : Error< - "illegal call to enqueue_kernel, expected %0 argument type">; def err_opencl_enqueue_kernel_local_size_args : Error< "mismatch in number of block parameters and local size arguments passed">; def err_opencl_enqueue_kernel_invalid_local_size_type : Error< @@ -8466,6 +8469,9 @@ def err_opencl_enqueue_kernel_blocks_non_local_void_args : Error< def err_opencl_enqueue_kernel_blocks_no_args : Error< "blocks with parameters are not accepted in this prototype of enqueue_kernel call">; +def err_opencl_builtin_expected_type : Error< + "illegal call to %0, expected %1 argument type">; + // OpenCL v2.2 s2.1.2.3 - Vector Component Access def ext_opencl_ext_vector_type_rgba_selector: ExtWarn< "vector component name '%0' is an OpenCL version 2.2 feature">, @@ -8854,6 +8860,10 @@ def warn_omp_nesting_simd : Warning< def err_omp_orphaned_device_directive : Error< "orphaned 'omp %0' directives are prohibited" "; perhaps you forget to enclose the directive into a %select{|||target |teams }1region?">; +def err_omp_reduction_non_addressable_expression : Error< + "expected addressable reduction item for the task-based directives">; +def err_omp_reduction_with_nogroup : Error< + "'reduction' clause cannot be used with 'nogroup' clause">; } // end of OpenMP category let CategoryName = "Related Result Type Issue" in { diff --git a/include/clang/Basic/DiagnosticSerializationKinds.td b/include/clang/Basic/DiagnosticSerializationKinds.td index 3c64ebb9c7f4..0fc54848581c 100644 --- a/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/include/clang/Basic/DiagnosticSerializationKinds.td @@ -127,11 +127,11 @@ def err_module_odr_violation_mismatch_decl : Error< "%select{definition in module '%2'|defined here}1 found " "%select{end of class|public access specifier|private access specifier|" "protected access specifier|static assert|field|method|type alias|typedef|" - "data member}3">; + "data member|friend declaration}3">; def note_module_odr_violation_mismatch_decl : Note<"but in '%0' found " "%select{end of class|public access specifier|private access specifier|" "protected access specifier|static assert|field|method|type alias|typedef|" - "data member}1">; + "data member|friend declaration}1">; def err_module_odr_violation_mismatch_decl_diff : Error< "%q0 has different definitions in different modules; first difference is " @@ -166,6 +166,9 @@ def err_module_odr_violation_mismatch_decl_diff : Error< "data member %4 with%select{out|}5 an initializer|" "data member %4 with an initializer|" "data member %4 %select{is constexpr|is not constexpr}5|" + "friend %select{class|function}4|" + "friend %4|" + "friend function %4|" "}3">; def note_module_odr_violation_mismatch_decl_diff : Note<"but in '%0' found " @@ -199,18 +202,21 @@ def note_module_odr_violation_mismatch_decl_diff : Note<"but in '%0' found " "data member %2 with%select{out|}3 an initializer|" "data member %2 with a different initializer|" "data member %2 %select{is constexpr|is not constexpr}3|" + "friend %select{class|function}2|" + "friend %2|" + "friend function %2|" "}1">; def err_module_odr_violation_mismatch_decl_unknown : Error< "%q0 %select{with definition in module '%2'|defined here}1 has different " "definitions in different modules; first difference is this " "%select{||||static assert|field|method|type alias|typedef|data member|" - "unexpected decl}3">; + "friend 
declaration|unexpected decl}3">; def note_module_odr_violation_mismatch_decl_unknown : Note< "but in '%0' found " "%select{||||different static assert|different field|different method|" "different type alias|different typedef|different data member|" - "another unexpected decl}1">; + "different friend declaration|another unexpected decl}1">; def warn_duplicate_module_file_extension : Warning< "duplicate module file extension block name '%0'">, diff --git a/include/clang/Basic/Linkage.h b/include/clang/Basic/Linkage.h index e96fb568c009..6ec8763f2491 100644 --- a/include/clang/Basic/Linkage.h +++ b/include/clang/Basic/Linkage.h @@ -45,6 +45,17 @@ enum Linkage : unsigned char { /// translation units because of types defined in a inline function. VisibleNoLinkage, + /// \brief Internal linkage according to the Modules TS, but can be referred + /// to from other translation units indirectly through inline functions and + /// templates in the module interface. + ModuleInternalLinkage, + + /// \brief Module linkage, which indicates that the entity can be referred + /// to from other translation units within the same module, and indirectly + /// from arbitrary other translation units through inline functions and + /// templates in the module interface. + ModuleLinkage, + /// \brief External linkage, which indicates that the entity can /// be referred to from other translation units. ExternalLinkage @@ -74,15 +85,20 @@ inline bool isDiscardableGVALinkage(GVALinkage L) { } inline bool isExternallyVisible(Linkage L) { - return L == ExternalLinkage || L == VisibleNoLinkage; + return L >= VisibleNoLinkage; } inline Linkage getFormalLinkage(Linkage L) { - if (L == UniqueExternalLinkage) + switch (L) { + case UniqueExternalLinkage: return ExternalLinkage; - if (L == VisibleNoLinkage) + case VisibleNoLinkage: return NoLinkage; - return L; + case ModuleInternalLinkage: + return InternalLinkage; + default: + return L; + } } inline bool isExternalFormalLinkage(Linkage L) { diff --git a/include/clang/Basic/PartialDiagnostic.h b/include/clang/Basic/PartialDiagnostic.h index 53ce95cab1b0..b2f14afe5695 100644 --- a/include/clang/Basic/PartialDiagnostic.h +++ b/include/clang/Basic/PartialDiagnostic.h @@ -329,6 +329,15 @@ class PartialDiagnostic { bool hasStorage() const { return DiagStorage != nullptr; } + /// Retrieve the string argument at the given index. + StringRef getStringArg(unsigned I) { + assert(DiagStorage && "No diagnostic storage?"); + assert(I < DiagStorage->NumDiagArgs && "Not enough diagnostic args"); + assert(DiagStorage->DiagArgumentsKind[I] + == DiagnosticsEngine::ak_std_string && "Not a string arg"); + return DiagStorage->DiagArgumentsStr[I]; + } + friend const PartialDiagnostic &operator<<(const PartialDiagnostic &PD, unsigned I) { PD.AddTaggedVal(I, DiagnosticsEngine::ak_uint); diff --git a/include/clang/Basic/TargetInfo.h b/include/clang/Basic/TargetInfo.h index 9bdb288eef4f..5885532b91db 100644 --- a/include/clang/Basic/TargetInfo.h +++ b/include/clang/Basic/TargetInfo.h @@ -23,6 +23,7 @@ #include "clang/Basic/VersionTuple.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -954,6 +955,14 @@ class TargetInfo : public RefCountedBase { return *AddrSpaceMap; } + /// \brief Return an AST address space which can be used opportunistically + /// for constant global memory. 
It must be possible to convert pointers into + /// this address space to LangAS::Default. If no such address space exists, + /// this may return None, and such optimizations will be disabled. + virtual llvm::Optional getConstantAddressSpace() const { + return LangAS::Default; + } + /// \brief Retrieve the name of the platform as it is used in the /// availability attribute. StringRef getPlatformName() const { return PlatformName; } diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td index d5c16a91a34f..ad8d679a1664 100644 --- a/include/clang/Basic/arm_neon.td +++ b/include/clang/Basic/arm_neon.td @@ -227,7 +227,6 @@ def OP_UNAVAILABLE : Operation { // u: unsigned integer (int/float args) // f: float (int args) // F: double (int args) -// H: half (int args) // d: default // g: default, ignore 'Q' size modifier. // j: default, force 'Q' size modifier. @@ -346,7 +345,6 @@ def OP_MLSLHi : Op<(call "vmlsl", $p0, (call "vget_high", $p1), (call "vget_high", $p2))>; def OP_MLSLHi_N : Op<(call "vmlsl_n", $p0, (call "vget_high", $p1), $p2)>; def OP_MUL_N : Op<(op "*", $p0, (dup $p1))>; -def OP_MULX_N : Op<(call "vmulx", $p0, (dup $p1))>; def OP_MLA_N : Op<(op "+", $p0, (op "*", $p1, (dup $p2)))>; def OP_MLS_N : Op<(op "-", $p0, (op "*", $p1, (dup $p2)))>; def OP_FMLA_N : Op<(call "vfma", $p0, $p1, (dup $p2))>; @@ -1663,186 +1661,3 @@ def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "sssji", "SsSi", OP_SCALAR def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; } - -// ARMv8.2-A FP16 intrinsics. -let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { - - // ARMv8.2-A FP16 one-operand vector intrinsics. - - // Comparison - def CMEQH : SInst<"vceqz", "ud", "hQh">; - def CMGEH : SInst<"vcgez", "ud", "hQh">; - def CMGTH : SInst<"vcgtz", "ud", "hQh">; - def CMLEH : SInst<"vclez", "ud", "hQh">; - def CMLTH : SInst<"vcltz", "ud", "hQh">; - - // Vector conversion - def VCVT_F16 : SInst<"vcvt_f16", "Hd", "sUsQsQUs">; - def VCVT_S16 : SInst<"vcvt_s16", "xd", "hQh">; - def VCVT_U16 : SInst<"vcvt_u16", "ud", "hQh">; - def VCVTA_S16 : SInst<"vcvta_s16", "xd", "hQh">; - def VCVTA_U16 : SInst<"vcvta_u16", "ud", "hQh">; - def VCVTM_S16 : SInst<"vcvtm_s16", "xd", "hQh">; - def VCVTM_U16 : SInst<"vcvtm_u16", "ud", "hQh">; - def VCVTN_S16 : SInst<"vcvtn_s16", "xd", "hQh">; - def VCVTN_U16 : SInst<"vcvtn_u16", "ud", "hQh">; - def VCVTP_S16 : SInst<"vcvtp_s16", "xd", "hQh">; - def VCVTP_U16 : SInst<"vcvtp_u16", "ud", "hQh">; - - // Vector rounding - def FRINTZH : SInst<"vrnd", "dd", "hQh">; - def FRINTNH : SInst<"vrndn", "dd", "hQh">; - def FRINTAH : SInst<"vrnda", "dd", "hQh">; - def FRINTPH : SInst<"vrndp", "dd", "hQh">; - def FRINTMH : SInst<"vrndm", "dd", "hQh">; - def FRINTXH : SInst<"vrndx", "dd", "hQh">; - def FRINTIH : SInst<"vrndi", "dd", "hQh">; - - // Misc. - def VABSH : SInst<"vabs", "dd", "hQh">; - def VNEGH : SOpInst<"vneg", "dd", "hQh", OP_NEG>; - def VRECPEH : SInst<"vrecpe", "dd", "hQh">; - def FRSQRTEH : SInst<"vrsqrte", "dd", "hQh">; - def FSQRTH : SInst<"vsqrt", "dd", "hQh">; - - // ARMv8.2-A FP16 two-operands vector intrinsics. - - // Misc. 
- def VADDH : SOpInst<"vadd", "ddd", "hQh", OP_ADD>; - def VABDH : SInst<"vabd", "ddd", "hQh">; - def VSUBH : SOpInst<"vsub", "ddd", "hQh", OP_SUB>; - - // Comparison - let InstName = "vacge" in { - def VCAGEH : SInst<"vcage", "udd", "hQh">; - def VCALEH : SInst<"vcale", "udd", "hQh">; - } - let InstName = "vacgt" in { - def VCAGTH : SInst<"vcagt", "udd", "hQh">; - def VCALTH : SInst<"vcalt", "udd", "hQh">; - } - def VCEQH : SOpInst<"vceq", "udd", "hQh", OP_EQ>; - def VCGEH : SOpInst<"vcge", "udd", "hQh", OP_GE>; - def VCGTH : SOpInst<"vcgt", "udd", "hQh", OP_GT>; - let InstName = "vcge" in - def VCLEH : SOpInst<"vcle", "udd", "hQh", OP_LE>; - let InstName = "vcgt" in - def VCLTH : SOpInst<"vclt", "udd", "hQh", OP_LT>; - - // Vector conversion - let isVCVT_N = 1 in { - def VCVT_N_F16 : SInst<"vcvt_n_f16", "Hdi", "sUsQsQUs">; - def VCVT_N_S16 : SInst<"vcvt_n_s16", "xdi", "hQh">; - def VCVT_N_U16 : SInst<"vcvt_n_u16", "udi", "hQh">; - } - - // Max/Min - def VMAXH : SInst<"vmax", "ddd", "hQh">; - def VMINH : SInst<"vmin", "ddd", "hQh">; - def FMAXNMH : SInst<"vmaxnm", "ddd", "hQh">; - def FMINNMH : SInst<"vminnm", "ddd", "hQh">; - - // Multiplication/Division - def VMULH : SOpInst<"vmul", "ddd", "hQh", OP_MUL>; - def MULXH : SInst<"vmulx", "ddd", "hQh">; - def FDIVH : IOpInst<"vdiv", "ddd", "hQh", OP_DIV>; - - // Pairwise addition - def VPADDH : SInst<"vpadd", "ddd", "hQh">; - - // Pairwise Max/Min - def VPMAXH : SInst<"vpmax", "ddd", "hQh">; - def VPMINH : SInst<"vpmin", "ddd", "hQh">; - // Pairwise MaxNum/MinNum - def FMAXNMPH : SInst<"vpmaxnm", "ddd", "hQh">; - def FMINNMPH : SInst<"vpminnm", "ddd", "hQh">; - - // Reciprocal/Sqrt - def VRECPSH : SInst<"vrecps", "ddd", "hQh">; - def VRSQRTSH : SInst<"vrsqrts", "ddd", "hQh">; - - // ARMv8.2-A FP16 three-operands vector intrinsics. - - // Vector fused multiply-add operations - def VFMAH : SInst<"vfma", "dddd", "hQh">; - def VFMSH : SOpInst<"vfms", "dddd", "hQh", OP_FMLS>; - - // ARMv8.2-A FP16 lane vector intrinsics. 
- - // FMA lane - def VFMA_LANEH : IInst<"vfma_lane", "dddgi", "hQh">; - def VFMA_LANEQH : IInst<"vfma_laneq", "dddji", "hQh">; - - // FMA lane with scalar argument - def FMLA_NH : SOpInst<"vfma_n", "ddds", "hQh", OP_FMLA_N>; - // Scalar floating point fused multiply-add (scalar, by element) - def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "sssdi", "Sh">; - def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "sssji", "Sh">; - - // FMS lane - def VFMS_LANEH : IOpInst<"vfms_lane", "dddgi", "hQh", OP_FMS_LN>; - def VFMS_LANEQH : IOpInst<"vfms_laneq", "dddji", "hQh", OP_FMS_LNQ>; - // FMS lane with scalar argument - def FMLS_NH : SOpInst<"vfms_n", "ddds", "hQh", OP_FMLS_N>; - // Scalar floating foint fused multiply-subtract (scalar, by element) - def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "sssdi", "Sh", OP_FMS_LN>; - def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "sssji", "Sh", OP_FMS_LNQ>; - - // Mul lane - def VMUL_LANEH : IOpInst<"vmul_lane", "ddgi", "hQh", OP_MUL_LN>; - def VMUL_LANEQH : IOpInst<"vmul_laneq", "ddji", "hQh", OP_MUL_LN>; - def VMUL_NH : IOpInst<"vmul_n", "dds", "hQh", OP_MUL_N>; - // Scalar floating point multiply (scalar, by element) - def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>; - def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>; - - // Mulx lane - def VMULX_LANEH : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>; - def VMULX_LANEQH : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>; - def VMULX_NH : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>; - // TODO: Scalar floating point multiply extended (scalar, by element) - // Below ones are commented out because they need vmulx_f16(float16_t, float16_t) - // which will be implemented later with fp16 scalar intrinsic (arm_fp16.h) - //def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>; - //def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>; - - // ARMv8.2-A FP16 reduction vector intrinsics. - def VMAXVH : SInst<"vmaxv", "sd", "hQh">; - def VMINVH : SInst<"vminv", "sd", "hQh">; - def FMAXNMVH : SInst<"vmaxnmv", "sd", "hQh">; - def FMINNMVH : SInst<"vminnmv", "sd", "hQh">; - - // Data processing intrinsics - section 5 - - // Logical operations - let isHiddenLInst = 1 in - def VBSLH : SInst<"vbsl", "dudd", "hQh">; - - // Transposition operations - def VZIPH : WInst<"vzip", "2dd", "hQh">; - def VUZPH : WInst<"vuzp", "2dd", "hQh">; - def VTRNH : WInst<"vtrn", "2dd", "hQh">; - - // Set all lanes to same value. - /* Already implemented prior to ARMv8.2-A. 
- def VMOV_NH : WOpInst<"vmov_n", "ds", "hQh", OP_DUP>; - def VDUP_NH : WOpInst<"vdup_n", "ds", "hQh", OP_DUP>; - def VDUP_LANE1H : WOpInst<"vdup_lane", "dgi", "hQh", OP_DUP_LN>;*/ - - // Vector Extract - def VEXTH : WInst<"vext", "dddi", "hQh">; - - // Reverse vector elements - def VREV64H : WOpInst<"vrev64", "dd", "hQh", OP_REV64>; - - // Permutation - def VTRN1H : SOpInst<"vtrn1", "ddd", "hQh", OP_TRN1>; - def VZIP1H : SOpInst<"vzip1", "ddd", "hQh", OP_ZIP1>; - def VUZP1H : SOpInst<"vuzp1", "ddd", "hQh", OP_UZP1>; - def VTRN2H : SOpInst<"vtrn2", "ddd", "hQh", OP_TRN2>; - def VZIP2H : SOpInst<"vzip2", "ddd", "hQh", OP_ZIP2>; - def VUZP2H : SOpInst<"vuzp2", "ddd", "hQh", OP_UZP2>; - - def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "sdi", "Sh">; - def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">; -} diff --git a/include/clang/CodeGen/CodeGenABITypes.h b/include/clang/CodeGen/CodeGenABITypes.h index 8ba769dfc3af..615e55c8b69f 100644 --- a/include/clang/CodeGen/CodeGenABITypes.h +++ b/include/clang/CodeGen/CodeGenABITypes.h @@ -31,6 +31,8 @@ namespace llvm { class DataLayout; class Module; + class FunctionType; + class Type; } namespace clang { @@ -70,6 +72,12 @@ const CGFunctionInfo &arrangeFreeFunctionCall(CodeGenModule &CGM, FunctionType::ExtInfo info, RequiredArgs args); +// Returns null if the function type is incomplete and can't be lowered. +llvm::FunctionType *convertFreeFunctionType(CodeGenModule &CGM, + const FunctionDecl *FD); + +llvm::Type *convertTypeForMemory(CodeGenModule &CGM, QualType T); + } // end namespace CodeGen } // end namespace clang diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td index b65b984731f6..861dfbf1916e 100644 --- a/include/clang/Driver/Options.td +++ b/include/clang/Driver/Options.td @@ -194,6 +194,16 @@ def clang_ignored_f_Group : OptionGroup<"">, def clang_ignored_m_Group : OptionGroup<"">, Group, Flags<[Ignored]>; +// Group for clang options in the process of deprecation. +// Please include the version that deprecated the flag as comment to allow +// easier garbage collection. +def clang_ignored_legacy_options_Group : OptionGroup<"">, + Group, Flags<[Ignored]>; + +// Retired with clang-5.0 +def : Flag<["-"], "fslp-vectorize-aggressive">, Group; +def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group; + // Group that ignores all gcc optimizations that won't be implemented def clang_ignored_gcc_optimization_f_Group : OptionGroup< "">, Group, Flags<[Ignored]>; @@ -2017,6 +2027,10 @@ def mmsa : Flag<["-"], "mmsa">, Group, HelpText<"Enable MSA ASE (MIPS only)">; def mno_msa : Flag<["-"], "mno-msa">, Group, HelpText<"Disable MSA ASE (MIPS only)">; +def mmt : Flag<["-"], "mmt">, Group, + HelpText<"Enable MT ASE (MIPS only)">; +def mno_mt : Flag<["-"], "mno-mt">, Group, + HelpText<"Disable MT ASE (MIPS only)">; def mfp64 : Flag<["-"], "mfp64">, Group, HelpText<"Use 64-bit floating point registers (MIPS only)">; def mfp32 : Flag<["-"], "mfp32">, Group, diff --git a/include/clang/Driver/ToolChain.h b/include/clang/Driver/ToolChain.h index 105d0f338ac6..eb42f1260d92 100644 --- a/include/clang/Driver/ToolChain.h +++ b/include/clang/Driver/ToolChain.h @@ -411,7 +411,8 @@ class ToolChain { /// \brief Add options that need to be passed to cc1 for this target. 
virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const; /// \brief Add warning options that need to be passed to cc1 for this target. virtual void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const; diff --git a/include/clang/Format/Format.h b/include/clang/Format/Format.h index ee24c55fef61..99d54e55e828 100644 --- a/include/clang/Format/Format.h +++ b/include/clang/Format/Format.h @@ -1120,7 +1120,10 @@ struct FormatStyle { /// (https://developers.google.com/protocol-buffers/). LK_Proto, /// Should be used for TableGen code. - LK_TableGen + LK_TableGen, + /// Should be used for Protocol Buffer messages in text format + /// (https://developers.google.com/protocol-buffers/). + LK_TextProto }; bool isCpp() const { return Language == LK_Cpp || Language == LK_ObjC; } @@ -1750,6 +1753,8 @@ inline StringRef getLanguageName(FormatStyle::LanguageKind Language) { return "JavaScript"; case FormatStyle::LK_Proto: return "Proto"; + case FormatStyle::LK_TextProto: + return "TextProto"; default: return "Unknown"; } diff --git a/include/clang/Frontend/FrontendActions.h b/include/clang/Frontend/FrontendActions.h index 84db293c46f3..c45aeaa208c8 100644 --- a/include/clang/Frontend/FrontendActions.h +++ b/include/clang/Frontend/FrontendActions.h @@ -111,6 +111,8 @@ class GenerateModuleAction : public ASTFrontendAction { class GenerateModuleFromModuleMapAction : public GenerateModuleAction { private: + bool BeginSourceFileAction(CompilerInstance &CI) override; + std::unique_ptr CreateOutputFile(CompilerInstance &CI, StringRef InFile) override; }; diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h index 712e1ab9fbf5..62090d6496ed 100644 --- a/include/clang/Lex/Preprocessor.h +++ b/include/clang/Lex/Preprocessor.h @@ -1048,6 +1048,10 @@ class Preprocessor { /// which implicitly adds the builtin defines etc. void EnterMainSourceFile(); + /// \brief After parser warm-up, initialize the conditional stack from + /// the preamble. + void replayPreambleConditionalStack(); + /// \brief Inform the preprocessor callbacks that processing is complete. void EndSourceFile(); @@ -1733,11 +1737,6 @@ class Preprocessor { /// \brief Return true if we're in the top-level file, not in a \#include. bool isInPrimaryFile() const; - /// \brief Return true if we're in the main file (specifically, if we are 0 - /// (zero) levels deep \#include. This is used by the lexer to determine if - /// it needs to generate errors about unterminated \#if directives. - bool isInMainFile() const; - /// \brief Handle cases where the \#include name is expanded /// from a macro as multiple tokens, which need to be glued together. 
/// diff --git a/include/clang/Sema/DelayedDiagnostic.h b/include/clang/Sema/DelayedDiagnostic.h index b73ec0868f52..d65dbf0cd34e 100644 --- a/include/clang/Sema/DelayedDiagnostic.h +++ b/include/clang/Sema/DelayedDiagnostic.h @@ -124,7 +124,8 @@ class DelayedDiagnostic { static DelayedDiagnostic makeAvailability(AvailabilityResult AR, SourceLocation Loc, - const NamedDecl *D, + const NamedDecl *ReferringDecl, + const NamedDecl *OffendingDecl, const ObjCInterfaceDecl *UnknownObjCClass, const ObjCPropertyDecl *ObjCProperty, StringRef Msg, @@ -164,9 +165,13 @@ class DelayedDiagnostic { return *reinterpret_cast(AccessData); } - const NamedDecl *getAvailabilityDecl() const { + const NamedDecl *getAvailabilityReferringDecl() const { assert(Kind == Availability && "Not an availability diagnostic."); - return AvailabilityData.Decl; + return AvailabilityData.ReferringDecl; + } + + const NamedDecl *getAvailabilityOffendingDecl() const { + return AvailabilityData.OffendingDecl; } StringRef getAvailabilityMessage() const { @@ -213,7 +218,8 @@ class DelayedDiagnostic { private: struct AD { - const NamedDecl *Decl; + const NamedDecl *ReferringDecl; + const NamedDecl *OffendingDecl; const ObjCInterfaceDecl *UnknownObjCClass; const ObjCPropertyDecl *ObjCProperty; const char *Message; diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h index 95134d52f873..95629a2591cf 100644 --- a/include/clang/Sema/Sema.h +++ b/include/clang/Sema/Sema.h @@ -1266,6 +1266,7 @@ class Sema { void emitAndClearUnusedLocalTypedefWarnings(); + void ActOnStartOfTranslationUnit(); void ActOnEndOfTranslationUnit(); void CheckDelegatingCtorCycles(); @@ -1541,6 +1542,7 @@ class Sema { llvm::SmallVectorImpl *Modules); bool hasVisibleMergedDefinition(NamedDecl *Def); + bool hasMergedDefinitionInCurrentModule(NamedDecl *Def); /// Determine if \p D and \p Suggested have a structurally compatible /// layout as described in C11 6.2.7/1. @@ -3358,9 +3360,10 @@ class Sema { /// DefaultSynthesizeProperties - This routine default synthesizes all /// properties which must be synthesized in the class's \@implementation. 
- void DefaultSynthesizeProperties (Scope *S, ObjCImplDecl* IMPDecl, - ObjCInterfaceDecl *IDecl); - void DefaultSynthesizeProperties(Scope *S, Decl *D); + void DefaultSynthesizeProperties(Scope *S, ObjCImplDecl *IMPDecl, + ObjCInterfaceDecl *IDecl, + SourceLocation AtEnd); + void DefaultSynthesizeProperties(Scope *S, Decl *D, SourceLocation AtEnd); /// IvarBacksCurrentMethodAccessor - This routine returns 'true' if 'IV' is /// an ivar synthesized for 'Method' and 'Method' is a property accessor @@ -3878,7 +3881,9 @@ class Sema { void redelayDiagnostics(sema::DelayedDiagnosticPool &pool); - void EmitAvailabilityWarning(AvailabilityResult AR, NamedDecl *D, + void EmitAvailabilityWarning(AvailabilityResult AR, + const NamedDecl *ReferringDecl, + const NamedDecl *OffendingDecl, StringRef Message, SourceLocation Loc, const ObjCInterfaceDecl *UnknownObjCClass, const ObjCPropertyDecl *ObjCProperty, @@ -3895,8 +3900,9 @@ class Sema { bool CanUseDecl(NamedDecl *D, bool TreatUnavailableAsInvalid); bool DiagnoseUseOfDecl(NamedDecl *D, SourceLocation Loc, - const ObjCInterfaceDecl *UnknownObjCClass=nullptr, - bool ObjCPropertyAccess=false); + const ObjCInterfaceDecl *UnknownObjCClass = nullptr, + bool ObjCPropertyAccess = false, + bool AvoidPartialAvailabilityChecks = false); void NoteDeletedFunction(FunctionDecl *FD); void NoteDeletedInheritingConstructor(CXXConstructorDecl *CD); std::string getDeletedOrUnavailableSuffix(const FunctionDecl *FD); @@ -3938,7 +3944,7 @@ class Sema { void MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func, bool MightBeOdrUse = true); void MarkVariableReferenced(SourceLocation Loc, VarDecl *Var); - void MarkDeclRefReferenced(DeclRefExpr *E); + void MarkDeclRefReferenced(DeclRefExpr *E, const Expr *Base = nullptr); void MarkMemberReferenced(MemberExpr *E); void UpdateMarkingForLValueToRValue(Expr *E); @@ -10410,16 +10416,14 @@ class Sema { return OriginalLexicalContext ? OriginalLexicalContext : CurContext; } - /// \brief The diagnostic we should emit for \c D, or \c AR_Available. - /// - /// \param D The declaration to check. Note that this may be altered to point - /// to another declaration that \c D gets it's availability from. i.e., we - /// walk the list of typedefs to find an availability attribute. + /// The diagnostic we should emit for \c D, and the declaration that + /// originated it, or \c AR_Available. /// + /// \param D The declaration to check. /// \param Message If non-null, this will be populated with the message from /// the availability attribute that is selected. - AvailabilityResult ShouldDiagnoseAvailabilityOfDecl(NamedDecl *&D, - std::string *Message); + std::pair<AvailabilityResult, const NamedDecl *> + ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D, std::string *Message); const DeclContext *getCurObjCLexicalContext() const { const DeclContext *DC = getCurLexicalContext(); diff --git a/include/clang/Sema/TemplateDeduction.h b/include/clang/Sema/TemplateDeduction.h index d92cbab4fbcf..cd9ed6abfaf9 100644 --- a/include/clang/Sema/TemplateDeduction.h +++ b/include/clang/Sema/TemplateDeduction.h @@ -88,6 +88,12 @@ class TemplateDeductionInfo { HasSFINAEDiagnostic = false; } + /// Peek at the SFINAE diagnostic. + const PartialDiagnosticAt &peekSFINAEDiagnostic() const { + assert(HasSFINAEDiagnostic); + return SuppressedDiagnostics.front(); + } + /// \brief Provide a new template argument list that contains the /// results of template argument deduction.
void reset(TemplateArgumentList *NewDeduced) { diff --git a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index 4fb50deb0f6b..5dd6bdf38496 100644 --- a/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -205,9 +205,15 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> { /// Controls which C++ member functions will be considered for inlining. CXXInlineableMemberKind CXXMemberInliningMode; + /// \sa includeImplicitDtorsInCFG + Optional<bool> IncludeImplicitDtorsInCFG; + /// \sa includeTemporaryDtorsInCFG Optional<bool> IncludeTemporaryDtorsInCFG; - + + /// \sa includeLifetimeInCFG + Optional<bool> IncludeLifetimeInCFG; + /// \sa mayInlineCXXStandardLibrary Optional<bool> InlineCXXStandardLibrary; @@ -395,6 +401,20 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> { /// accepts the values "true" and "false". bool includeTemporaryDtorsInCFG(); + /// Returns whether or not implicit destructors for C++ objects should + /// be included in the CFG. + /// + /// This is controlled by the 'cfg-implicit-dtors' config option, which + /// accepts the values "true" and "false". bool includeImplicitDtorsInCFG(); + + /// Returns whether or not end-of-lifetime information should be included in + /// the CFG. + /// + /// This is controlled by the 'cfg-lifetime' config option, which accepts + /// the values "true" and "false". bool includeLifetimeInCFG(); + /// Returns whether or not C++ standard library functions may be considered /// for inlining. /// diff --git a/include/clang/StaticAnalyzer/Core/CheckerManager.h b/include/clang/StaticAnalyzer/Core/CheckerManager.h index 52ed260346bf..88cb08a4b647 100644 --- a/include/clang/StaticAnalyzer/Core/CheckerManager.h +++ b/include/clang/StaticAnalyzer/Core/CheckerManager.h @@ -286,7 +286,7 @@ class CheckerManager { void runCheckersForEndAnalysis(ExplodedGraph &G, BugReporter &BR, ExprEngine &Eng); - /// \brief Run checkers on begining of function. + /// \brief Run checkers on beginning of function. void runCheckersForBeginFunction(ExplodedNodeSet &Dst, const BlockEdge &L, ExplodedNode *Pred, diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h b/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h index b8ec2aa6ae8d..4aa87443e4c2 100644 --- a/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h +++ b/include/clang/StaticAnalyzer/Core/PathSensitive/BasicValueFactory.h @@ -124,7 +124,7 @@ class BasicValueFactory { /// Returns the type of the APSInt used to store values of the given QualType. APSIntType getAPSIntType(QualType T) const { assert(T->isIntegralOrEnumerationType() || Loc::isLocType(T)); - return APSIntType(Ctx.getTypeSize(T), + return APSIntType(Ctx.getIntWidth(T), !T->isSignedIntegerOrEnumerationType()); } diff --git a/include/clang/Tooling/ArgumentsAdjusters.h b/include/clang/Tooling/ArgumentsAdjusters.h index 1fd7be688761..4eb02251a775 100644 --- a/include/clang/Tooling/ArgumentsAdjusters.h +++ b/include/clang/Tooling/ArgumentsAdjusters.h @@ -44,6 +44,10 @@ ArgumentsAdjuster getClangSyntaxOnlyAdjuster(); /// arguments. ArgumentsAdjuster getClangStripOutputAdjuster(); +/// \brief Gets an argument adjuster which removes dependency-file +/// related command line arguments.
+ArgumentsAdjuster getClangStripDependencyFileAdjuster(); + enum class ArgumentInsertPosition { BEGIN, END }; /// \brief Gets an argument adjuster which inserts \p Extra arguments in the diff --git a/include/clang/Tooling/Core/Diagnostic.h b/include/clang/Tooling/Core/Diagnostic.h index d657f16df183..b4920d4fe456 100644 --- a/include/clang/Tooling/Core/Diagnostic.h +++ b/include/clang/Tooling/Core/Diagnostic.h @@ -58,9 +58,9 @@ struct Diagnostic { Diagnostic(llvm::StringRef DiagnosticName, Level DiagLevel, StringRef BuildDirectory); - Diagnostic(llvm::StringRef DiagnosticName, DiagnosticMessage &Message, - llvm::StringMap<Replacements> &Fix, - SmallVector<DiagnosticMessage, 1> &Notes, Level DiagLevel, + Diagnostic(llvm::StringRef DiagnosticName, const DiagnosticMessage &Message, + const llvm::StringMap<Replacements> &Fix, + const SmallVector<DiagnosticMessage, 1> &Notes, Level DiagLevel, llvm::StringRef BuildDirectory); /// \brief Name identifying the Diagnostic. diff --git a/include/clang/Tooling/Tooling.h b/include/clang/Tooling/Tooling.h index 1c974f998852..6f9bc9e1a150 100644 --- a/include/clang/Tooling/Tooling.h +++ b/include/clang/Tooling/Tooling.h @@ -202,12 +202,15 @@ buildASTFromCode(const Twine &Code, const Twine &FileName = "input.cc", /// \param PCHContainerOps The PCHContainerOperations for loading and creating /// clang modules. /// +/// \param Adjuster A function to filter the command line arguments as specified. +/// /// \return The resulting AST or null if an error occurred. std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs( const Twine &Code, const std::vector<std::string> &Args, const Twine &FileName = "input.cc", const Twine &ToolName = "clang-tool", std::shared_ptr<PCHContainerOperations> PCHContainerOps = - std::make_shared<PCHContainerOperations>()); + std::make_shared<PCHContainerOperations>(), + ArgumentsAdjuster Adjuster = getClangStripDependencyFileAdjuster()); /// \brief Utility to run a FrontendAction in a single clang invocation. class ToolInvocation { diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp index a2ff176df11f..fd9723298fca 100644 --- a/lib/AST/ASTContext.cpp +++ b/lib/AST/ASTContext.cpp @@ -8523,7 +8523,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, RequiresICE = false; // Read the prefixed modifiers first. - bool Done = false, IsSpecialLong = false; + bool Done = false; + #ifndef NDEBUG + bool IsSpecialLong = false; + #endif while (!Done) { switch (*Str++) { default: Done = true; --Str; break; @@ -8549,7 +8552,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // 'N' behaves like 'L' for all non LP64 targets and 'int' otherwise. assert(!IsSpecialLong && "Can't use two 'N' or 'W' modifiers!"); assert(HowLong == 0 && "Can't use both 'L' and 'N' modifiers!"); + #ifndef NDEBUG IsSpecialLong = true; + #endif if (Context.getTargetInfo().getLongWidth() == 32) ++HowLong; break; @@ -8558,7 +8563,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, // This modifier represents int64 type.
assert(!IsSpecialLong && "Can't use two 'N' or 'W' modifiers!"); assert(HowLong == 0 && "Can't use both 'L' and 'W' modifiers!"); + #ifndef NDEBUG IsSpecialLong = true; + #endif switch (Context.getTargetInfo().getInt64Type()) { default: llvm_unreachable("Unexpected integer type"); diff --git a/lib/AST/Decl.cpp b/lib/AST/Decl.cpp index 267c6992af89..573a98efe980 100644 --- a/lib/AST/Decl.cpp +++ b/lib/AST/Decl.cpp @@ -573,6 +573,44 @@ static bool isSingleLineLanguageLinkage(const Decl &D) { return false; } +static bool isExportedFromModuleIntefaceUnit(const NamedDecl *D) { + switch (D->getModuleOwnershipKind()) { + case Decl::ModuleOwnershipKind::Unowned: + case Decl::ModuleOwnershipKind::ModulePrivate: + return false; + case Decl::ModuleOwnershipKind::Visible: + case Decl::ModuleOwnershipKind::VisibleWhenImported: + if (auto *M = D->getOwningModule()) + return M->Kind == Module::ModuleInterfaceUnit; } + llvm_unreachable("unexpected module ownership kind"); +} + +static LinkageInfo getInternalLinkageFor(const NamedDecl *D) { + // Internal linkage declarations within a module interface unit are modeled + // as "module-internal linkage", which means that they have internal linkage + // formally but can be indirectly accessed from outside the module via inline + // functions and templates defined within the module. + if (auto *M = D->getOwningModule()) + if (M->Kind == Module::ModuleInterfaceUnit) + return LinkageInfo(ModuleInternalLinkage, DefaultVisibility, false); + + return LinkageInfo::internal(); +} + +static LinkageInfo getExternalLinkageFor(const NamedDecl *D) { + // C++ Modules TS [basic.link]/6.8: + // - A name declared at namespace scope that does not have internal linkage + // by the previous rules and that is introduced by a non-exported + // declaration has module linkage. + if (auto *M = D->getOwningModule()) + if (M->Kind == Module::ModuleInterfaceUnit) + if (!isExportedFromModuleIntefaceUnit(D)) + return LinkageInfo(ModuleLinkage, DefaultVisibility, false); + + return LinkageInfo::external(); +} + static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, LVComputationKind computation) { assert(D->getDeclContext()->getRedeclContext()->isFileContext() && @@ -588,16 +626,18 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, if (const auto *Var = dyn_cast<VarDecl>(D)) { // Explicitly declared static. if (Var->getStorageClass() == SC_Static) - return LinkageInfo::internal(); + return getInternalLinkageFor(Var); // - a non-inline, non-volatile object or reference that is explicitly // declared const or constexpr and neither explicitly declared extern // nor previously declared to have external linkage; or (there is no // equivalent in C99) + // The C++ modules TS adds "non-exported" to this list.
if (Context.getLangOpts().CPlusPlus && Var->getType().isConstQualified() && !Var->getType().isVolatileQualified() && - !Var->isInline()) { + !Var->isInline() && + !isExportedFromModuleIntefaceUnit(Var)) { const VarDecl *PrevVar = Var->getPreviousDecl(); if (PrevVar) return getLVForDecl(PrevVar, computation); @@ -605,7 +645,7 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, if (Var->getStorageClass() != SC_Extern && Var->getStorageClass() != SC_PrivateExtern && !isSingleLineLanguageLinkage(*Var)) - return LinkageInfo::internal(); + return getInternalLinkageFor(Var); } for (const VarDecl *PrevVar = Var->getPreviousDecl(); PrevVar; @@ -615,7 +655,7 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, return PrevVar->getLinkageAndVisibility(); // Explicitly declared static. if (PrevVar->getStorageClass() == SC_Static) - return LinkageInfo::internal(); + return getInternalLinkageFor(Var); } } else if (const FunctionDecl *Function = D->getAsFunction()) { // C++ [temp]p4: @@ -624,7 +664,7 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, // Explicitly declared static. if (Function->getCanonicalDecl()->getStorageClass() == SC_Static) - return LinkageInfo(InternalLinkage, DefaultVisibility, false); + return getInternalLinkageFor(Function); } else if (const auto *IFD = dyn_cast<IndirectFieldDecl>(D)) { // - a data member of an anonymous union. const VarDecl *VD = IFD->getVarDecl(); @@ -637,7 +677,12 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, const auto *Var = dyn_cast<VarDecl>(D); const auto *Func = dyn_cast<FunctionDecl>(D); // FIXME: In C++11 onwards, anonymous namespaces should give decls - // within them internal linkage, not unique external linkage. + // within them (including those inside extern "C" contexts) internal + // linkage, not unique external linkage: + // + // C++11 [basic.link]p4: + // An unnamed namespace or a namespace declared directly or indirectly + // within an unnamed namespace has internal linkage. if ((!Var || !isFirstInExternCContext(Var)) && (!Func || !isFirstInExternCContext(Func))) return LinkageInfo::uniqueExternal(); @@ -718,7 +763,8 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, // because of this, but unique-external linkage suits us. if (Context.getLangOpts().CPlusPlus && !isFirstInExternCContext(Var)) { LinkageInfo TypeLV = getLVForType(*Var->getType(), computation); - if (TypeLV.getLinkage() != ExternalLinkage) + if (TypeLV.getLinkage() != ExternalLinkage && + TypeLV.getLinkage() != ModuleLinkage) return LinkageInfo::uniqueExternal(); if (!LV.isVisibilityExplicit()) LV.mergeVisibility(TypeLV); @@ -816,7 +862,9 @@ static LinkageInfo getLVForNamespaceScopeDecl(const NamedDecl *D, // - a namespace (7.3), unless it is declared within an unnamed // namespace. - } else if (isa<NamespaceDecl>(D) && !D->isInAnonymousNamespace()) { + // + // We handled names in anonymous namespaces above. + } else if (isa<NamespaceDecl>(D)) { return LV; // By extension, we assign external linkage to Objective-C @@ -1125,6 +1173,8 @@ static LinkageInfo getLVForClosure(const DeclContext *DC, Decl *ContextDecl, if (const auto *ND = dyn_cast<NamedDecl>(DC)) return getLVForDecl(ND, computation); + // FIXME: We have a closure at TU scope with no context declaration. This + // should probably have no linkage. return LinkageInfo::external(); } @@ -1137,7 +1187,7 @@ static LinkageInfo getLVForLocalDecl(const NamedDecl *D, // This is a "void f();" which got merged with a file static.
if (Function->getCanonicalDecl()->getStorageClass() == SC_Static) - return LinkageInfo::internal(); + return getInternalLinkageFor(Function); LinkageInfo LV; if (!hasExplicitVisibilityAlready(computation)) { @@ -1226,7 +1276,7 @@ static LinkageInfo computeLVForDecl(const NamedDecl *D, LVComputationKind computation) { // Internal_linkage attribute overrides other considerations. if (D->hasAttr<InternalLinkageAttr>()) - return LinkageInfo::internal(); + return getInternalLinkageFor(D); // Objective-C: treat all Objective-C declarations as having external // linkage. @@ -1259,8 +1309,7 @@ static LinkageInfo computeLVForDecl(const NamedDecl *D, case Decl::TypeAlias: // A typedef declaration has linkage if it gives a type a name for // linkage purposes. - if (!D->getASTContext().getLangOpts().CPlusPlus || - !cast<TypedefNameDecl>(D) + if (!cast<TypedefNameDecl>(D) ->getAnonDeclWithTypedefName(/*AnyRedecl*/true)) return LinkageInfo::none(); break; @@ -1276,14 +1325,14 @@ static LinkageInfo computeLVForDecl(const NamedDecl *D, case Decl::ObjCProperty: case Decl::ObjCPropertyImpl: case Decl::ObjCProtocol: - return LinkageInfo::external(); + return getExternalLinkageFor(D); case Decl::CXXRecord: { const auto *Record = cast<CXXRecordDecl>(D); if (Record->isLambda()) { if (!Record->getLambdaManglingNumber()) { // This lambda has no mangling number, so it's internal. - return LinkageInfo::internal(); + return getInternalLinkageFor(D); } // This lambda has its linkage/visibility determined: @@ -1299,7 +1348,7 @@ static LinkageInfo computeLVForDecl(const NamedDecl *D, const CXXRecordDecl *OuterMostLambda = getOutermostEnclosingLambda(Record); if (!OuterMostLambda->getLambdaManglingNumber()) - return LinkageInfo::internal(); + return getInternalLinkageFor(D); return getLVForClosure( OuterMostLambda->getDeclContext()->getRedeclContext(), @@ -1350,7 +1399,7 @@ class LinkageComputer { LVComputationKind computation) { // Internal_linkage attribute overrides other considerations. if (D->hasAttr<InternalLinkageAttr>()) - return LinkageInfo::internal(); + return getInternalLinkageFor(D); if (computation == LVForLinkageOnly && D->hasCachedLinkage()) return LinkageInfo(D->getCachedLinkage(), DefaultVisibility, false); diff --git a/lib/AST/DeclBase.cpp b/lib/AST/DeclBase.cpp index a0594a020362..cd2c83a02f59 100644 --- a/lib/AST/DeclBase.cpp +++ b/lib/AST/DeclBase.cpp @@ -283,8 +283,10 @@ void Decl::setLexicalDeclContext(DeclContext *DC) { setLocalOwningModule(cast<Decl>(DC)->getOwningModule()); } - assert((!hasOwningModule() || getOwningModule()) && - "hidden declaration has no owning module"); + assert( + (getModuleOwnershipKind() != ModuleOwnershipKind::VisibleWhenImported || + getOwningModule()) && + "hidden declaration has no owning module"); } void Decl::setDeclContextsImpl(DeclContext *SemaDC, DeclContext *LexicalDC, diff --git a/lib/AST/DeclCXX.cpp b/lib/AST/DeclCXX.cpp index 07d128ba555b..5cab48882251 100644 --- a/lib/AST/DeclCXX.cpp +++ b/lib/AST/DeclCXX.cpp @@ -1605,6 +1605,84 @@ CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, unsigned ID) { SC_None, false, false, SourceLocation()); } +CXXMethodDecl *CXXMethodDecl::getDevirtualizedMethod(const Expr *Base, + bool IsAppleKext) { + assert(isVirtual() && "this method is expected to be virtual"); + + // When building with -fapple-kext, all calls must go through the vtable since + // the kernel linker can do runtime patching of vtables. + if (IsAppleKext) + return nullptr; + + // If the member function is marked 'final', we know that it can't be + // overridden and can therefore devirtualize it unless it's pure virtual.
+ if (hasAttr<FinalAttr>()) + return isPure() ? nullptr : this; + + // If Base is unknown, we cannot devirtualize. + if (!Base) + return nullptr; + + // If the base expression (after skipping derived-to-base conversions) is a + // class prvalue, then we can devirtualize. + Base = Base->getBestDynamicClassTypeExpr(); + if (Base->isRValue() && Base->getType()->isRecordType()) + return this; + + // If we don't even know what we would call, we can't devirtualize. + const CXXRecordDecl *BestDynamicDecl = Base->getBestDynamicClassType(); + if (!BestDynamicDecl) + return nullptr; + + // There may be a method corresponding to MD in a derived class. + CXXMethodDecl *DevirtualizedMethod = + getCorrespondingMethodInClass(BestDynamicDecl); + + // If that method is pure virtual, we can't devirtualize. If this code is + // reached, the result would be UB, not a direct call to the derived class + // function, and we can't assume the derived class function is defined. + if (DevirtualizedMethod->isPure()) + return nullptr; + + // If that method is marked final, we can devirtualize it. + if (DevirtualizedMethod->hasAttr<FinalAttr>()) + return DevirtualizedMethod; + + // Similarly, if the class itself is marked 'final' it can't be overridden + // and we can therefore devirtualize the member function call. + if (BestDynamicDecl->hasAttr<FinalAttr>()) + return DevirtualizedMethod; + + if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Base)) { + if (const VarDecl *VD = dyn_cast<VarDecl>(DRE->getDecl())) + if (VD->getType()->isRecordType()) + // This is a record decl. We know the type and can devirtualize it. + return DevirtualizedMethod; + + return nullptr; + } + + // We can devirtualize calls on an object accessed by a class member access + // expression, since by C++11 [basic.life]p6 we know that it can't refer to + // a derived class object constructed in the same location. + if (const MemberExpr *ME = dyn_cast<MemberExpr>(Base)) + if (const ValueDecl *VD = dyn_cast<ValueDecl>(ME->getMemberDecl())) + return VD->getType()->isRecordType() ? DevirtualizedMethod : nullptr; + + // Likewise for calls on an object accessed by a (non-reference) pointer to + // member access. + if (auto *BO = dyn_cast<BinaryOperator>(Base)) { + if (BO->isPtrMemOp()) { + auto *MPT = BO->getRHS()->getType()->castAs<MemberPointerType>(); + if (MPT->getPointeeType()->isRecordType()) + return DevirtualizedMethod; + } + } + + // We can't devirtualize the call. + return nullptr; +} + bool CXXMethodDecl::isUsualDeallocationFunction() const { if (getOverloadedOperator() != OO_Delete && getOverloadedOperator() != OO_Array_Delete) diff --git a/lib/AST/ExprConstant.cpp b/lib/AST/ExprConstant.cpp index e836135cf2f9..0c0c861e5d56 100644 --- a/lib/AST/ExprConstant.cpp +++ b/lib/AST/ExprConstant.cpp @@ -1665,6 +1665,19 @@ static bool CheckLValueConstantExpression(EvalInfo &Info, SourceLocation Loc, return true; } +/// Member pointers are constant expressions unless they point to a +/// non-virtual dllimport member function. +static bool CheckMemberPointerConstantExpression(EvalInfo &Info, + SourceLocation Loc, + QualType Type, + const APValue &Value) { + const ValueDecl *Member = Value.getMemberPointerDecl(); + const auto *FD = dyn_cast_or_null<CXXMethodDecl>(Member); + if (!FD) + return true; + return FD->isVirtual() || !FD->hasAttr<DLLImportAttr>(); +} + /// Check that this core constant expression is of literal type, and if not, /// produce an appropriate diagnostic.
static bool CheckLiteralType(EvalInfo &Info, const Expr *E, @@ -1757,6 +1770,9 @@ static bool CheckConstantExpression(EvalInfo &Info, SourceLocation DiagLoc, return CheckLValueConstantExpression(Info, DiagLoc, Type, LVal); } + if (Value.isMemberPointer()) + return CheckMemberPointerConstantExpression(Info, DiagLoc, Type, Value); + // Everything else is fine. return true; } @@ -9508,7 +9524,7 @@ bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { case BO_Mul: if (Result.isComplexFloat()) { // This is an implementation of complex multiplication according to the - // constraints laid out in C11 Annex G. The implemantion uses the + // constraints laid out in C11 Annex G. The implementation uses the // following naming scheme: // (a + ib) * (c + id) ComplexValue LHS = Result; @@ -9589,7 +9605,7 @@ bool ComplexExprEvaluator::VisitBinaryOperator(const BinaryOperator *E) { case BO_Div: if (Result.isComplexFloat()) { // This is an implementation of complex division according to the - // constraints laid out in C11 Annex G. The implemantion uses the + // constraints laid out in C11 Annex G. The implementation uses the // following naming scheme: // (a + ib) / (c + id) ComplexValue LHS = Result; diff --git a/lib/AST/ExternalASTMerger.cpp b/lib/AST/ExternalASTMerger.cpp index b746edaf6439..4f4a99794c5b 100644 --- a/lib/AST/ExternalASTMerger.cpp +++ b/lib/AST/ExternalASTMerger.cpp @@ -180,8 +180,3 @@ void ExternalASTMerger::FindExternalLexicalDecls( }); } -void ExternalASTMerger::CompleteType(TagDecl *Tag) { - SmallVector Result; - FindExternalLexicalDecls(Tag, [](Decl::Kind) { return true; }, Result); - Tag->setHasExternalLexicalStorage(false); -} diff --git a/lib/AST/ODRHash.cpp b/lib/AST/ODRHash.cpp index 3f66e58eb868..66b9940b8b08 100644 --- a/lib/AST/ODRHash.cpp +++ b/lib/AST/ODRHash.cpp @@ -228,6 +228,13 @@ class ODRDeclVisitor : public ConstDeclVisitor<ODRDeclVisitor> { Hash.AddQualType(T); } + void AddDecl(const Decl *D) { + Hash.AddBoolean(D); + if (D) { + Hash.AddDecl(D); + } + } + void Visit(const Decl *D) { ID.AddInteger(D->getKind()); Inherited::Visit(D); @@ -321,6 +328,16 @@ class ODRDeclVisitor : public ConstDeclVisitor<ODRDeclVisitor> { void VisitTypeAliasDecl(const TypeAliasDecl *D) { Inherited::VisitTypeAliasDecl(D); } + + void VisitFriendDecl(const FriendDecl *D) { + TypeSourceInfo *TSI = D->getFriendType(); + Hash.AddBoolean(TSI); + if (TSI) { + AddQualType(TSI->getType()); + } else { + AddDecl(D->getFriendDecl()); + } + } }; // Only allow a small portion of Decl's to be processed.
Remove this once @@ -335,6 +352,7 @@ bool ODRHash::isWhitelistedDecl(const Decl *D, const CXXRecordDecl *Parent) { case Decl::AccessSpec: case Decl::CXXMethod: case Decl::Field: + case Decl::Friend: case Decl::StaticAssert: case Decl::TypeAlias: case Decl::Typedef: diff --git a/lib/Analysis/AnalysisDeclContext.cpp b/lib/Analysis/AnalysisDeclContext.cpp index 7c0f5543da04..ec15f34fb231 100644 --- a/lib/Analysis/AnalysisDeclContext.cpp +++ b/lib/Analysis/AnalysisDeclContext.cpp @@ -67,6 +67,7 @@ AnalysisDeclContextManager::AnalysisDeclContextManager(bool useUnoptimizedCFG, bool addImplicitDtors, bool addInitializers, bool addTemporaryDtors, + bool addLifetime, bool synthesizeBodies, bool addStaticInitBranch, bool addCXXNewAllocator, @@ -77,6 +78,7 @@ AnalysisDeclContextManager::AnalysisDeclContextManager(bool useUnoptimizedCFG, cfgBuildOptions.AddImplicitDtors = addImplicitDtors; cfgBuildOptions.AddInitializers = addInitializers; cfgBuildOptions.AddTemporaryDtors = addTemporaryDtors; + cfgBuildOptions.AddLifetime = addLifetime; cfgBuildOptions.AddStaticInitBranches = addStaticInitBranch; cfgBuildOptions.AddCXXNewAllocator = addCXXNewAllocator; } diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp index 2a2b3d73b5ca..6a77455edeef 100644 --- a/lib/Analysis/CFG.cpp +++ b/lib/Analysis/CFG.cpp @@ -233,6 +233,7 @@ class LocalScope { } int distance(const_iterator L); + const_iterator shared_parent(const_iterator L); }; friend class const_iterator; @@ -275,6 +276,30 @@ int LocalScope::const_iterator::distance(LocalScope::const_iterator L) { return D; } +/// Calculates the closest parent of this iterator +/// that is in a scope reachable through the parents of L. +/// I.e., when using 'goto' from this to L, the lifetimes of all variables +/// between this and shared_parent(L) end. +LocalScope::const_iterator +LocalScope::const_iterator::shared_parent(LocalScope::const_iterator L) { + llvm::SmallPtrSet ScopesOfL; + while (true) { + ScopesOfL.insert(L.Scope); + if (L == const_iterator()) + break; + L = L.Scope->Prev; + } + + const_iterator F = *this; + while (true) { + if (ScopesOfL.count(F.Scope)) + return F; + assert(F != const_iterator() && + "L iterator is not reachable from F iterator."); + F = F.Scope->Prev; + } +} + /// Structure for specifying position in CFG during its build process. It /// consists of CFGBlock that specifies position in CFG and /// LocalScope::const_iterator that specifies position in LocalScope graph. @@ -579,6 +604,10 @@ class CFGBuilder { CFGBlock *addInitializer(CXXCtorInitializer *I); void addAutomaticObjDtors(LocalScope::const_iterator B, LocalScope::const_iterator E, Stmt *S); + void addLifetimeEnds(LocalScope::const_iterator B, + LocalScope::const_iterator E, Stmt *S); + void addAutomaticObjHandling(LocalScope::const_iterator B, + LocalScope::const_iterator E, Stmt *S); void addImplicitDtorsForDestructor(const CXXDestructorDecl *DD); // Local scopes creation.
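For readers skimming the patch, the parent-chain walk that the new LocalScope::const_iterator::shared_parent performs is easy to restate outside clang. The following is a minimal standalone sketch under stated assumptions: Scope and sharedParent are illustrative stand-ins, not clang's types.

#include <set>

// Illustrative stand-in for clang's LocalScope chain: each scope only knows
// its enclosing (parent) scope; a null pointer terminates the chain.
struct Scope { const Scope *Prev = nullptr; };

// Mirrors shared_parent(): collect every ancestor of L (including L itself
// and the terminating null "outermost" scope), then walk up from F until one
// of them is hit. On a goto from F's position to L's, the lifetime of every
// variable between F and the returned scope ends.
const Scope *sharedParent(const Scope *F, const Scope *L) {
  std::set<const Scope *> AncestorsOfL;
  for (const Scope *S = L; ; S = S->Prev) {
    AncestorsOfL.insert(S); // null is inserted too, acting as a common root
    if (!S)
      break;
  }
  for (const Scope *S = F; ; S = S->Prev)
    if (AncestorsOfL.count(S))
      return S; // may be null: no scope is shared, everything in F is left
}

With F nested two blocks deep and L a sibling chain under a common scope P, sharedParent(F, L) returns P, so exactly the variables of the blocks being exited are reported as ending their lifetime.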
@@ -619,6 +648,10 @@ class CFGBuilder { B->appendAutomaticObjDtor(VD, S, cfg->getBumpVectorContext()); } + void appendLifetimeEnds(CFGBlock *B, VarDecl *VD, Stmt *S) { + B->appendLifetimeEnds(VD, S, cfg->getBumpVectorContext()); + } + void appendDeleteDtor(CFGBlock *B, CXXRecordDecl *RD, CXXDeleteExpr *DE) { B->appendDeleteDtor(RD, DE, cfg->getBumpVectorContext()); } @@ -626,6 +659,10 @@ class CFGBuilder { void prependAutomaticObjDtorsWithTerminator(CFGBlock *Blk, LocalScope::const_iterator B, LocalScope::const_iterator E); + void prependAutomaticObjLifetimeWithTerminator(CFGBlock *Blk, + LocalScope::const_iterator B, + LocalScope::const_iterator E); + void addSuccessor(CFGBlock *B, CFGBlock *S, bool IsReachable = true) { B->addSuccessor(CFGBlock::AdjacentBlock(S, IsReachable), cfg->getBumpVectorContext()); @@ -957,7 +994,8 @@ class CFGBuilder { return TryResult(); } - + + bool hasTrivialDestructor(VarDecl *VD); }; inline bool AddStmtChoice::alwaysAdd(CFGBuilder &builder, @@ -1031,6 +1069,9 @@ std::unique_ptr<CFG> CFGBuilder::buildCFG(const Decl *D, Stmt *Statement) { assert(Succ == &cfg->getExit()); Block = nullptr; // the EXIT block is empty. Create all other blocks lazily. + assert(!(BuildOpts.AddImplicitDtors && BuildOpts.AddLifetime) && + "AddImplicitDtors and AddLifetime cannot be used at the same time"); + if (BuildOpts.AddImplicitDtors) if (const CXXDestructorDecl *DD = dyn_cast_or_null<CXXDestructorDecl>(D)) addImplicitDtorsForDestructor(DD); @@ -1067,6 +1108,8 @@ std::unique_ptr<CFG> CFGBuilder::buildCFG(const Decl *D, Stmt *Statement) { if (LI == LabelMap.end()) continue; JumpTarget JT = LI->second; + prependAutomaticObjLifetimeWithTerminator(B, I->scopePosition, + JT.scopePosition); prependAutomaticObjDtorsWithTerminator(B, I->scopePosition, JT.scopePosition); addSuccessor(B, JT.block); @@ -1209,7 +1252,61 @@ static QualType getReferenceInitTemporaryType(ASTContext &Context, return Init->getType(); } - +void CFGBuilder::addAutomaticObjHandling(LocalScope::const_iterator B, + LocalScope::const_iterator E, + Stmt *S) { + if (BuildOpts.AddImplicitDtors) + addAutomaticObjDtors(B, E, S); + if (BuildOpts.AddLifetime) + addLifetimeEnds(B, E, S); +} + +/// Add to current block automatic objects that leave the scope. +void CFGBuilder::addLifetimeEnds(LocalScope::const_iterator B, + LocalScope::const_iterator E, Stmt *S) { + if (!BuildOpts.AddLifetime) + return; + + if (B == E) + return; + + // To go from B to E, one first goes up the scopes from B to P + // then sideways in one scope from P to P' and then down + // the scopes from P' to E. + // The lifetimes of all objects between B and P end.
+ LocalScope::const_iterator P = B.shared_parent(E); + int dist = B.distance(P); + if (dist <= 0) + return; + + // We need to perform the scope leaving in reverse order + SmallVector<VarDecl *, 10> DeclsTrivial; + SmallVector<VarDecl *, 10> DeclsNonTrivial; + DeclsTrivial.reserve(dist); + DeclsNonTrivial.reserve(dist); + + for (LocalScope::const_iterator I = B; I != P; ++I) + if (hasTrivialDestructor(*I)) + DeclsTrivial.push_back(*I); + else + DeclsNonTrivial.push_back(*I); + + autoCreateBlock(); + // Objects with trivial destructors end their lifetime last (when storage + // duration ends) + for (SmallVectorImpl<VarDecl *>::reverse_iterator I = DeclsTrivial.rbegin(), + E = DeclsTrivial.rend(); + I != E; ++I) + appendLifetimeEnds(Block, *I, S); + + for (SmallVectorImpl<VarDecl *>::reverse_iterator + I = DeclsNonTrivial.rbegin(), + E = DeclsNonTrivial.rend(); + I != E; ++I) + appendLifetimeEnds(Block, *I, S); +} + /// addAutomaticObjDtors - Add to current block automatic objects destructors /// for objects in range of local scope positions. Use S as trigger statement /// for destructors. @@ -1309,7 +1406,7 @@ LocalScope* CFGBuilder::createOrReuseLocalScope(LocalScope* Scope) { /// addLocalScopeForStmt - Add LocalScope to local scopes tree for statement /// that should create implicit scope (e.g. if/else substatements). void CFGBuilder::addLocalScopeForStmt(Stmt *S) { - if (!BuildOpts.AddImplicitDtors) + if (!BuildOpts.AddImplicitDtors && !BuildOpts.AddLifetime) return; LocalScope *Scope = nullptr; @@ -1334,7 +1431,7 @@ void CFGBuilder::addLocalScopeForStmt(Stmt *S) { /// reuse Scope if not NULL. LocalScope* CFGBuilder::addLocalScopeForDeclStmt(DeclStmt *DS, LocalScope* Scope) { - if (!BuildOpts.AddImplicitDtors) + if (!BuildOpts.AddImplicitDtors && !BuildOpts.AddLifetime) return Scope; for (auto *DI : DS->decls()) @@ -1343,12 +1440,50 @@ LocalScope* CFGBuilder::addLocalScopeForDeclStmt(DeclStmt *DS, return Scope; } +bool CFGBuilder::hasTrivialDestructor(VarDecl *VD) { + // Check for const references bound to temporary. Set type to pointee. + QualType QT = VD->getType(); + if (QT.getTypePtr()->isReferenceType()) { + // Attempt to determine whether this declaration lifetime-extends a + // temporary. + // + // FIXME: This is incorrect. Non-reference declarations can lifetime-extend + // temporaries, and a single declaration can extend multiple temporaries. + // We should look at the storage duration on each nested + // MaterializeTemporaryExpr instead. + + const Expr *Init = VD->getInit(); + if (!Init) + return true; + + // Lifetime-extending a temporary. + bool FoundMTE = false; + QT = getReferenceInitTemporaryType(*Context, Init, &FoundMTE); + if (!FoundMTE) + return true; + } + + // Check for constant size array. Set type to array element type. + while (const ConstantArrayType *AT = Context->getAsConstantArrayType(QT)) { + if (AT->getSize() == 0) + return true; + QT = AT->getElementType(); + } + + // Check if type is a C++ class with non-trivial destructor. + if (const CXXRecordDecl *CD = QT->getAsCXXRecordDecl()) + return !CD->hasDefinition() || CD->hasTrivialDestructor(); + return true; +} + /// addLocalScopeForVarDecl - Add LocalScope for variable declaration. It will /// create add scope for automatic objects and temporary objects bound to /// const reference. Will reuse Scope if not NULL.
LocalScope* CFGBuilder::addLocalScopeForVarDecl(VarDecl *VD, LocalScope* Scope) { - if (!BuildOpts.AddImplicitDtors) + assert(!(BuildOpts.AddImplicitDtors && BuildOpts.AddLifetime) && + "AddImplicitDtors and AddLifetime cannot be used at the same time"); + if (!BuildOpts.AddImplicitDtors && !BuildOpts.AddLifetime) return Scope; // Check if variable is local. @@ -1360,54 +1495,30 @@ LocalScope* CFGBuilder::addLocalScopeForVarDecl(VarDecl *VD, default: return Scope; } - // Check for const references bound to temporary. Set type to pointee. - QualType QT = VD->getType(); - if (QT.getTypePtr()->isReferenceType()) { - // Attempt to determine whether this declaration lifetime-extends a - // temporary. - // - // FIXME: This is incorrect. Non-reference declarations can lifetime-extend - // temporaries, and a single declaration can extend multiple temporaries. - // We should look at the storage duration on each nested - // MaterializeTemporaryExpr instead. - const Expr *Init = VD->getInit(); - if (!Init) - return Scope; - - // Lifetime-extending a temporary. - bool FoundMTE = false; - QT = getReferenceInitTemporaryType(*Context, Init, &FoundMTE); - if (!FoundMTE) - return Scope; - } - - // Check for constant size array. Set type to array element type. - while (const ConstantArrayType *AT = Context->getAsConstantArrayType(QT)) { - if (AT->getSize() == 0) - return Scope; - QT = AT->getElementType(); - } - - // Check if type is a C++ class with non-trivial destructor. - if (const CXXRecordDecl *CD = QT->getAsCXXRecordDecl()) - if (CD->hasDefinition() && !CD->hasTrivialDestructor()) { + if (BuildOpts.AddImplicitDtors) { + if (!hasTrivialDestructor(VD)) { // Add the variable to scope Scope = createOrReuseLocalScope(Scope); Scope->addVar(VD); ScopePos = Scope->begin(); } + return Scope; + } + + assert(BuildOpts.AddLifetime); + // Add the variable to scope + Scope = createOrReuseLocalScope(Scope); + Scope->addVar(VD); + ScopePos = Scope->begin(); return Scope; } /// addLocalScopeAndDtors - For given statement add local scope for it and /// add destructors that will cleanup the scope. Will reuse Scope if not NULL. void CFGBuilder::addLocalScopeAndDtors(Stmt *S) { - if (!BuildOpts.AddImplicitDtors) - return; - LocalScope::const_iterator scopeBeginPos = ScopePos; addLocalScopeForStmt(S); - addAutomaticObjDtors(ScopePos, scopeBeginPos, S); + addAutomaticObjHandling(ScopePos, scopeBeginPos, S); } /// prependAutomaticObjDtorsWithTerminator - Prepend destructor CFGElements for @@ -1419,6 +1530,8 @@ void CFGBuilder::addLocalScopeAndDtors(Stmt *S) { /// no-return destructors properly. void CFGBuilder::prependAutomaticObjDtorsWithTerminator(CFGBlock *Blk, LocalScope::const_iterator B, LocalScope::const_iterator E) { + if (!BuildOpts.AddImplicitDtors) + return; BumpVectorContext &C = cfg->getBumpVectorContext(); CFGBlock::iterator InsertPos = Blk->beginAutomaticObjDtorsInsert(Blk->end(), B.distance(E), C); @@ -1427,6 +1540,21 @@ void CFGBuilder::prependAutomaticObjDtorsWithTerminator(CFGBlock *Blk, Blk->getTerminator()); } +/// prependAutomaticObjLifetimeWithTerminator - Prepend lifetime CFGElements for +/// variables with automatic storage duration to CFGBlock's elements vector. +/// Elements will be prepended to physical beginning of the vector which +/// happens to be logical end. Use blocks terminator as statement that specifies +/// where lifetime ends. 
+void CFGBuilder::prependAutomaticObjLifetimeWithTerminator( + CFGBlock *Blk, LocalScope::const_iterator B, LocalScope::const_iterator E) { + if (!BuildOpts.AddLifetime) + return; + BumpVectorContext &C = cfg->getBumpVectorContext(); + CFGBlock::iterator InsertPos = + Blk->beginLifetimeEndsInsert(Blk->end(), B.distance(E), C); + for (LocalScope::const_iterator I = B; I != E; ++I) + InsertPos = Blk->insertLifetimeEnds(InsertPos, *I, Blk->getTerminator()); +} /// Visit - Walk the subtree of a statement and add extra /// blocks for ternary operators, &&, and ||. We also process "," and /// DeclStmts (which may contain nested control-flow). @@ -1815,7 +1943,7 @@ CFGBlock *CFGBuilder::VisitBreakStmt(BreakStmt *B) { // If there is no target for the break, then we are looking at an incomplete // AST. This means that the CFG cannot be constructed. if (BreakJumpTarget.block) { - addAutomaticObjDtors(ScopePos, BreakJumpTarget.scopePosition, B); + addAutomaticObjHandling(ScopePos, BreakJumpTarget.scopePosition, B); addSuccessor(Block, BreakJumpTarget.block); } else badCFG = true; @@ -1947,13 +2075,12 @@ CFGBlock *CFGBuilder::VisitChooseExpr(ChooseExpr *C, CFGBlock *CFGBuilder::VisitCompoundStmt(CompoundStmt *C) { LocalScope::const_iterator scopeBeginPos = ScopePos; - if (BuildOpts.AddImplicitDtors) { - addLocalScopeForStmt(C); - } + addLocalScopeForStmt(C); + if (!C->body_empty() && !isa(*C->body_rbegin())) { // If the body ends with a ReturnStmt, the dtors will be added in // VisitReturnStmt. - addAutomaticObjDtors(ScopePos, scopeBeginPos, C); + addAutomaticObjHandling(ScopePos, scopeBeginPos, C); } CFGBlock *LastBlock = Block; @@ -2183,7 +2310,7 @@ CFGBlock *CFGBuilder::VisitIfStmt(IfStmt *I) { if (VarDecl *VD = I->getConditionVariable()) addLocalScopeForVarDecl(VD); - addAutomaticObjDtors(ScopePos, save_scope_pos.get(), I); + addAutomaticObjHandling(ScopePos, save_scope_pos.get(), I); // The block we were processing is now finished. Make it the successor // block. @@ -2308,7 +2435,7 @@ CFGBlock *CFGBuilder::VisitReturnStmt(ReturnStmt *R) { // Create the new block. Block = createBlock(false); - addAutomaticObjDtors(ScopePos, LocalScope::const_iterator(), R); + addAutomaticObjHandling(ScopePos, LocalScope::const_iterator(), R); // If the one of the destructors does not return, we already have the Exit // block as a successor. @@ -2389,7 +2516,7 @@ CFGBlock *CFGBuilder::VisitGotoStmt(GotoStmt *G) { BackpatchBlocks.push_back(JumpSource(Block, ScopePos)); else { JumpTarget JT = I->second; - addAutomaticObjDtors(ScopePos, JT.scopePosition, G); + addAutomaticObjHandling(ScopePos, JT.scopePosition, G); addSuccessor(Block, JT.block); } @@ -2414,7 +2541,7 @@ CFGBlock *CFGBuilder::VisitForStmt(ForStmt *F) { addLocalScopeForVarDecl(VD); LocalScope::const_iterator ContinueScopePos = ScopePos; - addAutomaticObjDtors(ScopePos, save_scope_pos.get(), F); + addAutomaticObjHandling(ScopePos, save_scope_pos.get(), F); // "for" is a control-flow statement. Thus we stop processing the current // block. @@ -2466,7 +2593,7 @@ CFGBlock *CFGBuilder::VisitForStmt(ForStmt *F) { ContinueJumpTarget.block->setLoopTarget(F); // Loop body should end with destructor of Condition variable (if any). - addAutomaticObjDtors(ScopePos, LoopBeginScopePos, F); + addAutomaticObjHandling(ScopePos, LoopBeginScopePos, F); // If body is not a compound statement create implicit scope // and add destructors. 
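As a quick illustration of what the rerouted Visit* methods above produce (an example of ours, not part of the patch): with the new lifetime mode, every automatic variable gets an end-of-lifetime CFG element when its scope is left, whereas the pre-existing implicit-dtor mode only emits elements for variables with non-trivial destructors.

#include <string>

void f(bool cond) {
  int a = 0;              // trivial destructor
  if (cond) {
    std::string s = "x";  // non-trivial destructor
  }                       // lifetime mode: "s (Lifetime ends)" element here
                          // dtor mode: implicit ~basic_string() element here
}                         // lifetime mode: "a (Lifetime ends)" element here;
                          // dtor mode: nothing for a

The two modes are mutually exclusive, as the new assert in buildCFG enforces, so a given CFG carries either destructor elements or lifetime-end elements, never both.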
@@ -2753,7 +2880,7 @@ CFGBlock *CFGBuilder::VisitWhileStmt(WhileStmt *W) { LocalScope::const_iterator LoopBeginScopePos = ScopePos; if (VarDecl *VD = W->getConditionVariable()) { addLocalScopeForVarDecl(VD); - addAutomaticObjDtors(ScopePos, LoopBeginScopePos, W); + addAutomaticObjHandling(ScopePos, LoopBeginScopePos, W); } // "while" is a control-flow statement. Thus we stop processing the current @@ -2788,7 +2915,7 @@ CFGBlock *CFGBuilder::VisitWhileStmt(WhileStmt *W) { BreakJumpTarget = JumpTarget(LoopSuccessor, ScopePos); // Loop body should end with destructor of Condition variable (if any). - addAutomaticObjDtors(ScopePos, LoopBeginScopePos, W); + addAutomaticObjHandling(ScopePos, LoopBeginScopePos, W); // If body is not a compound statement create implicit scope // and add destructors. @@ -3030,7 +3157,7 @@ CFGBlock *CFGBuilder::VisitContinueStmt(ContinueStmt *C) { // If there is no target for the continue, then we are looking at an // incomplete AST. This means the CFG cannot be constructed. if (ContinueJumpTarget.block) { - addAutomaticObjDtors(ScopePos, ContinueJumpTarget.scopePosition, C); + addAutomaticObjHandling(ScopePos, ContinueJumpTarget.scopePosition, C); addSuccessor(Block, ContinueJumpTarget.block); } else badCFG = true; @@ -3085,7 +3212,7 @@ CFGBlock *CFGBuilder::VisitSwitchStmt(SwitchStmt *Terminator) { if (VarDecl *VD = Terminator->getConditionVariable()) addLocalScopeForVarDecl(VD); - addAutomaticObjDtors(ScopePos, save_scope_pos.get(), Terminator); + addAutomaticObjHandling(ScopePos, save_scope_pos.get(), Terminator); if (Block) { if (badCFG) @@ -3373,7 +3500,7 @@ CFGBlock *CFGBuilder::VisitCXXCatchStmt(CXXCatchStmt *CS) { if (VarDecl *VD = CS->getExceptionDecl()) { LocalScope::const_iterator BeginScopePos = ScopePos; addLocalScopeForVarDecl(VD); - addAutomaticObjDtors(ScopePos, BeginScopePos, CS); + addAutomaticObjHandling(ScopePos, BeginScopePos, CS); } if (CS->getHandlerBlock()) @@ -3427,7 +3554,7 @@ CFGBlock *CFGBuilder::VisitCXXForRangeStmt(CXXForRangeStmt *S) { addLocalScopeForStmt(Begin); if (Stmt *End = S->getEndStmt()) addLocalScopeForStmt(End); - addAutomaticObjDtors(ScopePos, save_scope_pos.get(), S); + addAutomaticObjHandling(ScopePos, save_scope_pos.get(), S); LocalScope::const_iterator ContinueScopePos = ScopePos; @@ -3898,6 +4025,7 @@ CFGImplicitDtor::getDestructorDecl(ASTContext &astContext) const { case CFGElement::Statement: case CFGElement::Initializer: case CFGElement::NewAllocator: + case CFGElement::LifetimeEnds: llvm_unreachable("getDestructorDecl should only be used with " "ImplicitDtors"); case CFGElement::AutomaticObjectDtor: { @@ -4308,6 +4436,12 @@ static void print_elem(raw_ostream &OS, StmtPrinterHelper &Helper, OS << ".~" << T->getAsCXXRecordDecl()->getName().str() << "()"; OS << " (Implicit destructor)\n"; + } else if (Optional DE = E.getAs()) { + const VarDecl *VD = DE->getVarDecl(); + Helper.handleDecl(VD, OS); + + OS << " (Lifetime ends)\n"; + } else if (Optional NE = E.getAs()) { OS << "CFGNewAllocator("; if (const CXXNewExpr *AllocExpr = NE->getAllocatorExpr()) diff --git a/lib/Analysis/CloneDetection.cpp b/lib/Analysis/CloneDetection.cpp index ee848ac711d6..5ea74989a7ec 100644 --- a/lib/Analysis/CloneDetection.cpp +++ b/lib/Analysis/CloneDetection.cpp @@ -16,13 +16,13 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" -#include "clang/AST/StmtVisitor.h" #include "clang/Lex/Lexer.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" #include 
"llvm/Support/Path.h" using namespace clang; +using namespace clang::clone_detection; StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, unsigned EndIndex) @@ -103,12 +103,8 @@ static void printMacroName(llvm::raw_string_ostream &MacroStack, MacroStack << " "; } -/// Returns a string that represents all macro expansions that expanded into the -/// given SourceLocation. -/// -/// If 'getMacroStack(A) == getMacroStack(B)' is true, then the SourceLocations -/// A and B are expanded from the same macros in the same order. -static std::string getMacroStack(SourceLocation Loc, ASTContext &Context) { +std::string clone_detection::getMacroStack(SourceLocation Loc, + ASTContext &Context) { std::string MacroStack; llvm::raw_string_ostream MacroStackStream(MacroStack); SourceManager &SM = Context.getSourceManager(); @@ -123,184 +119,6 @@ static std::string getMacroStack(SourceLocation Loc, ASTContext &Context) { return MacroStack; } -namespace { -typedef unsigned DataPiece; - -/// Collects the data of a single Stmt. -/// -/// This class defines what a code clone is: If it collects for two statements -/// the same data, then those two statements are considered to be clones of each -/// other. -/// -/// All collected data is forwarded to the given data consumer of the type T. -/// The data consumer class needs to provide a member method with the signature: -/// update(StringRef Str) -template -class StmtDataCollector : public ConstStmtVisitor> { - - ASTContext &Context; - /// The data sink to which all data is forwarded. - T &DataConsumer; - -public: - /// Collects data of the given Stmt. - /// \param S The given statement. - /// \param Context The ASTContext of S. - /// \param DataConsumer The data sink to which all data is forwarded. - StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer) - : Context(Context), DataConsumer(DataConsumer) { - this->Visit(S); - } - - // Below are utility methods for appending different data to the vector. - - void addData(DataPiece Integer) { - DataConsumer.update( - StringRef(reinterpret_cast(&Integer), sizeof(Integer))); - } - - void addData(llvm::StringRef Str) { DataConsumer.update(Str); } - - void addData(const QualType &QT) { addData(QT.getAsString()); } - -// The functions below collect the class specific data of each Stmt subclass. - -// Utility macro for defining a visit method for a given class. This method -// calls back to the ConstStmtVisitor to visit all parent classes. -#define DEF_ADD_DATA(CLASS, CODE) \ - void Visit##CLASS(const CLASS *S) { \ - CODE; \ - ConstStmtVisitor::Visit##CLASS(S); \ - } - - DEF_ADD_DATA(Stmt, { - addData(S->getStmtClass()); - // This ensures that macro generated code isn't identical to macro-generated - // code. - addData(getMacroStack(S->getLocStart(), Context)); - addData(getMacroStack(S->getLocEnd(), Context)); - }) - DEF_ADD_DATA(Expr, { addData(S->getType()); }) - - //--- Builtin functionality ----------------------------------------------// - DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); }) - DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); }) - DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); }) - DEF_ADD_DATA(TypeTraitExpr, { - addData(S->getTrait()); - for (unsigned i = 0; i < S->getNumArgs(); ++i) - addData(S->getArg(i)->getType()); - }) - - //--- Calls --------------------------------------------------------------// - DEF_ADD_DATA(CallExpr, { - // Function pointers don't have a callee and we just skip hashing it. 
- if (const FunctionDecl *D = S->getDirectCallee()) { - // If the function is a template specialization, we also need to handle - // the template arguments as they are not included in the qualified name. - if (auto Args = D->getTemplateSpecializationArgs()) { - std::string ArgString; - - // Print all template arguments into ArgString - llvm::raw_string_ostream OS(ArgString); - for (unsigned i = 0; i < Args->size(); ++i) { - Args->get(i).print(Context.getLangOpts(), OS); - // Add a padding character so that 'foo()' != 'foo()'. - OS << '\n'; - } - OS.flush(); - - addData(ArgString); - } - addData(D->getQualifiedNameAsString()); - } - }) - - //--- Exceptions ---------------------------------------------------------// - DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); }) - - //--- C++ OOP Stmts ------------------------------------------------------// - DEF_ADD_DATA(CXXDeleteExpr, { - addData(S->isArrayFormAsWritten()); - addData(S->isGlobalDelete()); - }) - - //--- Casts --------------------------------------------------------------// - DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); }) - - //--- Miscellaneous Exprs ------------------------------------------------// - DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); }) - DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); }) - - //--- Control flow -------------------------------------------------------// - DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); }) - DEF_ADD_DATA(IndirectGotoStmt, { - if (S->getConstantTarget()) - addData(S->getConstantTarget()->getName()); - }) - DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); }) - DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); }) - DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); }) - - //--- Objective-C --------------------------------------------------------// - DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); }) - DEF_ADD_DATA(ObjCPropertyRefExpr, { - addData(S->isSuperReceiver()); - addData(S->isImplicitProperty()); - }) - DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); }) - - //--- Miscellaneous Stmts ------------------------------------------------// - DEF_ADD_DATA(CXXFoldExpr, { - addData(S->isRightFold()); - addData(S->getOperator()); - }) - DEF_ADD_DATA(GenericSelectionExpr, { - for (unsigned i = 0; i < S->getNumAssocs(); ++i) { - addData(S->getAssocType(i)); - } - }) - DEF_ADD_DATA(LambdaExpr, { - for (const LambdaCapture &C : S->captures()) { - addData(C.isPackExpansion()); - addData(C.getCaptureKind()); - if (C.capturesVariable()) - addData(C.getCapturedVar()->getType()); - } - addData(S->isGenericLambda()); - addData(S->isMutable()); - }) - DEF_ADD_DATA(DeclStmt, { - auto numDecls = std::distance(S->decl_begin(), S->decl_end()); - addData(static_cast(numDecls)); - for (const Decl *D : S->decls()) { - if (const VarDecl *VD = dyn_cast(D)) { - addData(VD->getType()); - } - } - }) - DEF_ADD_DATA(AsmStmt, { - addData(S->isSimple()); - addData(S->isVolatile()); - addData(S->generateAsmString(Context)); - for (unsigned i = 0; i < S->getNumInputs(); ++i) { - addData(S->getInputConstraint(i)); - } - for (unsigned i = 0; i < S->getNumOutputs(); ++i) { - addData(S->getOutputConstraint(i)); - } - for (unsigned i = 0; i < S->getNumClobbers(); ++i) { - addData(S->getClobber(i)); - } - }) - DEF_ADD_DATA(AttributedStmt, { - for (const Attr *A : S->getAttrs()) { - addData(std::string(A->getSpelling())); - } - }) -}; -} // end anonymous namespace - void 
CloneDetector::analyzeCodeBody(const Decl *D) { assert(D); assert(D->hasBody()); @@ -421,16 +239,27 @@ size_t RecursiveCloneTypeIIConstraint::saveHash( } if (CS) { - for (unsigned Length = 2; Length <= CS->size(); ++Length) { - for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { - llvm::MD5 Hash; - for (unsigned i = Pos; i < Pos + Length; ++i) { - size_t ChildHash = ChildHashes[i]; - Hash.update(StringRef(reinterpret_cast<char *>(&ChildHash), - sizeof(ChildHash))); + // If we're in a CompoundStmt, we hash all possible combinations of child + // statements to find clones in those subsequences. + // We first go through every possible starting position of a subsequence. + for (unsigned Pos = 0; Pos < CS->size(); ++Pos) { + // Then we try all possible lengths this subsequence could have and + // reuse the same hash object to make sure we only hash every child + // hash exactly once. + llvm::MD5 Hash; + for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) { + // Grab the current child hash and put it into our hash. We do + // -1 on the index because we start counting the length at 1. + size_t ChildHash = ChildHashes[Pos + Length - 1]; + Hash.update( + StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); + // If we have at least two elements in our subsequence, we can start + // saving it. + if (Length > 1) { + llvm::MD5 SubHash = Hash; + StmtsByHash.push_back(std::make_pair( + createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length))); } - StmtsByHash.push_back(std::make_pair( - createHash(Hash), StmtSequence(CS, D, Pos, Pos + Length))); } } } diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp index 4f04489a4a10..50b4fc34ad3a 100644 --- a/lib/Basic/Targets.cpp +++ b/lib/Basic/Targets.cpp @@ -499,6 +499,10 @@ class LinuxTargetInfo : public OSTargetInfo<Target> { switch (Triple.getArch()) { default: break; + case llvm::Triple::mips: + case llvm::Triple::mipsel: + case llvm::Triple::mips64: + case llvm::Triple::mips64el: case llvm::Triple::ppc: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: @@ -2049,7 +2053,7 @@ ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const { return llvm::makeArrayRef(GCCRegNames); } -static const LangAS::Map AMDGPUNonOpenCLPrivateIsZeroMap = { +static const LangAS::Map AMDGPUPrivIsZeroDefIsGenMap = { 4, // Default 1, // opencl_global 3, // opencl_local 2, // opencl_constant 0, // opencl_private 4, // opencl_generic 1, // cuda_device 2, // cuda_constant 3 // cuda_shared }; @@ -2059,7 +2063,7 @@ static const LangAS::Map AMDGPUNonOpenCLPrivateIsZeroMap = { -static const LangAS::Map AMDGPUNonOpenCLGenericIsZeroMap = { +static const LangAS::Map AMDGPUGenIsZeroDefIsGenMap = { 0, // Default 1, // opencl_global 3, // opencl_local 2, // opencl_constant 5, // opencl_private 0, // opencl_generic 1, // cuda_device 2, // cuda_constant 3 // cuda_shared }; @@ -2069,7 +2073,7 @@ static const LangAS::Map AMDGPUNonOpenCLGenericIsZeroMap = { -static const LangAS::Map AMDGPUOpenCLPrivateIsZeroMap = { +static const LangAS::Map AMDGPUPrivIsZeroDefIsPrivMap = { 0, // Default 1, // opencl_global 3, // opencl_local 2, // opencl_constant 0, // opencl_private 4, // opencl_generic 1, // cuda_device 2, // cuda_constant 3 // cuda_shared }; @@ -2079,7 +2083,7 @@ static const LangAS::Map AMDGPUOpenCLPrivateIsZeroMap = { -static const LangAS::Map AMDGPUOpenCLGenericIsZeroMap = { +static const LangAS::Map AMDGPUGenIsZeroDefIsPrivMap = { 5, // Default 1, // opencl_global 3, // opencl_local 2, // opencl_constant 5, // opencl_private 0, // opencl_generic 1, // cuda_device 2, // cuda_constant 3 // cuda_shared }; @@ -2184,18 +2188,35 @@ class AMDGPUTargetInfo final : public TargetInfo { : DataLayoutStringR600); assert(DataLayout->getAllocaAddrSpace() == AS.Private); + setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || + Triple.getEnvironment() == llvm::Triple::OpenCL || + Triple.getEnvironmentName() == "amdgizcl" || + !isAMDGCN(Triple));
UseAddrSpaceMapMangling = true; + + // Set pointer width and alignment for target address space 0. + PointerWidth = PointerAlign = DataLayout->getPointerSizeInBits(); + if (getMaxPointerWidth() == 64) { + LongWidth = LongAlign = 64; + SizeType = UnsignedLong; + PtrDiffType = SignedLong; + IntPtrType = SignedLong; + } + } + + void setAddressSpaceMap(bool DefaultIsPrivate) { + if (isGenericZero(getTriple())) { + AddrSpaceMap = DefaultIsPrivate ? &AMDGPUGenIsZeroDefIsPrivMap + : &AMDGPUGenIsZeroDefIsGenMap; + } else { + AddrSpaceMap = DefaultIsPrivate ? &AMDGPUPrivIsZeroDefIsPrivMap + : &AMDGPUPrivIsZeroDefIsGenMap; + } } void adjust(LangOptions &Opts) override { TargetInfo::adjust(Opts); - if (isGenericZero(getTriple())) { - AddrSpaceMap = Opts.OpenCL ? &AMDGPUOpenCLGenericIsZeroMap - : &AMDGPUNonOpenCLGenericIsZeroMap; - } else { - AddrSpaceMap = Opts.OpenCL ? &AMDGPUOpenCLPrivateIsZeroMap - : &AMDGPUNonOpenCLPrivateIsZeroMap; - } + setAddressSpaceMap(Opts.OpenCL || !isAMDGCN(getTriple())); } uint64_t getPointerWidthV(unsigned AddrSpace) const override { @@ -2208,6 +2229,10 @@ class AMDGPUTargetInfo final : public TargetInfo { return 64; } + uint64_t getPointerAlignV(unsigned AddrSpace) const override { + return getPointerWidthV(AddrSpace); + } + uint64_t getMaxPointerWidth() const override { return getTriple().getArch() == llvm::Triple::amdgcn ? 64 : 32; } @@ -2383,14 +2408,13 @@ class AMDGPUTargetInfo final : public TargetInfo { return LangAS::opencl_constant; } - /// \returns Target specific vtbl ptr address space. - unsigned getVtblPtrAddressSpace() const override { - // \todo: We currently have address spaces defined in AMDGPU Backend. It - // would be nice if we could use it here instead of using bare numbers (same - // applies to getDWARFAddressSpace). - return 2; // constant. + llvm::Optional getConstantAddressSpace() const override { + return LangAS::FirstTargetAddressSpace + AS.Constant; } + /// \returns Target specific vtbl ptr address space. + unsigned getVtblPtrAddressSpace() const override { return AS.Constant; } + /// \returns If a target requires an address within a target specific address /// space \p AddressSpace to be converted in order to be used, then return the /// corresponding target specific DWARF address space. 
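The AMDGPU renames above make the two independent choices explicit in each map's name: whether AST address space 0 is the private or the generic hardware space (a property of the triple), and whether the default, unqualified AST address space maps to private or generic (a property of the language mode; OpenCL and non-AMDGCN targets keep it private). A hedged sketch of that two-axis selection, with only the Default slots that are visible in the hunk filled in:

// The four maps differ along two independent axes; the Default values
// below are the ones visible in the hunk (other slots omitted).
struct ASMap { unsigned DefaultAS; /* ... remaining language AS slots ... */ };

static const ASMap GenIsZeroDefIsGen   = {0};
static const ASMap GenIsZeroDefIsPriv  = {5};
static const ASMap PrivIsZeroDefIsGen  = {4};
static const ASMap PrivIsZeroDefIsPriv = {0};

// Mirrors the new setAddressSpaceMap(bool DefaultIsPrivate): the triple
// fixes generic-is-zero once; the language mode can flip DefaultIsPrivate
// later via adjust(), as in setAddressSpaceMap(Opts.OpenCL || !isAMDGCN).
const ASMap *selectMap(bool GenericIsZero, bool DefaultIsPrivate) {
  if (GenericIsZero)
    return DefaultIsPrivate ? &GenIsZeroDefIsPriv : &GenIsZeroDefIsGen;
  return DefaultIsPrivate ? &PrivIsZeroDefIsPriv : &PrivIsZeroDefIsGen;
}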
@@ -3874,7 +3898,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_PentiumMMX: Builder.defineMacro("__pentium_mmx__"); Builder.defineMacro("__tune_pentium_mmx__"); - // Fallthrough + LLVM_FALLTHROUGH; case CK_i586: case CK_Pentium: defineCPUMacros(Builder, "i586"); @@ -3884,15 +3908,15 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_Pentium3M: case CK_PentiumM: Builder.defineMacro("__tune_pentium3__"); - // Fallthrough + LLVM_FALLTHROUGH; case CK_Pentium2: case CK_C3_2: Builder.defineMacro("__tune_pentium2__"); - // Fallthrough + LLVM_FALLTHROUGH; case CK_PentiumPro: Builder.defineMacro("__tune_i686__"); Builder.defineMacro("__tune_pentiumpro__"); - // Fallthrough + LLVM_FALLTHROUGH; case CK_i686: Builder.defineMacro("__i686"); Builder.defineMacro("__i686__"); @@ -3948,7 +3972,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_K6_2: Builder.defineMacro("__k6_2__"); Builder.defineMacro("__tune_k6_2__"); - // Fallthrough + LLVM_FALLTHROUGH; case CK_K6_3: if (CPU != CK_K6_2) { // In case of fallthrough // FIXME: GCC may be enabling these in cases where some other k6 @@ -3957,7 +3981,7 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__k6_3__"); Builder.defineMacro("__tune_k6_3__"); } - // Fallthrough + LLVM_FALLTHROUGH; case CK_K6: defineCPUMacros(Builder, "k6"); break; @@ -6310,9 +6334,6 @@ class AArch64TargetInfo : public TargetInfo { MacroBuilder &Builder) const { // Also include the ARMv8.1 defines getTargetDefinesARMV81A(Opts, Builder); - - if (FPU == NeonMode && HasFullFP16) - Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1"); } void getTargetDefines(const LangOptions &Opts, @@ -7408,13 +7429,14 @@ class SystemZTargetInfo : public TargetInfo { static const Builtin::Info BuiltinInfo[]; static const char *const GCCRegNames[]; std::string CPU; + int ISARevision; bool HasTransactionalExecution; bool HasVector; public: SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &) - : TargetInfo(Triple), CPU("z10"), HasTransactionalExecution(false), - HasVector(false) { + : TargetInfo(Triple), CPU("z10"), ISARevision(8), + HasTransactionalExecution(false), HasVector(false) { IntMaxType = SignedLong; Int64Type = SignedLong; TLSSupported = true; @@ -7436,14 +7458,7 @@ class SystemZTargetInfo : public TargetInfo { Builder.defineMacro("__zarch__"); Builder.defineMacro("__LONG_DOUBLE_128__"); - const std::string ISARev = llvm::StringSwitch(CPU) - .Cases("arch8", "z10", "8") - .Cases("arch9", "z196", "9") - .Cases("arch10", "zEC12", "10") - .Cases("arch11", "z13", "11") - .Default(""); - if (!ISARev.empty()) - Builder.defineMacro("__ARCH__", ISARev); + Builder.defineMacro("__ARCH__", Twine(ISARevision)); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1"); Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2"); @@ -7476,37 +7491,35 @@ class SystemZTargetInfo : public TargetInfo { BuiltinVaListKind getBuiltinVaListKind() const override { return TargetInfo::SystemZBuiltinVaList; } + int getISARevision(const StringRef &Name) const { + return llvm::StringSwitch(Name) + .Cases("arch8", "z10", 8) + .Cases("arch9", "z196", 9) + .Cases("arch10", "zEC12", 10) + .Cases("arch11", "z13", 11) + .Default(-1); + } bool setCPU(const std::string &Name) override { CPU = Name; - bool CPUKnown = llvm::StringSwitch(Name) - .Case("z10", true) - .Case("arch8", true) - .Case("z196", true) - .Case("arch9", true) - .Case("zEC12", true) - .Case("arch10", true) - .Case("z13", true) - 
.Case("arch11", true) - .Default(false); - - return CPUKnown; + ISARevision = getISARevision(CPU); + return ISARevision != -1; } bool initFeatureMap(llvm::StringMap &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector &FeaturesVec) const override { - if (CPU == "zEC12" || CPU == "arch10") - Features["transactional-execution"] = true; - if (CPU == "z13" || CPU == "arch11") { + int ISARevision = getISARevision(CPU); + if (ISARevision >= 10) Features["transactional-execution"] = true; + if (ISARevision >= 11) Features["vector"] = true; - } return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } bool handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) override { HasTransactionalExecution = false; + HasVector = false; for (const auto &Feature : Features) { if (Feature == "+transactional-execution") HasTransactionalExecution = true; @@ -7525,6 +7538,10 @@ class SystemZTargetInfo : public TargetInfo { bool hasFeature(StringRef Feature) const override { return llvm::StringSwitch(Feature) .Case("systemz", true) + .Case("arch8", ISARevision >= 8) + .Case("arch9", ISARevision >= 9) + .Case("arch10", ISARevision >= 10) + .Case("arch11", ISARevision >= 11) .Case("htm", HasTransactionalExecution) .Case("vx", HasVector) .Default(false); diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp index 9b3850abcce7..513896d98634 100644 --- a/lib/CodeGen/BackendUtil.cpp +++ b/lib/CodeGen/BackendUtil.cpp @@ -998,7 +998,7 @@ static void runThinLTOBackend(ModuleSummaryIndex *CombinedIndex, Module *M, std::unique_ptr OS, std::string SampleProfile, BackendAction Action) { - StringMap> + StringMap> ModuleToDefinedGVSummaries; CombinedIndex->collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries); diff --git a/lib/CodeGen/CGBlocks.cpp b/lib/CodeGen/CGBlocks.cpp index 528a2b33acf8..181048957879 100644 --- a/lib/CodeGen/CGBlocks.cpp +++ b/lib/CodeGen/CGBlocks.cpp @@ -736,9 +736,9 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { llvm::Constant *isa = (!CGM.getContext().getLangOpts().OpenCL) ? CGM.getNSConcreteStackBlock() - : CGM.getNullPointer(cast( - CGM.getNSConcreteStackBlock()->getType()), - QualType(getContext().VoidPtrTy)); + : CGM.getNullPointer(VoidPtrPtrTy, + CGM.getContext().getPointerType( + QualType(CGM.getContext().VoidPtrTy))); isa = llvm::ConstantExpr::getBitCast(isa, VoidPtrTy); // Build the block descriptor. @@ -1141,12 +1141,11 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM, auto fields = builder.beginStruct(); // isa - fields.add( - (!CGM.getContext().getLangOpts().OpenCL) - ? CGM.getNSConcreteGlobalBlock() - : CGM.getNullPointer(cast( - CGM.getNSConcreteGlobalBlock()->getType()), - QualType(CGM.getContext().VoidPtrTy))); + fields.add((!CGM.getContext().getLangOpts().OpenCL) + ? CGM.getNSConcreteGlobalBlock() + : CGM.getNullPointer(CGM.VoidPtrPtrTy, + CGM.getContext().getPointerType(QualType( + CGM.getContext().VoidPtrTy)))); // __flags BlockFlags flags = BLOCK_IS_GLOBAL | BLOCK_HAS_SIGNATURE; @@ -1255,7 +1254,7 @@ CodeGenFunction::GenerateBlockFunction(GlobalDecl GD, // For OpenCL passed block pointer can be private AS local variable or // global AS program scope variable (for the case with and without captures). - // Generic AS is used therefore to be able to accomodate both private and + // Generic AS is used therefore to be able to accommodate both private and // generic AS in one implementation. 
if (getLangOpts().OpenCL) selfTy = getContext().getPointerType(getContext().getAddrSpaceQualType( diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 2a6e92e7f3ce..bc902507c46e 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -1810,12 +1810,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, case Builtin::BI__atomic_signal_fence: case Builtin::BI__c11_atomic_thread_fence: case Builtin::BI__c11_atomic_signal_fence: { - llvm::SynchronizationScope Scope; + llvm::SyncScope::ID SSID; if (BuiltinID == Builtin::BI__atomic_signal_fence || BuiltinID == Builtin::BI__c11_atomic_signal_fence) - Scope = llvm::SingleThread; + SSID = llvm::SyncScope::SingleThread; else - Scope = llvm::CrossThread; + SSID = llvm::SyncScope::System; Value *Order = EmitScalarExpr(E->getArg(0)); if (isa(Order)) { int ord = cast(Order)->getZExtValue(); @@ -1825,17 +1825,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, break; case 1: // memory_order_consume case 2: // memory_order_acquire - Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID); break; case 3: // memory_order_release - Builder.CreateFence(llvm::AtomicOrdering::Release, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Release, SSID); break; case 4: // memory_order_acq_rel - Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope); + Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID); break; case 5: // memory_order_seq_cst - Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, - Scope); + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID); break; } return RValue::get(nullptr); @@ -1852,23 +1851,23 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB); Builder.SetInsertPoint(AcquireBB); - Builder.CreateFence(llvm::AtomicOrdering::Acquire, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(1), AcquireBB); SI->addCase(Builder.getInt32(2), AcquireBB); Builder.SetInsertPoint(ReleaseBB); - Builder.CreateFence(llvm::AtomicOrdering::Release, Scope); + Builder.CreateFence(llvm::AtomicOrdering::Release, SSID); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(3), ReleaseBB); Builder.SetInsertPoint(AcqRelBB); - Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, Scope); + Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(4), AcqRelBB); Builder.SetInsertPoint(SeqCstBB); - Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, Scope); + Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32(5), SeqCstBB); @@ -2956,9 +2955,8 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, return llvm::VectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad)); case NeonTypeFlags::Int16: case NeonTypeFlags::Poly16: - return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); case NeonTypeFlags::Float16: - return llvm::VectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad)); + return llvm::VectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); case NeonTypeFlags::Int32: return llvm::VectorType::get(CGF->Int32Ty, V1Ty ? 
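The CGBuiltin hunk above replaces the removed llvm::SynchronizationScope enum with the more general llvm::SyncScope::ID: signal fences get single-thread scope, thread fences get system scope, and the memory_order argument still selects the fence ordering exactly as before. A sketch of that mapping in plain C++, with LLVM-free stand-ins for the enums:

enum class Ordering { None, Acquire, Release, AcquireRelease, SeqCst };
enum class Scope { SingleThread, System };

struct Fence { Ordering O; Scope S; };

// memory_order constants as the builtin receives them:
// 0 relaxed, 1 consume, 2 acquire, 3 release, 4 acq_rel, 5 seq_cst.
// Ordering::None means no fence is emitted (relaxed), mirroring the
// patched switch, which simply breaks for case 0.
Fence fenceFor(int MemoryOrder, bool IsSignalFence) {
  Scope S = IsSignalFence ? Scope::SingleThread : Scope::System;
  switch (MemoryOrder) {
  case 1: // memory_order_consume is strengthened to acquire
  case 2:
    return {Ordering::Acquire, S};
  case 3:
    return {Ordering::Release, S};
  case 4:
    return {Ordering::AcquireRelease, S};
  case 5:
    return {Ordering::SeqCst, S};
  default:
    return {Ordering::None, S};
  }
}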
1 : (2 << IsQuad)); case NeonTypeFlags::Int64: @@ -2981,8 +2979,6 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF, NeonTypeFlags IntTypeFlags) { int IsQuad = IntTypeFlags.isQuad(); switch (IntTypeFlags.getEltType()) { - case NeonTypeFlags::Int16: - return llvm::VectorType::get(CGF->HalfTy, (4 << IsQuad)); case NeonTypeFlags::Int32: return llvm::VectorType::get(CGF->FloatTy, (2 << IsQuad)); case NeonTypeFlags::Int64: @@ -3130,80 +3126,55 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0), NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), - NEONMAP2(vcvt_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvt_n_s16_v, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvt_n_u16_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvt_s16_v), NEONMAP0(vcvt_s32_v), NEONMAP0(vcvt_s64_v), - NEONMAP0(vcvt_u16_v), NEONMAP0(vcvt_u32_v), NEONMAP0(vcvt_u64_v), - NEONMAP1(vcvta_s16_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0), NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0), - NEONMAP1(vcvtaq_s16_v, arm_neon_vcvtas, 0), NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0), NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0), - NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0), - NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0), - NEONMAP1(vcvtm_u16_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0), - NEONMAP1(vcvtmq_s16_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0), - NEONMAP1(vcvtmq_u16_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0), NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0), - NEONMAP1(vcvtn_s16_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0), - NEONMAP1(vcvtn_u16_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0), - NEONMAP1(vcvtnq_s16_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0), NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0), - NEONMAP1(vcvtnq_u16_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0), NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0), - NEONMAP1(vcvtp_s16_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0), - NEONMAP1(vcvtp_u16_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0), - NEONMAP1(vcvtpq_s16_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0), NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0), - NEONMAP1(vcvtpq_u16_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0), NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0), NEONMAP0(vcvtq_f32_v), - NEONMAP2(vcvtq_n_f16_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvtq_n_s16_v, arm_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0), 
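Across these tables the fp16 NEON intrinsics are backed out (arm_neon.td shrinks by the matching 185 lines), and Float16 folds back into the Int16 lane shape. What remains of the float-type helper is a simple contract: an integer lane type selects the float vector with the same lane count. A toy sketch of that mapping (illustrative types only):

// Toy model of GetFloatNeonType after the patch: Int32 lanes pick f32
// vectors, Int64 lanes pick f64; "quad" (128-bit) doubles the lanes.
struct VecType { const char *Elt; unsigned Lanes; };

VecType floatNeonType(unsigned IntEltBits, bool IsQuad) {
  switch (IntEltBits) {
  case 32: return {"f32", IsQuad ? 4u : 2u}; // v2f32 / v4f32
  case 64: return {"f64", IsQuad ? 2u : 1u}; // v1f64 / v2f64
  default: return {nullptr, 0};              // f16 lanes removed here
  }
}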
NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvtq_n_u16_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvtq_s16_v), NEONMAP0(vcvtq_s32_v), NEONMAP0(vcvtq_s64_v), - NEONMAP0(vcvtq_u16_v), NEONMAP0(vcvtq_u32_v), NEONMAP0(vcvtq_u64_v), NEONMAP0(vext_v), @@ -3366,27 +3337,19 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP1(vcnt_v, ctpop, Add1ArgType), NEONMAP1(vcntq_v, ctpop, Add1ArgType), NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0), - NEONMAP0(vcvt_f16_v), NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0), NEONMAP0(vcvt_f32_v), - NEONMAP2(vcvt_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvt_n_s16_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvt_n_u16_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), - NEONMAP0(vcvtq_f16_v), NEONMAP0(vcvtq_f32_v), - NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), - NEONMAP1(vcvtq_n_s16_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0), NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0), - NEONMAP1(vcvtq_n_u16_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType), @@ -3855,20 +3818,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcageq_v: case NEON::BI__builtin_neon_vcagt_v: case NEON::BI__builtin_neon_vcagtq_v: { - llvm::Type *Ty; - switch (VTy->getScalarSizeInBits()) { - default: llvm_unreachable("unexpected type"); - case 32: - Ty = FloatTy; - break; - case 64: - Ty = DoubleTy; - break; - case 16: - Ty = HalfTy; - break; - } - llvm::Type *VecFlt = llvm::VectorType::get(Ty, VTy->getNumElements()); + llvm::Type *VecFlt = llvm::VectorType::get( + VTy->getScalarSizeInBits() == 32 ? FloatTy : DoubleTy, + VTy->getNumElements()); llvm::Type *Tys[] = { VTy, VecFlt }; Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys); return EmitNeonCall(F, Ops, NameHint); @@ -3885,16 +3837,8 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad)); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - case NEON::BI__builtin_neon_vcvt_f16_v: - case NEON::BI__builtin_neon_vcvtq_f16_v: - Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad)); - return Usgn ? 
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") - : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); - case NEON::BI__builtin_neon_vcvt_n_f16_v: case NEON::BI__builtin_neon_vcvt_n_f32_v: case NEON::BI__builtin_neon_vcvt_n_f64_v: - case NEON::BI__builtin_neon_vcvtq_n_f16_v: case NEON::BI__builtin_neon_vcvtq_n_f32_v: case NEON::BI__builtin_neon_vcvtq_n_f64_v: { llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty }; @@ -3902,15 +3846,11 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( Function *F = CGM.getIntrinsic(Int, Tys); return EmitNeonCall(F, Ops, "vcvt_n"); } - case NEON::BI__builtin_neon_vcvt_n_s16_v: case NEON::BI__builtin_neon_vcvt_n_s32_v: - case NEON::BI__builtin_neon_vcvt_n_u16_v: case NEON::BI__builtin_neon_vcvt_n_u32_v: case NEON::BI__builtin_neon_vcvt_n_s64_v: case NEON::BI__builtin_neon_vcvt_n_u64_v: - case NEON::BI__builtin_neon_vcvtq_n_s16_v: case NEON::BI__builtin_neon_vcvtq_n_s32_v: - case NEON::BI__builtin_neon_vcvtq_n_u16_v: case NEON::BI__builtin_neon_vcvtq_n_u32_v: case NEON::BI__builtin_neon_vcvtq_n_s64_v: case NEON::BI__builtin_neon_vcvtq_n_u64_v: { @@ -3922,63 +3862,44 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvt_u32_v: case NEON::BI__builtin_neon_vcvt_s64_v: case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_v: - case NEON::BI__builtin_neon_vcvt_u16_v: case NEON::BI__builtin_neon_vcvtq_s32_v: case NEON::BI__builtin_neon_vcvtq_u32_v: case NEON::BI__builtin_neon_vcvtq_s64_v: - case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_v: - case NEON::BI__builtin_neon_vcvtq_u16_v: { + case NEON::BI__builtin_neon_vcvtq_u64_v: { Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt") : Builder.CreateFPToSI(Ops[0], Ty, "vcvt"); } - case NEON::BI__builtin_neon_vcvta_s16_v: case NEON::BI__builtin_neon_vcvta_s32_v: case NEON::BI__builtin_neon_vcvta_s64_v: case NEON::BI__builtin_neon_vcvta_u32_v: case NEON::BI__builtin_neon_vcvta_u64_v: - case NEON::BI__builtin_neon_vcvtaq_s16_v: case NEON::BI__builtin_neon_vcvtaq_s32_v: case NEON::BI__builtin_neon_vcvtaq_s64_v: - case NEON::BI__builtin_neon_vcvtaq_u16_v: case NEON::BI__builtin_neon_vcvtaq_u32_v: case NEON::BI__builtin_neon_vcvtaq_u64_v: - case NEON::BI__builtin_neon_vcvtn_s16_v: case NEON::BI__builtin_neon_vcvtn_s32_v: case NEON::BI__builtin_neon_vcvtn_s64_v: - case NEON::BI__builtin_neon_vcvtn_u16_v: case NEON::BI__builtin_neon_vcvtn_u32_v: case NEON::BI__builtin_neon_vcvtn_u64_v: - case NEON::BI__builtin_neon_vcvtnq_s16_v: case NEON::BI__builtin_neon_vcvtnq_s32_v: case NEON::BI__builtin_neon_vcvtnq_s64_v: - case NEON::BI__builtin_neon_vcvtnq_u16_v: case NEON::BI__builtin_neon_vcvtnq_u32_v: case NEON::BI__builtin_neon_vcvtnq_u64_v: - case NEON::BI__builtin_neon_vcvtp_s16_v: case NEON::BI__builtin_neon_vcvtp_s32_v: case NEON::BI__builtin_neon_vcvtp_s64_v: - case NEON::BI__builtin_neon_vcvtp_u16_v: case NEON::BI__builtin_neon_vcvtp_u32_v: case NEON::BI__builtin_neon_vcvtp_u64_v: - case NEON::BI__builtin_neon_vcvtpq_s16_v: case NEON::BI__builtin_neon_vcvtpq_s32_v: case NEON::BI__builtin_neon_vcvtpq_s64_v: - case NEON::BI__builtin_neon_vcvtpq_u16_v: case NEON::BI__builtin_neon_vcvtpq_u32_v: case NEON::BI__builtin_neon_vcvtpq_u64_v: - case NEON::BI__builtin_neon_vcvtm_s16_v: case NEON::BI__builtin_neon_vcvtm_s32_v: case NEON::BI__builtin_neon_vcvtm_s64_v: - case NEON::BI__builtin_neon_vcvtm_u16_v: case NEON::BI__builtin_neon_vcvtm_u32_v: case 
NEON::BI__builtin_neon_vcvtm_u64_v: - case NEON::BI__builtin_neon_vcvtmq_s16_v: case NEON::BI__builtin_neon_vcvtmq_s32_v: case NEON::BI__builtin_neon_vcvtmq_s64_v: - case NEON::BI__builtin_neon_vcvtmq_u16_v: case NEON::BI__builtin_neon_vcvtmq_u32_v: case NEON::BI__builtin_neon_vcvtmq_u64_v: { llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; @@ -6188,9 +6109,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[2] = EmitNeonSplat(Ops[2], cast(Ops[3])); return Builder.CreateCall(F, {Ops[2], Ops[1], Ops[0]}); } - case NEON::BI__builtin_neon_vfmah_lane_f16: case NEON::BI__builtin_neon_vfmas_lane_f32: - case NEON::BI__builtin_neon_vfmah_laneq_f16: case NEON::BI__builtin_neon_vfmas_laneq_f32: case NEON::BI__builtin_neon_vfmad_lane_f64: case NEON::BI__builtin_neon_vfmad_laneq_f64: { @@ -6365,25 +6284,18 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vcvt_u32_v: case NEON::BI__builtin_neon_vcvt_s64_v: case NEON::BI__builtin_neon_vcvt_u64_v: - case NEON::BI__builtin_neon_vcvt_s16_v: - case NEON::BI__builtin_neon_vcvt_u16_v: case NEON::BI__builtin_neon_vcvtq_s32_v: case NEON::BI__builtin_neon_vcvtq_u32_v: case NEON::BI__builtin_neon_vcvtq_s64_v: - case NEON::BI__builtin_neon_vcvtq_u64_v: - case NEON::BI__builtin_neon_vcvtq_s16_v: - case NEON::BI__builtin_neon_vcvtq_u16_v: { + case NEON::BI__builtin_neon_vcvtq_u64_v: { Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type)); if (usgn) return Builder.CreateFPToUI(Ops[0], Ty); return Builder.CreateFPToSI(Ops[0], Ty); } - case NEON::BI__builtin_neon_vcvta_s16_v: case NEON::BI__builtin_neon_vcvta_s32_v: - case NEON::BI__builtin_neon_vcvtaq_s16_v: case NEON::BI__builtin_neon_vcvtaq_s32_v: case NEON::BI__builtin_neon_vcvta_u32_v: - case NEON::BI__builtin_neon_vcvtaq_u16_v: case NEON::BI__builtin_neon_vcvtaq_u32_v: case NEON::BI__builtin_neon_vcvta_s64_v: case NEON::BI__builtin_neon_vcvtaq_s64_v: @@ -6393,13 +6305,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta"); } - case NEON::BI__builtin_neon_vcvtm_s16_v: case NEON::BI__builtin_neon_vcvtm_s32_v: - case NEON::BI__builtin_neon_vcvtmq_s16_v: case NEON::BI__builtin_neon_vcvtmq_s32_v: - case NEON::BI__builtin_neon_vcvtm_u16_v: case NEON::BI__builtin_neon_vcvtm_u32_v: - case NEON::BI__builtin_neon_vcvtmq_u16_v: case NEON::BI__builtin_neon_vcvtmq_u32_v: case NEON::BI__builtin_neon_vcvtm_s64_v: case NEON::BI__builtin_neon_vcvtmq_s64_v: @@ -6409,13 +6317,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm"); } - case NEON::BI__builtin_neon_vcvtn_s16_v: case NEON::BI__builtin_neon_vcvtn_s32_v: - case NEON::BI__builtin_neon_vcvtnq_s16_v: case NEON::BI__builtin_neon_vcvtnq_s32_v: - case NEON::BI__builtin_neon_vcvtn_u16_v: case NEON::BI__builtin_neon_vcvtn_u32_v: - case NEON::BI__builtin_neon_vcvtnq_u16_v: case NEON::BI__builtin_neon_vcvtnq_u32_v: case NEON::BI__builtin_neon_vcvtn_s64_v: case NEON::BI__builtin_neon_vcvtnq_s64_v: @@ -6425,13 +6329,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn"); } - case NEON::BI__builtin_neon_vcvtp_s16_v: case NEON::BI__builtin_neon_vcvtp_s32_v: - case 
NEON::BI__builtin_neon_vcvtpq_s16_v: case NEON::BI__builtin_neon_vcvtpq_s32_v: - case NEON::BI__builtin_neon_vcvtp_u16_v: case NEON::BI__builtin_neon_vcvtp_u32_v: - case NEON::BI__builtin_neon_vcvtpq_u16_v: case NEON::BI__builtin_neon_vcvtpq_u32_v: case NEON::BI__builtin_neon_vcvtp_s64_v: case NEON::BI__builtin_neon_vcvtpq_s64_v: @@ -6604,24 +6504,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); return Builder.CreateTrunc(Ops[0], Int16Ty); } - case NEON::BI__builtin_neon_vmaxv_f16: { - Int = Intrinsic::aarch64_neon_fmaxv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vmaxvq_f16: { - Int = Intrinsic::aarch64_neon_fmaxv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } case NEON::BI__builtin_neon_vminv_u8: { Int = Intrinsic::aarch64_neon_uminv; Ty = Int32Ty; @@ -6694,60 +6576,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); return Builder.CreateTrunc(Ops[0], Int16Ty); } - case NEON::BI__builtin_neon_vminv_f16: { - Int = Intrinsic::aarch64_neon_fminv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vminvq_f16: { - Int = Intrinsic::aarch64_neon_fminv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vmaxnmv_f16: { - Int = Intrinsic::aarch64_neon_fmaxnmv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vmaxnmvq_f16: { - Int = Intrinsic::aarch64_neon_fmaxnmv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vminnmv_f16: { - Int = Intrinsic::aarch64_neon_fminnmv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 4); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv"); - return Builder.CreateTrunc(Ops[0], HalfTy); - } - case NEON::BI__builtin_neon_vminnmvq_f16: { - Int = Intrinsic::aarch64_neon_fminnmv; - Ty = HalfTy; - VTy = llvm::VectorType::get(HalfTy, 8); - llvm::Type *Tys[2] = { Ty, VTy }; - Ops.push_back(EmitScalarExpr(E->getArg(0))); - Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv"); - return Builder.CreateTrunc(Ops[0], 
HalfTy); - } case NEON::BI__builtin_neon_vmul_n_f64: { Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy); @@ -7506,6 +7334,8 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, AVX512PF, AVX512VBMI, AVX512IFMA, + AVX5124VNNIW, // TODO implement this fully + AVX5124FMAPS, // TODO implement this fully AVX512VPOPCNTDQ, MAX }; @@ -8208,13 +8038,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__faststorefence: { return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, - llvm::CrossThread); + llvm::SyncScope::System); } case X86::BI_ReadWriteBarrier: case X86::BI_ReadBarrier: case X86::BI_WriteBarrier: { return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, - llvm::SingleThread); + llvm::SyncScope::SingleThread); } case X86::BI_BitScanForward: case X86::BI_BitScanForward64: diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp index 13a156c7bbd7..cee656a62fe7 100644 --- a/lib/CodeGen/CGCall.cpp +++ b/lib/CodeGen/CGCall.cpp @@ -129,7 +129,7 @@ static void addExtParameterInfosForCall( paramInfos.resize(totalArgs); } -/// Adds the formal paramaters in FPT to the given prefix. If any parameter in +/// Adds the formal parameters in FPT to the given prefix. If any parameter in /// FPT has pass_object_size attrs, then we'll add parameters for those, too. static void appendParameterTypes(const CodeGenTypes &CGT, SmallVectorImpl &prefix, diff --git a/lib/CodeGen/CGClass.cpp b/lib/CodeGen/CGClass.cpp index 127d7df348ee..50d702c62268 100644 --- a/lib/CodeGen/CGClass.cpp +++ b/lib/CodeGen/CGClass.cpp @@ -2716,88 +2716,6 @@ llvm::Value *CodeGenFunction::EmitVTableTypeCheckedLoad( cast(VTable->getType())->getElementType()); } -bool -CodeGenFunction::CanDevirtualizeMemberFunctionCall(const Expr *Base, - const CXXMethodDecl *MD) { - // When building with -fapple-kext, all calls must go through the vtable since - // the kernel linker can do runtime patching of vtables. - if (getLangOpts().AppleKext) - return false; - - // If the member function is marked 'final', we know that it can't be - // overridden and can therefore devirtualize it unless it's pure virtual. - if (MD->hasAttr()) - return !MD->isPure(); - - // If the base expression (after skipping derived-to-base conversions) is a - // class prvalue, then we can devirtualize. - Base = Base->getBestDynamicClassTypeExpr(); - if (Base->isRValue() && Base->getType()->isRecordType()) - return true; - - // If we don't even know what we would call, we can't devirtualize. - const CXXRecordDecl *BestDynamicDecl = Base->getBestDynamicClassType(); - if (!BestDynamicDecl) - return false; - - // There may be a method corresponding to MD in a derived class. - const CXXMethodDecl *DevirtualizedMethod = - MD->getCorrespondingMethodInClass(BestDynamicDecl); - - // If that method is pure virtual, we can't devirtualize. If this code is - // reached, the result would be UB, not a direct call to the derived class - // function, and we can't assume the derived class function is defined. - if (DevirtualizedMethod->isPure()) - return false; - - // If that method is marked final, we can devirtualize it. - if (DevirtualizedMethod->hasAttr()) - return true; - - // Similarly, if the class itself is marked 'final' it can't be overridden - // and we can therefore devirtualize the member function call. 
- if (BestDynamicDecl->hasAttr()) - return true; - - if (const DeclRefExpr *DRE = dyn_cast(Base)) { - if (const VarDecl *VD = dyn_cast(DRE->getDecl())) { - // This is a record decl. We know the type and can devirtualize it. - return VD->getType()->isRecordType(); - } - - return false; - } - - // We can devirtualize calls on an object accessed by a class member access - // expression, since by C++11 [basic.life]p6 we know that it can't refer to - // a derived class object constructed in the same location. However, we avoid - // devirtualizing a call to a template function that we could instantiate - // implicitly, but have not decided to do so. This is needed because if this - // function does not get instantiated, the devirtualization will create a - // direct call to a function whose body may not exist. In contrast, calls to - // template functions that are not defined in this TU are allowed to be - // devirtualized under assumption that it is user responsibility to - // instantiate them in some other TU. - if (const MemberExpr *ME = dyn_cast(Base)) - if (const ValueDecl *VD = dyn_cast(ME->getMemberDecl())) - return VD->getType()->isRecordType() && - (MD->instantiationIsPending() || MD->isDefined() || - !MD->isImplicitlyInstantiable()); - - // Likewise for calls on an object accessed by a (non-reference) pointer to - // member access. - if (auto *BO = dyn_cast(Base)) { - if (BO->isPtrMemOp()) { - auto *MPT = BO->getRHS()->getType()->castAs(); - if (MPT->getPointeeType()->isRecordType()) - return true; - } - } - - // We can't devirtualize the call. - return false; -} - void CodeGenFunction::EmitForwardingCallToLambda( const CXXMethodDecl *callOperator, CallArgList &callArgs) { diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp index 4b656ea4a879..23517867437c 100644 --- a/lib/CodeGen/CGDecl.cpp +++ b/lib/CodeGen/CGDecl.cpp @@ -221,8 +221,8 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl( Name = getStaticDeclName(*this, D); llvm::Type *LTy = getTypes().ConvertTypeForMem(Ty); - unsigned AddrSpace = - GetGlobalVarAddressSpace(&D, getContext().getTargetAddressSpace(Ty)); + unsigned AS = GetGlobalVarAddressSpace(&D); + unsigned TargetAS = getContext().getTargetAddressSpace(AS); // Local address space cannot have an initializer. llvm::Constant *Init = nullptr; @@ -231,12 +231,9 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl( else Init = llvm::UndefValue::get(LTy); - llvm::GlobalVariable *GV = - new llvm::GlobalVariable(getModule(), LTy, - Ty.isConstant(getContext()), Linkage, - Init, Name, nullptr, - llvm::GlobalVariable::NotThreadLocal, - AddrSpace); + llvm::GlobalVariable *GV = new llvm::GlobalVariable( + getModule(), LTy, Ty.isConstant(getContext()), Linkage, Init, Name, + nullptr, llvm::GlobalVariable::NotThreadLocal, TargetAS); GV->setAlignment(getContext().getDeclAlign(&D).getQuantity()); setGlobalVisibility(GV, &D); @@ -254,11 +251,12 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl( } // Make sure the result is of the correct type. 
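The block removed above is not lost: per the CGExprCXX hunk further down, the logic moves into CXXMethodDecl::getDevirtualizedMethod so it lives on the AST rather than in CodeGen. The decision procedure is worth restating; a condensed, hypothetical predicate with the same rule order (it omits the variable, member-access, and pointer-to-member cases for brevity):

// Condensed restatement of the devirtualization rules removed above
// (pseudotypes; the real code works on clang AST nodes).
struct Method {
  bool Final, Pure, ClassFinal;
  bool BaseIsPrvalue;    // base expression is a class prvalue
  bool KnownDynamicType; // best dynamic type of the base is known
};

bool canDevirtualize(const Method &M, bool AppleKext) {
  if (AppleKext)           // kext vtables may be patched at runtime
    return false;
  if (M.Final)             // 'final' method: direct call unless pure
    return !M.Pure;
  if (M.BaseIsPrvalue)     // temporaries have an exact dynamic type
    return true;
  if (!M.KnownDynamicType) // no idea what we would call
    return false;
  if (M.Pure)              // calling a pure virtual would be UB
    return false;
  return M.ClassFinal;     // 'final' class: no further overriders
}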
- unsigned ExpectedAddrSpace = getContext().getTargetAddressSpace(Ty); + unsigned ExpectedAS = Ty.getAddressSpace(); llvm::Constant *Addr = GV; - if (AddrSpace != ExpectedAddrSpace) { - llvm::PointerType *PTy = llvm::PointerType::get(LTy, ExpectedAddrSpace); - Addr = llvm::ConstantExpr::getAddrSpaceCast(GV, PTy); + if (AS != ExpectedAS) { + Addr = getTargetCodeGenInfo().performAddrSpaceCast( + *this, GV, AS, ExpectedAS, + LTy->getPointerTo(getContext().getTargetAddressSpace(ExpectedAS))); } setStaticLocalDeclAddress(&D, Addr); diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp index 2ee1c96a6619..9f40ee5a00a3 100644 --- a/lib/CodeGen/CGExpr.cpp +++ b/lib/CodeGen/CGExpr.cpp @@ -338,9 +338,10 @@ pushTemporaryCleanup(CodeGenFunction &CGF, const MaterializeTemporaryExpr *M, } } -static Address -createReferenceTemporary(CodeGenFunction &CGF, - const MaterializeTemporaryExpr *M, const Expr *Inner) { +static Address createReferenceTemporary(CodeGenFunction &CGF, + const MaterializeTemporaryExpr *M, + const Expr *Inner) { + auto &TCG = CGF.getTargetHooks(); switch (M->getStorageDuration()) { case SD_FullExpression: case SD_Automatic: { @@ -353,13 +354,24 @@ createReferenceTemporary(CodeGenFunction &CGF, (Ty->isArrayType() || Ty->isRecordType()) && CGF.CGM.isTypeConstant(Ty, true)) if (llvm::Constant *Init = CGF.CGM.EmitConstantExpr(Inner, Ty, &CGF)) { - auto *GV = new llvm::GlobalVariable( - CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true, - llvm::GlobalValue::PrivateLinkage, Init, ".ref.tmp"); - CharUnits alignment = CGF.getContext().getTypeAlignInChars(Ty); - GV->setAlignment(alignment.getQuantity()); - // FIXME: Should we put the new global into a COMDAT? - return Address(GV, alignment); + if (auto AddrSpace = CGF.getTarget().getConstantAddressSpace()) { + auto AS = AddrSpace.getValue(); + auto *GV = new llvm::GlobalVariable( + CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, Init, ".ref.tmp", nullptr, + llvm::GlobalValue::NotThreadLocal, + CGF.getContext().getTargetAddressSpace(AS)); + CharUnits alignment = CGF.getContext().getTypeAlignInChars(Ty); + GV->setAlignment(alignment.getQuantity()); + llvm::Constant *C = GV; + if (AS != LangAS::Default) + C = TCG.performAddrSpaceCast( + CGF.CGM, GV, AS, LangAS::Default, + GV->getValueType()->getPointerTo( + CGF.getContext().getTargetAddressSpace(LangAS::Default))); + // FIXME: Should we put the new global into a COMDAT? + return Address(C, alignment); + } } return CGF.CreateMemTemp(Ty, "ref.tmp"); } @@ -440,9 +452,11 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) { // Create and initialize the reference temporary. 
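The CGDecl and CGExpr hunks above share one recipe: allocate the global where the target prefers it (for example a constant address space), then, if that differs from the address space the AST expects, insert a pointer cast through the target hook instead of a raw addrspacecast. A condensed sketch against LLVM's C++ API of this era (not the patch's code; the real code compares AST address spaces and goes through TargetCodeGenInfo::performAddrSpaceCast, which for constants reduces to getPointerCast as the TargetInfo.cpp hunk below shows):

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Create a private constant global in a target "constant" address space
// and hand back a pointer usable in the default address space, as the
// CGExpr hunk does for promoted reference temporaries.
Constant *emitPromotedTemporary(Module &M, Constant *Init,
                                unsigned ConstantAS, unsigned DefaultAS) {
  auto *GV = new GlobalVariable(M, Init->getType(), /*isConstant=*/true,
                                GlobalValue::PrivateLinkage, Init, ".ref.tmp",
                                /*InsertBefore=*/nullptr,
                                GlobalValue::NotThreadLocal, ConstantAS);
  if (ConstantAS == DefaultAS)
    return GV;
  // getPointerCast picks bitcast or addrspacecast as appropriate.
  return ConstantExpr::getPointerCast(
      GV, GV->getValueType()->getPointerTo(DefaultAS));
}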
Address Object = createReferenceTemporary(*this, M, E); - if (auto *Var = dyn_cast(Object.getPointer())) { + if (auto *Var = dyn_cast( + Object.getPointer()->stripPointerCasts())) { Object = Address(llvm::ConstantExpr::getBitCast( - Var, ConvertTypeForMem(E->getType())->getPointerTo()), + cast(Object.getPointer()), + ConvertTypeForMem(E->getType())->getPointerTo()), Object.getAlignment()); // If the temporary is a global and has a constant initializer or is a // constant temporary that we promoted to a global, we may have already diff --git a/lib/CodeGen/CGExprCXX.cpp b/lib/CodeGen/CGExprCXX.cpp index a9865f3703d4..ab170245284c 100644 --- a/lib/CodeGen/CGExprCXX.cpp +++ b/lib/CodeGen/CGExprCXX.cpp @@ -199,7 +199,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( bool CanUseVirtualCall = MD->isVirtual() && !HasQualifier; const CXXMethodDecl *DevirtualizedMethod = nullptr; - if (CanUseVirtualCall && CanDevirtualizeMemberFunctionCall(Base, MD)) { + if (CanUseVirtualCall && + MD->getDevirtualizedMethod(Base, getLangOpts().AppleKext)) { const CXXRecordDecl *BestDynamicDecl = Base->getBestDynamicClassType(); DevirtualizedMethod = MD->getCorrespondingMethodInClass(BestDynamicDecl); assert(DevirtualizedMethod); diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp index 3df95a4e9b2a..a2ea0dec3e9d 100644 --- a/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4157,9 +4157,15 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, /*IsInitializer=*/true); enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 }; llvm::Value *TaskArgs[] = { - UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(), - UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()), - llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0), + UpLoc, + ThreadID, + Result.NewTask, + IfVal, + LBLVal.getPointer(), + UBLVal.getPointer(), + CGF.EmitLoadOfScalar(StLVal, SourceLocation()), + llvm::ConstantInt::getNullValue( + CGF.IntTy), // Always 0 because taskgroup emitted by the compiler llvm::ConstantInt::getSigned( CGF.IntTy, Data.Schedule.getPointer() ? Data.Schedule.getInt() ? NumTasks : Grainsize @@ -4168,10 +4174,9 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty, /*isSigned=*/false) : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0), - Result.TaskDupFn - ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn, - CGF.VoidPtrTy) - : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)}; + Result.TaskDupFn ? 
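The taskloop hunk above always passes 0 for the nogroup flag of __kmpc_taskloop because, as the companion CGStmtOpenMP hunk just below shows, the compiler now emits the implicit taskgroup itself unless nogroup was written in the source. In source-level terms the new lowering behaves roughly like this sketch (function pointers stand in for the CodeGen callbacks):

// Source-level picture of the new lowering (illustrative only):
//
//   #pragma omp taskloop          ->  #pragma omp taskgroup
//   for (...) body;                   { /* taskloop lowering */ }
//
//   #pragma omp taskloop nogroup  ->  /* taskloop lowering */
void lowerTaskLoop(bool Nogroup, void (*EmitTaskLoop)(),
                   void (*EmitTaskgroup)(void (*)())) {
  if (Nogroup)
    EmitTaskLoop();              // no enclosing taskgroup
  else
    EmitTaskgroup(EmitTaskLoop); // compiler-emitted taskgroup wraps it
}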
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + Result.TaskDupFn, CGF.VoidPtrTy) + : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs); } diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp index 493cd627e418..71797e2e6fbe 100644 --- a/lib/CodeGen/CGStmtOpenMP.cpp +++ b/lib/CodeGen/CGStmtOpenMP.cpp @@ -4363,7 +4363,18 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) { CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_taskloop, CodeGen); }; - EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data); + if (Data.Nogroup) + EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data); + else { + CGM.getOpenMPRuntime().emitTaskgroupRegion( + *this, + [&S, &BodyGen, &TaskGen, &Data](CodeGenFunction &CGF, + PrePostActionTy &Action) { + Action.Enter(CGF); + CGF.EmitOMPTaskBasedDirective(S, BodyGen, TaskGen, Data); + }, + S.getLocStart()); + } } void CodeGenFunction::EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S) { diff --git a/lib/CodeGen/CodeGenABITypes.cpp b/lib/CodeGen/CodeGenABITypes.cpp index 166f44f816f3..0735a9c3dfbc 100644 --- a/lib/CodeGen/CodeGenABITypes.cpp +++ b/lib/CodeGen/CodeGenABITypes.cpp @@ -64,3 +64,19 @@ CodeGen::arrangeFreeFunctionCall(CodeGenModule &CGM, returnType, /*IsInstanceMethod=*/false, /*IsChainCall=*/false, argTypes, info, {}, args); } + +llvm::FunctionType * +CodeGen::convertFreeFunctionType(CodeGenModule &CGM, const FunctionDecl *FD) { + assert(FD != nullptr && "Expected a non-null function declaration!"); + llvm::Type *T = CGM.getTypes().ConvertFunctionType(FD->getType(), FD); + + if (auto FT = dyn_cast(T)) + return FT; + + return nullptr; +} + +llvm::Type * +CodeGen::convertTypeForMemory(CodeGenModule &CGM, QualType T) { + return CGM.getTypes().ConvertTypeForMem(T); +} diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h index 6785111bd052..5933e029be8d 100644 --- a/lib/CodeGen/CodeGenFunction.h +++ b/lib/CodeGen/CodeGenFunction.h @@ -1479,6 +1479,9 @@ class CodeGenFunction : public CodeGenTypeCache { const TargetInfo &getTarget() const { return Target; } llvm::LLVMContext &getLLVMContext() { return CGM.getLLVMContext(); } + const TargetCodeGenInfo &getTargetHooks() const { + return CGM.getTargetCodeGenInfo(); + } //===--------------------------------------------------------------------===// // Cleanups @@ -1749,11 +1752,6 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitVTableTypeCheckedLoad(const CXXRecordDecl *RD, llvm::Value *VTable, uint64_t VTableByteOffset); - /// CanDevirtualizeMemberFunctionCalls - Checks whether virtual calls on given - /// expr can be devirtualized. - bool CanDevirtualizeMemberFunctionCall(const Expr *Base, - const CXXMethodDecl *MD); - /// EnterDtorCleanups - Enter the cleanups necessary to complete the /// given phase of destruction for a destructor. 
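CodeGenABITypes grows two small entry points above for clients that want LLVM types for AST entities without running full IRGen. Note that convertFreeFunctionType can return null: ConvertFunctionType can produce a placeholder, non-function type in some recursive cases, hence the dyn_cast. A hedged usage sketch (CGM is assumed to be a fully configured CodeGenModule; setup is omitted):

#include "clang/CodeGen/CodeGenABITypes.h"
#include "llvm/IR/DerivedTypes.h"

// Ask CodeGen for the LLVM signature of a function declaration,
// tolerating the documented null result.
llvm::FunctionType *signatureOf(clang::CodeGen::CodeGenModule &CGM,
                                const clang::FunctionDecl *FD) {
  if (llvm::FunctionType *FT =
          clang::CodeGen::convertFreeFunctionType(CGM, FD))
    return FT;
  return nullptr; // a placeholder type came back; the caller must cope
}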
The end result /// should call destructors on members and base classes in reverse @@ -3820,10 +3818,6 @@ class CodeGenFunction : public CodeGenTypeCache { private: QualType getVarArgType(const Expr *Arg); - const TargetCodeGenInfo &getTargetHooks() const { - return CGM.getTargetCodeGenInfo(); - } - void EmitDeclMetadata(); BlockByrefHelpers *buildByrefHelpers(llvm::StructType &byrefType, diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp index 5319ccec163f..4b15b8ac4c71 100644 --- a/lib/CodeGen/CodeGenModule.cpp +++ b/lib/CodeGen/CodeGenModule.cpp @@ -98,7 +98,6 @@ CodeGenModule::CodeGenModule(ASTContext &C, const HeaderSearchOptions &HSO, Int16Ty = llvm::Type::getInt16Ty(LLVMContext); Int32Ty = llvm::Type::getInt32Ty(LLVMContext); Int64Ty = llvm::Type::getInt64Ty(LLVMContext); - HalfTy = llvm::Type::getHalfTy(LLVMContext); FloatTy = llvm::Type::getFloatTy(LLVMContext); DoubleTy = llvm::Type::getDoubleTy(LLVMContext); PointerWidthInBits = C.getTargetInfo().getPointerWidth(0); @@ -1099,7 +1098,7 @@ static void setLinkageAndVisibilityForGV(llvm::GlobalValue *GV, const NamedDecl *ND) { // Set linkage and visibility in case we never see a definition. LinkageInfo LV = ND->getLinkageAndVisibility(); - if (LV.getLinkage() != ExternalLinkage) { + if (!isExternallyVisible(LV.getLinkage())) { // Don't set internal linkage on declarations. } else { if (ND->hasAttr()) { @@ -2368,11 +2367,13 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, return llvm::ConstantExpr::getBitCast(Entry, Ty); } - unsigned AddrSpace = GetGlobalVarAddressSpace(D, Ty->getAddressSpace()); + auto AddrSpace = GetGlobalVarAddressSpace(D); + auto TargetAddrSpace = getContext().getTargetAddressSpace(AddrSpace); + auto *GV = new llvm::GlobalVariable( getModule(), Ty->getElementType(), false, llvm::GlobalValue::ExternalLinkage, nullptr, MangledName, nullptr, - llvm::GlobalVariable::NotThreadLocal, AddrSpace); + llvm::GlobalVariable::NotThreadLocal, TargetAddrSpace); // If we already created a global with the same mangled name (but different // type) before, take its name and remove it from its parent. @@ -2429,8 +2430,15 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, GV->setSection(".cp.rodata"); } - if (AddrSpace != Ty->getAddressSpace()) - return llvm::ConstantExpr::getAddrSpaceCast(GV, Ty); + auto ExpectedAS = + D ? D->getType().getAddressSpace() + : static_cast(LangOpts.OpenCL ? LangAS::opencl_global + : LangAS::Default); + assert(getContext().getTargetAddressSpace(ExpectedAS) == + Ty->getPointerAddressSpace()); + if (AddrSpace != ExpectedAS) + return getTargetCodeGenInfo().performAddrSpaceCast(*this, GV, AddrSpace, + ExpectedAS, Ty); return GV; } @@ -2564,18 +2572,28 @@ CharUnits CodeGenModule::GetTargetTypeStoreSize(llvm::Type *Ty) const { getDataLayout().getTypeStoreSizeInBits(Ty)); } -unsigned CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D, - unsigned AddrSpace) { - if (D && LangOpts.CUDA && LangOpts.CUDAIsDevice) { - if (D->hasAttr()) - AddrSpace = getContext().getTargetAddressSpace(LangAS::cuda_constant); - else if (D->hasAttr()) - AddrSpace = getContext().getTargetAddressSpace(LangAS::cuda_shared); - else - AddrSpace = getContext().getTargetAddressSpace(LangAS::cuda_device); +unsigned CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) { + unsigned AddrSpace; + if (LangOpts.OpenCL) { + AddrSpace = D ? 
D->getType().getAddressSpace() + : static_cast(LangAS::opencl_global); + assert(AddrSpace == LangAS::opencl_global || + AddrSpace == LangAS::opencl_constant || + AddrSpace == LangAS::opencl_local || + AddrSpace >= LangAS::FirstTargetAddressSpace); + return AddrSpace; } - return AddrSpace; + if (LangOpts.CUDA && LangOpts.CUDAIsDevice) { + if (D && D->hasAttr()) + return LangAS::cuda_constant; + else if (D && D->hasAttr()) + return LangAS::cuda_shared; + else + return LangAS::cuda_device; + } + + return getTargetCodeGenInfo().getGlobalVarAddressSpace(*this, D); } template @@ -2728,10 +2746,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, // "extern int x[];") and then a definition of a different type (e.g. // "int x[10];"). This also happens when an initializer has a different type // from the type of the global (this happens with unions). - if (!GV || - GV->getType()->getElementType() != InitType || + if (!GV || GV->getType()->getElementType() != InitType || GV->getType()->getAddressSpace() != - GetGlobalVarAddressSpace(D, getContext().getTargetAddressSpace(ASTTy))) { + getContext().getTargetAddressSpace(GetGlobalVarAddressSpace(D))) { // Move the old entry aside so that we'll create a new one. Entry->setName(StringRef()); @@ -3740,20 +3757,26 @@ ConstantAddress CodeGenModule::GetAddrOfGlobalTemporary( Linkage = llvm::GlobalVariable::InternalLinkage; } } - unsigned AddrSpace = GetGlobalVarAddressSpace( - VD, getContext().getTargetAddressSpace(MaterializedType)); + unsigned AddrSpace = + VD ? GetGlobalVarAddressSpace(VD) : MaterializedType.getAddressSpace(); + auto TargetAS = getContext().getTargetAddressSpace(AddrSpace); auto *GV = new llvm::GlobalVariable( getModule(), Type, Constant, Linkage, InitialValue, Name.c_str(), - /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, - AddrSpace); + /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, TargetAS); setGlobalVisibility(GV, VD); GV->setAlignment(Align.getQuantity()); if (supportsCOMDAT() && GV->isWeakForLinker()) GV->setComdat(TheModule.getOrInsertComdat(GV->getName())); if (VD->getTLSKind()) setTLSMode(GV, *VD); - MaterializedGlobalTemporaryMap[E] = GV; - return ConstantAddress(GV, Align); + llvm::Constant *CV = GV; + if (AddrSpace != LangAS::Default) + CV = getTargetCodeGenInfo().performAddrSpaceCast( + *this, GV, AddrSpace, LangAS::Default, + Type->getPointerTo( + getContext().getTargetAddressSpace(LangAS::Default))); + MaterializedGlobalTemporaryMap[E] = CV; + return ConstantAddress(CV, Align); } /// EmitObjCPropertyImplementations - Emit information for synthesized diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h index c5f1a2b409ee..b162e72d1992 100644 --- a/lib/CodeGen/CodeGenModule.h +++ b/lib/CodeGen/CodeGenModule.h @@ -710,11 +710,15 @@ class CodeGenModule : public CodeGenTypeCache { SourceLocation Loc = SourceLocation(), bool TLS = false); - /// Return the address space of the underlying global variable for D, as + /// Return the AST address space of the underlying global variable for D, as /// determined by its declaration. Normally this is the same as the address /// space of D's type, but in CUDA, address spaces are associated with - /// declarations, not types. - unsigned GetGlobalVarAddressSpace(const VarDecl *D, unsigned AddrSpace); + /// declarations, not types. If D is nullptr, return the default address + /// space for global variable. 
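The rewritten GetGlobalVarAddressSpace above now answers in AST address-space terms and concentrates the language policy in one place: OpenCL reads the address space off the declaration (defaulting to opencl_global), CUDA device code maps attributes to the CUDA spaces, and everything else defers to the target hook. A condensed decision sketch (simplified inputs, not the patch's signatures):

enum AS { Default, OpenCLGlobal, CUDADevice, CUDAConstant, CUDAShared,
          TargetChosen };

struct DeclInfo { bool HasConstantAttr, HasSharedAttr; AS DeclaredAS; };

// Language first, then declaration, then target preference for
// address-space-agnostic languages.
AS globalVarAddressSpace(bool IsOpenCL, bool IsCUDADevice,
                         const DeclInfo *D) {
  if (IsOpenCL)
    return D ? D->DeclaredAS : OpenCLGlobal; // the decl carries the AS
  if (IsCUDADevice) {
    if (D && D->HasConstantAttr) return CUDAConstant;
    if (D && D->HasSharedAttr)   return CUDAShared;
    return CUDADevice;
  }
  // Defer to getTargetCodeGenInfo().getGlobalVarAddressSpace(...) here.
  return TargetChosen;
}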
+ /// + /// For languages without explicit address spaces, if D has default address + /// space, target-specific global or constant address space may be returned. + unsigned GetGlobalVarAddressSpace(const VarDecl *D); /// Return the llvm::Constant for the address of the given global variable. /// If Ty is non-null and if the global doesn't exist, then it will be created diff --git a/lib/CodeGen/CodeGenTypeCache.h b/lib/CodeGen/CodeGenTypeCache.h index 6910d36733dc..450eab48a3b4 100644 --- a/lib/CodeGen/CodeGenTypeCache.h +++ b/lib/CodeGen/CodeGenTypeCache.h @@ -36,7 +36,7 @@ struct CodeGenTypeCache { /// i8, i16, i32, and i64 llvm::IntegerType *Int8Ty, *Int16Ty, *Int32Ty, *Int64Ty; /// float, double - llvm::Type *HalfTy, *FloatTy, *DoubleTy; + llvm::Type *FloatTy, *DoubleTy; /// int llvm::IntegerType *IntTy; diff --git a/lib/CodeGen/ItaniumCXXABI.cpp b/lib/CodeGen/ItaniumCXXABI.cpp index 39efb9f43921..c82b9677eacf 100644 --- a/lib/CodeGen/ItaniumCXXABI.cpp +++ b/lib/CodeGen/ItaniumCXXABI.cpp @@ -2732,7 +2732,9 @@ static bool ShouldUseExternalRTTIDescriptor(CodeGenModule &CGM, // function. bool IsDLLImport = RD->hasAttr(); if (CGM.getVTables().isVTableExternal(RD)) - return IsDLLImport ? false : true; + return IsDLLImport && !CGM.getTriple().isWindowsItaniumEnvironment() + ? false + : true; if (IsDLLImport) return true; @@ -2957,6 +2959,8 @@ static llvm::GlobalVariable::LinkageTypes getTypeInfoLinkage(CodeGenModule &CGM, return llvm::GlobalValue::InternalLinkage; case VisibleNoLinkage: + case ModuleInternalLinkage: + case ModuleLinkage: case ExternalLinkage: // RTTI is not enabled, which means that this type info struct is going // to be used for exception handling. Give it linkonce_odr linkage. @@ -2968,7 +2972,8 @@ static llvm::GlobalVariable::LinkageTypes getTypeInfoLinkage(CodeGenModule &CGM, if (RD->hasAttr()) return llvm::GlobalValue::WeakODRLinkage; if (CGM.getTriple().isWindowsItaniumEnvironment()) - if (RD->hasAttr()) + if (RD->hasAttr() && + ShouldUseExternalRTTIDescriptor(CGM, Ty)) return llvm::GlobalValue::ExternalLinkage; if (RD->isDynamicClass()) { llvm::GlobalValue::LinkageTypes LT = CGM.getVTableLinkage(RD); @@ -3181,7 +3186,8 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(QualType Ty, bool Force, if (DLLExport || (RD && RD->hasAttr())) { TypeName->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); GV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); - } else if (CGM.getLangOpts().RTTI && RD && RD->hasAttr()) { + } else if (RD && RD->hasAttr() && + ShouldUseExternalRTTIDescriptor(CGM, Ty)) { TypeName->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass); GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass); diff --git a/lib/CodeGen/MicrosoftCXXABI.cpp b/lib/CodeGen/MicrosoftCXXABI.cpp index e68a16e0bd51..78b510bb4665 100644 --- a/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/lib/CodeGen/MicrosoftCXXABI.cpp @@ -3425,6 +3425,8 @@ static llvm::GlobalValue::LinkageTypes getLinkageForRTTI(QualType Ty) { return llvm::GlobalValue::InternalLinkage; case VisibleNoLinkage: + case ModuleInternalLinkage: + case ModuleLinkage: case ExternalLinkage: return llvm::GlobalValue::LinkOnceODRLinkage; } diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp index 8d00e055306d..eeebd60a2d20 100644 --- a/lib/CodeGen/TargetInfo.cpp +++ b/lib/CodeGen/TargetInfo.cpp @@ -416,14 +416,34 @@ llvm::Constant *TargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule & return llvm::ConstantPointerNull::get(T); } +unsigned 
TargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const { + assert(!CGM.getLangOpts().OpenCL && + !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) && + "Address space agnostic languages only"); + return D ? D->getType().getAddressSpace() + : static_cast(LangAS::Default); +} + llvm::Value *TargetCodeGenInfo::performAddrSpaceCast( CodeGen::CodeGenFunction &CGF, llvm::Value *Src, unsigned SrcAddr, unsigned DestAddr, llvm::Type *DestTy, bool isNonNull) const { // Since target may map different address spaces in AST to the same address // space, an address space conversion may end up as a bitcast. + if (auto *C = dyn_cast(Src)) + return performAddrSpaceCast(CGF.CGM, C, SrcAddr, DestAddr, DestTy); return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Src, DestTy); } +llvm::Constant * +TargetCodeGenInfo::performAddrSpaceCast(CodeGenModule &CGM, llvm::Constant *Src, + unsigned SrcAddr, unsigned DestAddr, + llvm::Type *DestTy) const { + // Since target may map different address spaces in AST to the same address + // space, an address space conversion may end up as a bitcast. + return llvm::ConstantExpr::getPointerCast(Src, DestTy); +} + static bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays); /// isEmptyField - Return true iff a the field is "empty", that is it @@ -7325,6 +7345,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { return LangAS::FirstTargetAddressSpace + getABIInfo().getDataLayout().getAllocaAddrSpace(); } + unsigned getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const override; }; } @@ -7408,6 +7430,31 @@ llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer( llvm::ConstantPointerNull::get(NPT), PT); } +unsigned +AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const { + assert(!CGM.getLangOpts().OpenCL && + !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) && + "Address space agnostic languages only"); + unsigned DefaultGlobalAS = + LangAS::FirstTargetAddressSpace + + CGM.getContext().getTargetAddressSpace(LangAS::opencl_global); + if (!D) + return DefaultGlobalAS; + + unsigned AddrSpace = D->getType().getAddressSpace(); + assert(AddrSpace == LangAS::Default || + AddrSpace >= LangAS::FirstTargetAddressSpace); + if (AddrSpace != LangAS::Default) + return AddrSpace; + + if (CGM.isTypeConstant(D->getType(), false)) { + if (auto ConstAS = CGM.getTarget().getConstantAddressSpace()) + return ConstAS.getValue(); + } + return DefaultGlobalAS; +} + //===----------------------------------------------------------------------===// // SPARC v8 ABI Implementation. // Based on the SPARC Compliance Definition version 2.4.1. diff --git a/lib/CodeGen/TargetInfo.h b/lib/CodeGen/TargetInfo.h index 247d01dcb086..952ef96c4aef 100644 --- a/lib/CodeGen/TargetInfo.h +++ b/lib/CodeGen/TargetInfo.h @@ -229,6 +229,13 @@ class TargetCodeGenInfo { virtual llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const; + /// Get target favored AST address space of a global variable for languages + /// other than OpenCL and CUDA. + /// If \p D is nullptr, returns the default target favored address space + /// for global variable. + virtual unsigned getGlobalVarAddressSpace(CodeGenModule &CGM, + const VarDecl *D) const; + /// Get the AST address space for alloca. 
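The AMDGPU override above is the first user of the new hook: unqualified globals land in the target's global address space, but a global whose type is provably constant is promoted into the constant address space when the target reports one via getConstantAddressSpace. A condensed sketch of that preference order (hypothetical helper names):

// Preference order implemented by the AMDGPU override above:
// explicit qualifier > constant promotion > plain global.
struct VarInfo {
  bool HasExplicitAS;
  unsigned ExplicitAS;
  bool TypeIsConstant; // const-qualified, trivially immutable data
};

unsigned amdgpuGlobalAS(const VarInfo *D, unsigned GlobalAS,
                        int ConstantAS /* -1 if the target has none */) {
  if (!D)
    return GlobalAS;             // default for unknown declarations
  if (D->HasExplicitAS)
    return D->ExplicitAS;        // the user said where it lives
  if (D->TypeIsConstant && ConstantAS >= 0)
    return (unsigned)ConstantAS; // promote read-only data
  return GlobalAS;
}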
   /// Get the AST address space for alloca.
   virtual unsigned getASTAllocaAddressSpace() const { return LangAS::Default; }

@@ -243,6 +250,15 @@ class TargetCodeGenInfo {
                                             unsigned DestAddr,
                                             llvm::Type *DestTy,
                                             bool IsNonNull = false) const;
+
+  /// Perform an address space cast of a constant expression of pointer type.
+  /// \param V is the LLVM constant to be cast to another address space.
+  /// \param SrcAddr is the language address space of \p V.
+  /// \param DestAddr is the targeted language address space.
+  /// \param DestTy is the destination LLVM pointer type.
+  virtual llvm::Constant *
+  performAddrSpaceCast(CodeGenModule &CGM, llvm::Constant *V, unsigned SrcAddr,
+                       unsigned DestAddr, llvm::Type *DestTy) const;
 };

 } // namespace CodeGen
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index faced0697ed9..42478013ccec 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -572,8 +572,22 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
         if (TT.getArch() == llvm::Triple::UnknownArch)
           Diag(clang::diag::err_drv_invalid_omp_target) << Val;
         else {
-          const ToolChain &TC = getToolChain(C.getInputArgs(), TT);
-          C.addOffloadDeviceToolChain(&TC, Action::OFK_OpenMP);
+          const ToolChain *TC;
+          // CUDA toolchains have to be selected differently. They pair host
+          // and device in their implementation.
+          if (TT.isNVPTX()) {
+            const ToolChain *HostTC =
+                C.getSingleOffloadToolChain<Action::OFK_Host>();
+            assert(HostTC && "Host toolchain should be always defined.");
+            auto &CudaTC =
+                ToolChains[TT.str() + "/" + HostTC->getTriple().str()];
+            if (!CudaTC)
+              CudaTC = llvm::make_unique<toolchains::CudaToolChain>(
+                  *this, TT, *HostTC, C.getInputArgs());
+            TC = CudaTC.get();
+          } else
+            TC = &getToolChain(C.getInputArgs(), TT);
+          C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP);
         }
       }
     } else
@@ -1247,11 +1261,20 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     StringRef PassedFlags = A->getValue();
     std::vector<std::string> SuggestedCompletions;

+    unsigned short DisableFlags =
+        options::NoDriverOption | options::Unsupported | options::Ignored;
+    // We want to show cc1-only options only when clang is invoked with
+    // "clang -cc1". In that case, "#" is prepended to the --autocomplete
+    // argument so the driver can tell whether it has been asked to show
+    // cc1-only options.
+    if (PassedFlags[0] == '#') {
+      DisableFlags &= ~options::NoDriverOption;
+      PassedFlags = PassedFlags.substr(1);
+    }
+
     if (PassedFlags.find(',') == StringRef::npos) {
       // If the flag is in the form of "--autocomplete=-foo",
       // we were requested to print out all option names that start with "-foo".
       // For example, "--autocomplete=-fsyn" is expanded to "-fsyntax-only".
-      SuggestedCompletions = Opts->findByPrefix(PassedFlags);
+      SuggestedCompletions = Opts->findByPrefix(PassedFlags, DisableFlags);
     } else {
       // If the flag is in the form of "--autocomplete=foo,bar", we were
       // requested to print out all option values for "-foo" that start with
diff --git a/lib/Driver/ToolChain.cpp b/lib/Driver/ToolChain.cpp
index 4f82503276f4..2be7f0f69004 100644
--- a/lib/Driver/ToolChain.cpp
+++ b/lib/Driver/ToolChain.cpp
@@ -544,9 +544,9 @@ void ToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   // Each toolchain should provide the appropriate include flags.
} -void ToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { -} +void ToolChain::addClangTargetOptions( + const ArgList &DriverArgs, ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const {} void ToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {} diff --git a/lib/Driver/ToolChains/Arch/ARM.cpp b/lib/Driver/ToolChains/Arch/ARM.cpp index 8cafd3c74bfb..95b86f784f91 100644 --- a/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/lib/Driver/ToolChains/Arch/ARM.cpp @@ -179,6 +179,18 @@ arm::FloatABI arm::getARMFloatABI(const ToolChain &TC, const ArgList &Args) { ABI = FloatABI::Hard; break; + case llvm::Triple::NetBSD: + switch (Triple.getEnvironment()) { + case llvm::Triple::EABIHF: + case llvm::Triple::GNUEABIHF: + ABI = FloatABI::Hard; + break; + default: + ABI = FloatABI::Soft; + break; + } + break; + case llvm::Triple::FreeBSD: switch (Triple.getEnvironment()) { case llvm::Triple::GNUEABIHF: diff --git a/lib/Driver/ToolChains/Arch/Mips.cpp b/lib/Driver/ToolChains/Arch/Mips.cpp index 33eff32b9731..b45dcd6db678 100644 --- a/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/lib/Driver/ToolChains/Arch/Mips.cpp @@ -227,12 +227,11 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple, O.matches(options::OPT_fno_PIE) || O.matches(options::OPT_fno_pie)); } - if (IsN64 && NonPIC) { + if (IsN64 && NonPIC) Features.push_back("+noabicalls"); - } else { + else AddTargetFeature(Args, Features, options::OPT_mno_abicalls, options::OPT_mabicalls, "noabicalls"); - } mips::FloatABI FloatABI = mips::getMipsFloatABI(D, Args); if (FloatABI == mips::FloatABI::Soft) { @@ -298,13 +297,11 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple, AddTargetFeature(Args, Features, options::OPT_mno_odd_spreg, options::OPT_modd_spreg, "nooddspreg"); - - if (Arg *A = Args.getLastArg(options::OPT_mmadd4, options::OPT_mno_madd4)) { - if (A->getOption().matches(options::OPT_mmadd4)) - Features.push_back("-nomadd4"); - else - Features.push_back("+nomadd4"); - } + AddTargetFeature(Args, Features, options::OPT_mno_madd4, options::OPT_mmadd4, + "nomadd4"); + AddTargetFeature(Args, Features, options::OPT_mlong_calls, + options::OPT_mno_long_calls, "long-calls"); + AddTargetFeature(Args, Features, options::OPT_mmt, options::OPT_mno_mt,"mt"); } mips::NanEncoding mips::getSupportedNanEncoding(StringRef &CPU) { diff --git a/lib/Driver/ToolChains/BareMetal.cpp b/lib/Driver/ToolChains/BareMetal.cpp index 66246f6d71cd..5dc6dfad927b 100644 --- a/lib/Driver/ToolChains/BareMetal.cpp +++ b/lib/Driver/ToolChains/BareMetal.cpp @@ -98,7 +98,8 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs, } void BareMetal::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { CC1Args.push_back("-nostdsysteminc"); } diff --git a/lib/Driver/ToolChains/BareMetal.h b/lib/Driver/ToolChains/BareMetal.h index 064c1199735b..4b74899fa53e 100644 --- a/lib/Driver/ToolChains/BareMetal.h +++ b/lib/Driver/ToolChains/BareMetal.h @@ -54,7 +54,8 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; std::string 
findLibCxxIncludePath(ToolChain::CXXStdlibType LibType) const; void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp index 3731aa83ef06..e759e3ec619a 100644 --- a/lib/Driver/ToolChains/Clang.cpp +++ b/lib/Driver/ToolChains/Clang.cpp @@ -2544,7 +2544,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, AsynchronousUnwindTables)) CmdArgs.push_back("-munwind-tables"); - getToolChain().addClangTargetOptions(Args, CmdArgs); + getToolChain().addClangTargetOptions(Args, CmdArgs, + JA.getOffloadingDeviceKind()); if (Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); @@ -2974,6 +2975,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, A->claim(); } + for (const Arg *A : + Args.filtered(options::OPT_clang_ignored_legacy_options_Group)) { + D.Diag(diag::warn_ignored_clang_option) << A->getAsString(Args); + A->claim(); + } + claimNoWarnArgs(Args); Args.AddAllArgs(CmdArgs, options::OPT_R_Group); diff --git a/lib/Driver/ToolChains/CommonArgs.cpp b/lib/Driver/ToolChains/CommonArgs.cpp index e8bb703054de..00bd60bc24bb 100644 --- a/lib/Driver/ToolChains/CommonArgs.cpp +++ b/lib/Driver/ToolChains/CommonArgs.cpp @@ -524,6 +524,7 @@ void tools::linkSanitizerRuntimeDeps(const ToolChain &TC, CmdArgs.push_back("-lm"); // There's no libdl on FreeBSD or RTEMS. if (TC.getTriple().getOS() != llvm::Triple::FreeBSD && + TC.getTriple().getOS() != llvm::Triple::NetBSD && TC.getTriple().getOS() != llvm::Triple::RTEMS) CmdArgs.push_back("-ldl"); } diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp index 42bf164f1b3f..935a5a37ada5 100644 --- a/lib/Driver/ToolChains/Cuda.cpp +++ b/lib/Driver/ToolChains/Cuda.cpp @@ -338,24 +338,31 @@ CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, void CudaToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { - HostTC.addClangTargetOptions(DriverArgs, CC1Args); - - CC1Args.push_back("-fcuda-is-device"); - - if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, - options::OPT_fno_cuda_flush_denormals_to_zero, false)) - CC1Args.push_back("-fcuda-flush-denormals-to-zero"); - - if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, - options::OPT_fno_cuda_approx_transcendentals, false)) - CC1Args.push_back("-fcuda-approx-transcendentals"); - - if (DriverArgs.hasArg(options::OPT_nocudalib)) - return; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const { + HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); assert(!GpuArch.empty() && "Must have an explicit GPU arch."); + assert((DeviceOffloadingKind == Action::OFK_OpenMP || + DeviceOffloadingKind == Action::OFK_Cuda) && + "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); + + if (DeviceOffloadingKind == Action::OFK_Cuda) { + CC1Args.push_back("-fcuda-is-device"); + + if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, + options::OPT_fno_cuda_flush_denormals_to_zero, false)) + CC1Args.push_back("-fcuda-flush-denormals-to-zero"); + + if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, + options::OPT_fno_cuda_approx_transcendentals, false)) + CC1Args.push_back("-fcuda-approx-transcendentals"); + + if (DriverArgs.hasArg(options::OPT_nocudalib)) + return; + } 
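One note on the OpenMP argument-translation hunk below: the nested loop is a linear duplicate check on Arg pointer identity. An equivalent, more idiomatic formulation (a sketch, assuming llvm/ADT/STLExtras.h; behavior unchanged) would be:

    // Append only arguments not already present in the derived list.
    for (Arg *A : Args)
      if (!llvm::is_contained(*DAL, A)) // same pointer-identity test as below
        DAL->append(A);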
+ std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); if (LibDeviceFile.empty()) { @@ -396,6 +403,24 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, const OptTable &Opts = getDriver().getOpts(); + // For OpenMP device offloading, append derived arguments. Make sure + // flags are not duplicated. + // TODO: Append the compute capability. + if (DeviceOffloadKind == Action::OFK_OpenMP) { + for (Arg *A : Args){ + bool IsDuplicate = false; + for (Arg *DALArg : *DAL){ + if (A == DALArg) { + IsDuplicate = true; + break; + } + } + if (!IsDuplicate) + DAL->append(A); + } + return DAL; + } + for (Arg *A : Args) { if (A->getOption().matches(options::OPT_Xarch__)) { // Skip this argument unless the architecture matches BoundArch diff --git a/lib/Driver/ToolChains/Cuda.h b/lib/Driver/ToolChains/Cuda.h index acdb4c4efd6d..e66fc23d82f3 100644 --- a/lib/Driver/ToolChains/Cuda.h +++ b/lib/Driver/ToolChains/Cuda.h @@ -130,7 +130,8 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain { TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch, Action::OffloadKind DeviceOffloadKind) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; // Never try to use the integrated assembler with CUDA; always fork out to // ptxas. diff --git a/lib/Driver/ToolChains/Darwin.cpp b/lib/Driver/ToolChains/Darwin.cpp index 7b61095c3ba9..ba1a5ee95594 100644 --- a/lib/Driver/ToolChains/Darwin.cpp +++ b/lib/Driver/ToolChains/Darwin.cpp @@ -1118,6 +1118,27 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args, } } +/// Returns the most appropriate macOS target version for the current process. +/// +/// If the macOS SDK version is the same or earlier than the system version, +/// then the SDK version is returned. Otherwise the system version is returned. 
+static std::string getSystemOrSDKMacOSVersion(StringRef MacOSSDKVersion) { + unsigned Major, Minor, Micro; + llvm::Triple SystemTriple(llvm::sys::getProcessTriple()); + if (!SystemTriple.isMacOSX()) + return MacOSSDKVersion; + SystemTriple.getMacOSXVersion(Major, Minor, Micro); + VersionTuple SystemVersion(Major, Minor, Micro); + bool HadExtra; + if (!Driver::GetReleaseVersion(MacOSSDKVersion, Major, Minor, Micro, + HadExtra)) + return MacOSSDKVersion; + VersionTuple SDKVersion(Major, Minor, Micro); + if (SDKVersion > SystemVersion) + return SystemVersion.getAsString(); + return MacOSSDKVersion; +} + void Darwin::AddDeploymentTarget(DerivedArgList &Args) const { const OptTable &Opts = getDriver().getOpts(); @@ -1229,7 +1250,7 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const { SDK.startswith("iPhoneSimulator")) iOSTarget = Version; else if (SDK.startswith("MacOSX")) - OSXTarget = Version; + OSXTarget = getSystemOrSDKMacOSVersion(Version); else if (SDK.startswith("WatchOS") || SDK.startswith("WatchSimulator")) WatchOSTarget = Version; @@ -1708,7 +1729,8 @@ bool Darwin::isAlignedAllocationUnavailable() const { } void Darwin::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const { if (isAlignedAllocationUnavailable()) CC1Args.push_back("-faligned-alloc-unavailable"); } diff --git a/lib/Driver/ToolChains/Darwin.h b/lib/Driver/ToolChains/Darwin.h index ffcdf9a71a46..6cb1d04b78c0 100644 --- a/lib/Driver/ToolChains/Darwin.h +++ b/lib/Driver/ToolChains/Darwin.h @@ -390,7 +390,8 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public MachO { bool isAlignedAllocationUnavailable() const; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; StringRef getPlatformFamily() const; static StringRef getSDKName(StringRef isysroot); diff --git a/lib/Driver/ToolChains/Fuchsia.cpp b/lib/Driver/ToolChains/Fuchsia.cpp index b8757cf4aa73..d8b8fe8f0bfe 100644 --- a/lib/Driver/ToolChains/Fuchsia.cpp +++ b/lib/Driver/ToolChains/Fuchsia.cpp @@ -131,16 +131,44 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, /// Fuchsia - Fuchsia tool chain which can call as(1) and ld(1) directly. 
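In the Fuchsia constructor rework below, runtime and header lookup is keyed off a normalized arch-OS triple. In isolation, the normalization amounts to the following self-contained sketch (mirroring normalizeTriple() in the hunk; the example triple is illustrative):

    #include "llvm/ADT/Triple.h"
    #include "llvm/ADT/Twine.h"
    #include <string>

    // Keep only <arch>-<os>, dropping vendor and environment components:
    // "x86_64-unknown-fuchsia" -> "x86_64-fuchsia".
    static std::string normalizeTriple(llvm::Triple T) {
      return (T.getArchName() + "-" + T.getOSName()).str();
    }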
-Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) - : Generic_ELF(D, Triple, Args) { - - getFilePaths().push_back(D.SysRoot + "/lib"); - getFilePaths().push_back(D.ResourceDir + "/lib/fuchsia"); +static std::string normalizeTriple(llvm::Triple Triple) { + SmallString<64> T; + T += Triple.getArchName(); + T += "-"; + T += Triple.getOSName(); + return T.str(); } -Tool *Fuchsia::buildAssembler() const { - return new tools::gnutools::Assembler(*this); +static std::string getTargetDir(const Driver &D, + llvm::Triple Triple) { + SmallString<128> P(llvm::sys::path::parent_path(D.Dir)); + llvm::sys::path::append(P, "lib", normalizeTriple(Triple)); + return P.str(); +} + +Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : ToolChain(D, Triple, Args) { + getProgramPaths().push_back(getDriver().getInstalledDir()); + if (getDriver().getInstalledDir() != D.Dir) + getProgramPaths().push_back(D.Dir); + + SmallString<128> P(getTargetDir(D, getTriple())); + llvm::sys::path::append(P, "lib"); + getFilePaths().push_back(P.str()); + + if (!D.SysRoot.empty()) { + SmallString<128> P(D.SysRoot); + llvm::sys::path::append(P, "lib"); + getFilePaths().push_back(P.str()); + } +} + +std::string Fuchsia::ComputeEffectiveClangTriple(const ArgList &Args, + types::ID InputType) const { + llvm::Triple Triple(ComputeLLVMTriple(Args, InputType)); + Triple.setTriple(normalizeTriple(Triple)); + return Triple.getTriple(); } Tool *Fuchsia::buildLinker() const { @@ -172,7 +200,8 @@ Fuchsia::GetCXXStdlibType(const ArgList &Args) const { } void Fuchsia::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if (DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fuse-init-array"); @@ -207,19 +236,44 @@ void Fuchsia::AddClangSystemIncludeArgs(const ArgList &DriverArgs, return; } - addExternCSystemInclude(DriverArgs, CC1Args, D.SysRoot + "/include"); + if (!D.SysRoot.empty()) { + SmallString<128> P(D.SysRoot); + llvm::sys::path::append(P, "include"); + addExternCSystemInclude(DriverArgs, CC1Args, P.str()); + } } -std::string Fuchsia::findLibCxxIncludePath() const { - return getDriver().SysRoot + "/include/c++/v1"; +void Fuchsia::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + if (DriverArgs.hasArg(options::OPT_nostdlibinc) || + DriverArgs.hasArg(options::OPT_nostdincxx)) + return; + + switch (GetCXXStdlibType(DriverArgs)) { + case ToolChain::CST_Libcxx: { + SmallString<128> P(getTargetDir(getDriver(), getTriple())); + llvm::sys::path::append(P, "include", "c++", "v1"); + addSystemInclude(DriverArgs, CC1Args, P.str()); + break; + } + + default: + llvm_unreachable("invalid stdlib name"); + } } void Fuchsia::AddCXXStdlibLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { - (void) GetCXXStdlibType(Args); - CmdArgs.push_back("-lc++"); - CmdArgs.push_back("-lc++abi"); - CmdArgs.push_back("-lunwind"); + switch (GetCXXStdlibType(Args)) { + case ToolChain::CST_Libcxx: + CmdArgs.push_back("-lc++"); + CmdArgs.push_back("-lc++abi"); + CmdArgs.push_back("-lunwind"); + break; + + case ToolChain::CST_Libstdcxx: + llvm_unreachable("invalid stdlib name"); + } } SanitizerMask Fuchsia::getSupportedSanitizers() const { diff --git a/lib/Driver/ToolChains/Fuchsia.h b/lib/Driver/ToolChains/Fuchsia.h index 1a8c9903fe4d..a723a99dfa3b 100644 --- a/lib/Driver/ToolChains/Fuchsia.h +++ 
b/lib/Driver/ToolChains/Fuchsia.h @@ -35,18 +35,29 @@ class LLVM_LIBRARY_VISIBILITY Linker : public GnuTool { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY Fuchsia : public Generic_ELF { +class LLVM_LIBRARY_VISIBILITY Fuchsia : public ToolChain { public: Fuchsia(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args); - bool isPIEDefault() const override { return true; } bool HasNativeLLVMSupport() const override { return true; } bool IsIntegratedAssemblerDefault() const override { return true; } + RuntimeLibType GetDefaultRuntimeLibType() const override { + return ToolChain::RLT_CompilerRT; + } + CXXStdlibType GetDefaultCXXStdlibType() const override { + return ToolChain::CST_Libcxx; + } + bool isPICDefault() const override { return false; } + bool isPIEDefault() const override { return true; } + bool isPICDefaultForced() const override { return false; } llvm::DebuggerKind getDefaultDebuggerTuning() const override { return llvm::DebuggerKind::GDB; } + std::string ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args, + types::ID InputType) const override; + SanitizerMask getSupportedSanitizers() const override; RuntimeLibType @@ -55,11 +66,14 @@ class LLVM_LIBRARY_VISIBILITY Fuchsia : public Generic_ELF { GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - std::string findLibCxxIncludePath() const override; + void + AddClangCXXStdlibIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; @@ -68,7 +82,6 @@ class LLVM_LIBRARY_VISIBILITY Fuchsia : public Generic_ELF { } protected: - Tool *buildAssembler() const override; Tool *buildLinker() const override; }; diff --git a/lib/Driver/ToolChains/Gnu.cpp b/lib/Driver/ToolChains/Gnu.cpp index bca5d3a3f28b..ad5f7df50d2e 100644 --- a/lib/Driver/ToolChains/Gnu.cpp +++ b/lib/Driver/ToolChains/Gnu.cpp @@ -2461,7 +2461,8 @@ Generic_GCC::TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef, void Generic_ELF::anchor() {} void Generic_ELF::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { const Generic_GCC::GCCVersion &V = GCCInstallation.getVersion(); bool UseInitArrayDefault = getTriple().getArch() == llvm::Triple::aarch64 || diff --git a/lib/Driver/ToolChains/Gnu.h b/lib/Driver/ToolChains/Gnu.h index 1dc1ad49e305..cdf610054401 100644 --- a/lib/Driver/ToolChains/Gnu.h +++ b/lib/Driver/ToolChains/Gnu.h @@ -341,7 +341,8 @@ class LLVM_LIBRARY_VISIBILITY Generic_ELF : public Generic_GCC { : Generic_GCC(D, Triple, Args) {} void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; }; } // end namespace toolchains diff --git a/lib/Driver/ToolChains/Hexagon.cpp b/lib/Driver/ToolChains/Hexagon.cpp index 1d7bcf8e4df0..9bf1590e6a37 100644 --- a/lib/Driver/ToolChains/Hexagon.cpp +++ b/lib/Driver/ToolChains/Hexagon.cpp @@ -428,7 +428,8 @@ unsigned HexagonToolChain::getOptimizationLevel( } 
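All of these addClangTargetOptions() overrides now receive the device offload kind, so a toolchain can emit flags only for the device half of an offload compilation. A hypothetical downstream override as a sketch (MyToolChain is not from this patch; -fopenmp-is-device is an existing cc1 flag):

    void MyToolChain::addClangTargetOptions(
        const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
        Action::OffloadKind DeviceOffloadKind) const {
      // Device-side-only flags, keyed off the new parameter.
      if (DeviceOffloadKind == Action::OFK_OpenMP)
        CC1Args.push_back("-fopenmp-is-device");
    }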
void HexagonToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if (DriverArgs.hasArg(options::OPT_ffp_contract)) return; unsigned OptLevel = getOptimizationLevel(DriverArgs); diff --git a/lib/Driver/ToolChains/Hexagon.h b/lib/Driver/ToolChains/Hexagon.h index 78f97a3d59fd..7f9a6b2f34b9 100644 --- a/lib/Driver/ToolChains/Hexagon.h +++ b/lib/Driver/ToolChains/Hexagon.h @@ -69,7 +69,8 @@ class LLVM_LIBRARY_VISIBILITY HexagonToolChain : public Linux { ~HexagonToolChain() override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; diff --git a/lib/Driver/ToolChains/NetBSD.cpp b/lib/Driver/ToolChains/NetBSD.cpp index d7d3ad61df42..a1a3108cb28d 100644 --- a/lib/Driver/ToolChains/NetBSD.cpp +++ b/lib/Driver/ToolChains/NetBSD.cpp @@ -15,6 +15,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" #include "llvm/Option/ArgList.h" using namespace clang::driver; @@ -246,6 +247,7 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag); Args.AddAllArgs(CmdArgs, options::OPT_r); + bool NeedsSanitizerDeps = addSanitizerRuntimes(getToolChain(), Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); unsigned Major, Minor, Micro; @@ -279,6 +281,8 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lm"); } + if (NeedsSanitizerDeps) + linkSanitizerRuntimeDeps(getToolChain(), CmdArgs); if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-lc"); @@ -410,3 +414,12 @@ void NetBSD::addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs, addLibStdCXXIncludePaths(getDriver().SysRoot, "/usr/include/g++", "", "", "", "", DriverArgs, CC1Args); } + +SanitizerMask NetBSD::getSupportedSanitizers() const { + const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64; + SanitizerMask Res = ToolChain::getSupportedSanitizers(); + if (IsX86_64) { + Res |= SanitizerKind::Address; + } + return Res; +} diff --git a/lib/Driver/ToolChains/NetBSD.h b/lib/Driver/ToolChains/NetBSD.h index d53aa6867872..412d0815e81a 100644 --- a/lib/Driver/ToolChains/NetBSD.h +++ b/lib/Driver/ToolChains/NetBSD.h @@ -66,6 +66,7 @@ class LLVM_LIBRARY_VISIBILITY NetBSD : public Generic_ELF { llvm::opt::ArgStringList &CC1Args) const override; bool IsUnwindTablesDefault() const override { return true; } + SanitizerMask getSupportedSanitizers() const override; protected: Tool *buildAssembler() const override; diff --git a/lib/Driver/ToolChains/WebAssembly.cpp b/lib/Driver/ToolChains/WebAssembly.cpp index fcb6418b2517..058bc42323e2 100644 --- a/lib/Driver/ToolChains/WebAssembly.cpp +++ b/lib/Driver/ToolChains/WebAssembly.cpp @@ -134,7 +134,8 @@ bool WebAssembly::SupportsProfiling() const { return false; } bool WebAssembly::HasNativeLLVMSupport() const { return true; } void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { if 
(DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fuse-init-array"); diff --git a/lib/Driver/ToolChains/WebAssembly.h b/lib/Driver/ToolChains/WebAssembly.h index ca42fc651a6d..2999db477f79 100644 --- a/lib/Driver/ToolChains/WebAssembly.h +++ b/lib/Driver/ToolChains/WebAssembly.h @@ -53,7 +53,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain { bool SupportsProfiling() const override; bool HasNativeLLVMSupport() const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; RuntimeLibType GetDefaultRuntimeLibType() const override; CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; void AddClangSystemIncludeArgs( diff --git a/lib/Driver/ToolChains/XCore.cpp b/lib/Driver/ToolChains/XCore.cpp index c3ae9582124f..43175ad7d632 100644 --- a/lib/Driver/ToolChains/XCore.cpp +++ b/lib/Driver/ToolChains/XCore.cpp @@ -124,7 +124,8 @@ void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, } void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs, - ArgStringList &CC1Args) const { + ArgStringList &CC1Args, + Action::OffloadKind) const { CC1Args.push_back("-nostdsysteminc"); } diff --git a/lib/Driver/ToolChains/XCore.h b/lib/Driver/ToolChains/XCore.h index 4084b1cdec13..00c89bd15f78 100644 --- a/lib/Driver/ToolChains/XCore.h +++ b/lib/Driver/ToolChains/XCore.h @@ -67,7 +67,8 @@ class LLVM_LIBRARY_VISIBILITY XCoreToolChain : public ToolChain { AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const override; + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp index 4197587a74c0..3bf1cd8f7c13 100644 --- a/lib/Format/ContinuationIndenter.cpp +++ b/lib/Format/ContinuationIndenter.cpp @@ -66,6 +66,16 @@ static bool startsNextParameter(const FormatToken &Current, !Style.BreakBeforeInheritanceComma)); } +static bool opensProtoMessageField(const FormatToken &LessTok, + const FormatStyle &Style) { + if (LessTok.isNot(tok::less)) + return false; + return Style.Language == FormatStyle::LK_TextProto || + (Style.Language == FormatStyle::LK_Proto && + (LessTok.NestingLevel > 0 || + (LessTok.Previous && LessTok.Previous->is(tok::equal)))); +} + ContinuationIndenter::ContinuationIndenter(const FormatStyle &Style, const AdditionalKeywords &Keywords, const SourceManager &SourceMgr, @@ -94,6 +104,13 @@ LineState ContinuationIndenter::getInitialState(unsigned FirstIndent, State.LowestLevelOnLine = 0; State.IgnoreStackForComparison = false; + if (Style.Language == FormatStyle::LK_TextProto) { + // We need this in order to deal with the bin packing of text fields at + // global scope. + State.Stack.back().AvoidBinPacking = true; + State.Stack.back().BreakBeforeParameter = true; + } + // The first token has already been indented and thus consumed. 
moveStateToNextToken(State, DryRun, /*Newline=*/false); return State; @@ -176,7 +193,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; if (((Previous.is(TT_DictLiteral) && Previous.is(tok::l_brace)) || (Previous.is(TT_ArrayInitializerLSquare) && - Previous.ParameterCount > 1)) && + Previous.ParameterCount > 1) || + opensProtoMessageField(Previous, Style)) && Style.ColumnLimit > 0 && getLengthToMatchingParen(Previous) + State.Column - 1 > getColumnLimit(State)) @@ -501,13 +519,6 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, } } -static bool lessOpensProtoMessageField(const FormatToken &LessTok, - const LineState &State) { - assert(LessTok.is(tok::less)); - return LessTok.NestingLevel > 0 || - (LessTok.Previous && LessTok.Previous->is(tok::equal)); -} - unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, bool DryRun) { FormatToken &Current = *State.NextToken; @@ -650,9 +661,7 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, // before the corresponding } or ]. if (PreviousNonComment && (PreviousNonComment->isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || - (Style.Language == FormatStyle::LK_Proto && - PreviousNonComment->is(tok::less) && - lessOpensProtoMessageField(*PreviousNonComment, State)) || + opensProtoMessageField(*PreviousNonComment, Style) || (PreviousNonComment->is(TT_TemplateString) && PreviousNonComment->opensScope()))) State.Stack.back().BreakBeforeClosingBrace = true; @@ -695,7 +704,9 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { return Current.NestingLevel == 0 ? State.FirstIndent : State.Stack.back().Indent; if ((Current.isOneOf(tok::r_brace, tok::r_square) || - (Current.is(tok::greater) && Style.Language == FormatStyle::LK_Proto)) && + (Current.is(tok::greater) && + (Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto))) && State.Stack.size() > 1) { if (Current.closesBlockOrBlockTypeList(Style)) return State.Stack[State.Stack.size() - 2].NestedBlockIndent; @@ -1050,8 +1061,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, unsigned NestedBlockIndent = std::max(State.Stack.back().StartOfFunctionCall, State.Stack.back().NestedBlockIndent); if (Current.isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) || - (Style.Language == FormatStyle::LK_Proto && Current.is(tok::less) && - lessOpensProtoMessageField(Current, State))) { + opensProtoMessageField(Current, Style)) { if (Current.opensBlockOrBlockTypeList(Style)) { NewIndent = Style.IndentWidth + std::min(State.Column, State.Stack.back().NestedBlockIndent); @@ -1064,7 +1074,9 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, Current.MatchingParen->Previous->is(tok::comma); AvoidBinPacking = EndsInComma || Current.is(TT_DictLiteral) || - Style.Language == FormatStyle::LK_Proto || !Style.BinPackArguments || + Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto || + !Style.BinPackArguments || (NextNoComment && NextNoComment->isOneOf(TT_DesignatedInitializerPeriod, TT_DesignatedInitializerLSquare)); diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp index bb6781d79517..aa4ed8c42a70 100644 --- a/lib/Format/Format.cpp +++ b/lib/Format/Format.cpp @@ -56,6 +56,7 @@ template <> struct ScalarEnumerationTraits { IO.enumCase(Value, "ObjC", FormatStyle::LK_ObjC); IO.enumCase(Value, "Proto", FormatStyle::LK_Proto); IO.enumCase(Value, "TableGen", FormatStyle::LK_TableGen); + 
IO.enumCase(Value, "TextProto", FormatStyle::LK_TextProto); } }; @@ -631,6 +632,12 @@ FormatStyle getLLVMStyle() { } FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { + if (Language == FormatStyle::LK_TextProto) { + FormatStyle GoogleStyle = getGoogleStyle(FormatStyle::LK_Proto); + GoogleStyle.Language = FormatStyle::LK_TextProto; + return GoogleStyle; + } + FormatStyle GoogleStyle = getLLVMStyle(); GoogleStyle.Language = Language; diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h index 0fe91adcd472..a60361a8e5fa 100644 --- a/lib/Format/FormatToken.h +++ b/lib/Format/FormatToken.h @@ -21,6 +21,7 @@ #include "clang/Format/Format.h" #include "clang/Lex/Lexer.h" #include +#include namespace clang { namespace format { @@ -466,7 +467,8 @@ struct FormatToken { (is(tok::l_brace) && (BlockKind == BK_Block || is(TT_DictLiteral) || (!Style.Cpp11BracedListStyle && NestingLevel == 0))) || - (is(tok::less) && Style.Language == FormatStyle::LK_Proto); + (is(tok::less) && (Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto)); } /// \brief Same as opensBlockOrBlockTypeList, but for the closing token. @@ -639,6 +641,7 @@ struct AdditionalKeywords { kw_is = &IdentTable.get("is"); kw_let = &IdentTable.get("let"); kw_module = &IdentTable.get("module"); + kw_readonly = &IdentTable.get("readonly"); kw_set = &IdentTable.get("set"); kw_type = &IdentTable.get("type"); kw_var = &IdentTable.get("var"); @@ -671,6 +674,15 @@ struct AdditionalKeywords { kw_qsignals = &IdentTable.get("Q_SIGNALS"); kw_slots = &IdentTable.get("slots"); kw_qslots = &IdentTable.get("Q_SLOTS"); + + // Keep this at the end of the constructor to make sure everything here is + // already initialized. + JsExtraKeywords = std::unordered_set( + {kw_as, kw_async, kw_await, kw_declare, kw_finally, kw_from, + kw_function, kw_get, kw_import, kw_is, kw_let, kw_module, kw_readonly, + kw_set, kw_type, kw_var, kw_yield, + // Keywords from the Java section. + kw_abstract, kw_extends, kw_implements, kw_instanceof, kw_interface}); } // Context sensitive keywords. @@ -699,6 +711,7 @@ struct AdditionalKeywords { IdentifierInfo *kw_is; IdentifierInfo *kw_let; IdentifierInfo *kw_module; + IdentifierInfo *kw_readonly; IdentifierInfo *kw_set; IdentifierInfo *kw_type; IdentifierInfo *kw_var; @@ -732,6 +745,18 @@ struct AdditionalKeywords { IdentifierInfo *kw_qsignals; IdentifierInfo *kw_slots; IdentifierInfo *kw_qslots; + + /// \brief Returns \c true if \p Tok is a true JavaScript identifier, returns + /// \c false if it is a keyword or a pseudo keyword. + bool IsJavaScriptIdentifier(const FormatToken &Tok) const { + return Tok.is(tok::identifier) && + JsExtraKeywords.find(Tok.Tok.getIdentifierInfo()) == + JsExtraKeywords.end(); + } + +private: + /// \brief The JavaScript keywords beyond the C++ keyword set. 
+ std::unordered_set JsExtraKeywords; }; } // namespace format diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp index d78a37532fe8..c6e90a9175e1 100644 --- a/lib/Format/TokenAnnotator.cpp +++ b/lib/Format/TokenAnnotator.cpp @@ -90,7 +90,8 @@ class AnnotatingParser { } if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace) || (CurrentToken->isOneOf(tok::colon, tok::question) && InExprContext && - Style.Language != FormatStyle::LK_Proto)) + Style.Language != FormatStyle::LK_Proto && + Style.Language != FormatStyle::LK_TextProto)) return false; // If a && or || is found and interpreted as a binary operator, this set // of angles is likely part of something like "a < b && c > d". If the @@ -453,7 +454,8 @@ class AnnotatingParser { FormatToken *Previous = CurrentToken->getPreviousNonComment(); if (((CurrentToken->is(tok::colon) && (!Contexts.back().ColonIsDictLiteral || !Style.isCpp())) || - Style.Language == FormatStyle::LK_Proto) && + Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto) && (Previous->Tok.getIdentifierInfo() || Previous->is(tok::string_literal))) Previous->Type = TT_SelectorName; @@ -536,8 +538,13 @@ class AnnotatingParser { } } if (Contexts.back().ColonIsDictLiteral || - Style.Language == FormatStyle::LK_Proto) { + Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto) { Tok->Type = TT_DictLiteral; + if (Style.Language == FormatStyle::LK_TextProto) { + if (FormatToken *Previous = Tok->getPreviousNonComment()) + Previous->Type = TT_SelectorName; + } } else if (Contexts.back().ColonIsObjCMethodExpr || Line.startsWith(TT_ObjCMethodSpecifier)) { Tok->Type = TT_ObjCMethodExpr; @@ -635,12 +642,22 @@ class AnnotatingParser { return false; break; case tok::l_brace: + if (Style.Language == FormatStyle::LK_TextProto) { + FormatToken *Previous =Tok->getPreviousNonComment(); + if (Previous && Previous->Type != TT_DictLiteral) + Previous->Type = TT_SelectorName; + } if (!parseBrace()) return false; break; case tok::less: if (parseAngle()) { Tok->Type = TT_TemplateOpener; + if (Style.Language == FormatStyle::LK_TextProto) { + FormatToken *Previous = Tok->getPreviousNonComment(); + if (Previous && Previous->Type != TT_DictLiteral) + Previous->Type = TT_SelectorName; + } } else { Tok->Type = TT_BinaryOperator; NonTemplateLess.insert(Tok); @@ -1572,7 +1589,8 @@ class ExpressionParser { return prec::Conditional; if (NextNonComment && Current->is(TT_SelectorName) && (NextNonComment->is(TT_DictLiteral) || - (Style.Language == FormatStyle::LK_Proto && + ((Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto) && NextNonComment->is(tok::less)))) return prec::Assignment; if (Current->is(TT_JsComputedPropertyName)) @@ -1676,17 +1694,26 @@ void TokenAnnotator::setCommentLineLevels( for (SmallVectorImpl::reverse_iterator I = Lines.rbegin(), E = Lines.rend(); I != E; ++I) { - bool CommentLine = (*I)->First; + bool CommentLine = true; for (const FormatToken *Tok = (*I)->First; Tok; Tok = Tok->Next) { if (!Tok->is(tok::comment)) { CommentLine = false; break; } } - if (NextNonCommentLine && CommentLine) - (*I)->Level = NextNonCommentLine->Level; - else + + if (NextNonCommentLine && CommentLine) { + // If the comment is currently aligned with the line immediately following + // it, that's probably intentional and we should keep it. 
+ bool AlignedWithNextLine = + NextNonCommentLine->First->NewlinesBefore <= 1 && + NextNonCommentLine->First->OriginalColumn == + (*I)->First->OriginalColumn; + if (AlignedWithNextLine) + (*I)->Level = NextNonCommentLine->Level; + } else { NextNonCommentLine = (*I)->First->isNot(tok::r_brace) ? (*I) : nullptr; + } setCommentLineLevels((*I)->Children); } @@ -2274,7 +2301,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Style.isCpp()) { if (Left.is(tok::kw_operator)) return Right.is(tok::coloncolon); - } else if (Style.Language == FormatStyle::LK_Proto) { + } else if (Style.Language == FormatStyle::LK_Proto || + Style.Language == FormatStyle::LK_TextProto) { if (Right.is(tok::period) && Left.isOneOf(Keywords.kw_optional, Keywords.kw_required, Keywords.kw_repeated, Keywords.kw_extend)) @@ -2282,6 +2310,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Right.is(tok::l_paren) && Left.isOneOf(Keywords.kw_returns, Keywords.kw_option)) return true; + if (Right.isOneOf(tok::l_brace, tok::less) && Left.is(TT_SelectorName)) + return true; } else if (Style.Language == FormatStyle::LK_JavaScript) { if (Left.is(TT_JsFatArrow)) return true; @@ -2300,7 +2330,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if ((Left.is(TT_TemplateString) && Left.TokenText.endswith("${")) || (Right.is(TT_TemplateString) && Right.TokenText.startswith("}"))) return false; - if (Left.is(tok::identifier) && Right.is(TT_TemplateString)) + // In tagged template literals ("html`bar baz`"), there is no space between + // the tag identifier and the template string. getIdentifierInfo makes sure + // that the identifier is not a pseudo keyword like `yield`, either. + if (Left.is(tok::identifier) && Keywords.IsJavaScriptIdentifier(Left) && + Right.is(TT_TemplateString)) return false; if (Right.is(tok::star) && Left.isOneOf(Keywords.kw_function, Keywords.kw_yield)) @@ -2604,11 +2638,12 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, } else if (Style.Language == FormatStyle::LK_JavaScript) { const FormatToken *NonComment = Right.getPreviousNonComment(); if (NonComment && - NonComment->isOneOf( - tok::kw_return, tok::kw_continue, tok::kw_break, tok::kw_throw, - Keywords.kw_interface, Keywords.kw_type, tok::kw_static, - tok::kw_public, tok::kw_private, tok::kw_protected, - Keywords.kw_abstract, Keywords.kw_get, Keywords.kw_set)) + NonComment->isOneOf(tok::kw_return, tok::kw_continue, tok::kw_break, + tok::kw_throw, Keywords.kw_interface, + Keywords.kw_type, tok::kw_static, tok::kw_public, + tok::kw_private, tok::kw_protected, + Keywords.kw_readonly, Keywords.kw_abstract, + Keywords.kw_get, Keywords.kw_set)) return false; // Otherwise automatic semicolon insertion would trigger. if (Left.is(TT_JsFatArrow) && Right.is(tok::l_brace)) return false; diff --git a/lib/Format/UnwrappedLineParser.cpp b/lib/Format/UnwrappedLineParser.cpp index ba3a4c17ee12..4b57919d1929 100644 --- a/lib/Format/UnwrappedLineParser.cpp +++ b/lib/Format/UnwrappedLineParser.cpp @@ -286,7 +286,10 @@ void UnwrappedLineParser::parseFile() { !Line->InPPDirective && Style.Language != FormatStyle::LK_JavaScript; ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack, MustBeDeclaration); - parseLevel(/*HasOpeningBrace=*/false); + if (Style.Language == FormatStyle::LK_TextProto) + parseBracedList(); + else + parseLevel(/*HasOpeningBrace=*/false); // Make sure to format the remaining tokens. 
flushComments(true); addUnwrappedLine(); @@ -832,6 +835,7 @@ void UnwrappedLineParser::parseStructuralElement() { case tok::at: nextToken(); if (FormatTok->Tok.is(tok::l_brace)) { + nextToken(); parseBracedList(); break; } @@ -996,8 +1000,10 @@ void UnwrappedLineParser::parseStructuralElement() { switch (FormatTok->Tok.getKind()) { case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) + if (FormatTok->Tok.is(tok::l_brace)) { + nextToken(); parseBracedList(); + } break; case tok::kw_enum: // Ignore if this is part of "template Tok.is(tok::l_brace)) + if (FormatTok->Tok.is(tok::l_brace)) { + nextToken(); parseBracedList(); - else if (Style.Language == FormatStyle::LK_Proto && - FormatTok->Tok.is(tok::less)) + } else if (Style.Language == FormatStyle::LK_Proto && + FormatTok->Tok.is(tok::less)) { + nextToken(); parseBracedList(/*ContinueOnSemicolons=*/false, /*ClosingBraceKind=*/tok::greater); + } break; case tok::l_square: parseSquare(); @@ -1345,6 +1354,7 @@ bool UnwrappedLineParser::tryToParseBracedList() { assert(FormatTok->BlockKind != BK_Unknown); if (FormatTok->BlockKind == BK_Block) return false; + nextToken(); parseBracedList(); return true; } @@ -1352,7 +1362,6 @@ bool UnwrappedLineParser::tryToParseBracedList() { bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, tok::TokenKind ClosingBraceKind) { bool HasError = false; - nextToken(); // FIXME: Once we have an expression parser in the UnwrappedLineParser, // replace this by using parseAssigmentExpression() inside. @@ -1407,6 +1416,7 @@ bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, // Assume there are no blocks inside a braced init list apart // from the ones we explicitly parse out (like lambdas). FormatTok->BlockKind = BK_BracedInit; + nextToken(); parseBracedList(); break; case tok::semi: @@ -1459,8 +1469,10 @@ void UnwrappedLineParser::parseParens() { break; case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) + if (FormatTok->Tok.is(tok::l_brace)) { + nextToken(); parseBracedList(); + } break; case tok::kw_class: if (Style.Language == FormatStyle::LK_JavaScript) @@ -1508,8 +1520,10 @@ void UnwrappedLineParser::parseSquare() { } case tok::at: nextToken(); - if (FormatTok->Tok.is(tok::l_brace)) + if (FormatTok->Tok.is(tok::l_brace)) { + nextToken(); parseBracedList(); + } break; default: nextToken(); @@ -1836,6 +1850,7 @@ bool UnwrappedLineParser::parseEnum() { } // Parse enum body. 
+ nextToken(); bool HasError = !parseBracedList(/*ContinueOnSemicolons=*/true); if (HasError) { if (FormatTok->is(tok::semi)) @@ -1870,6 +1885,7 @@ void UnwrappedLineParser::parseJavaEnumBody() { FormatTok = Tokens->setPosition(StoredPosition); if (IsSimple) { + nextToken(); parseBracedList(); addUnwrappedLine(); return; @@ -2081,6 +2097,7 @@ void UnwrappedLineParser::parseJavaScriptEs6ImportExport() { } if (FormatTok->is(tok::l_brace)) { FormatTok->BlockKind = BK_Block; + nextToken(); parseBracedList(); } else { nextToken(); diff --git a/lib/Frontend/FrontendActions.cpp b/lib/Frontend/FrontendActions.cpp index 0fbcc1c7399e..d42400183a43 100644 --- a/lib/Frontend/FrontendActions.cpp +++ b/lib/Frontend/FrontendActions.cpp @@ -163,6 +163,16 @@ GenerateModuleAction::CreateASTConsumer(CompilerInstance &CI, return llvm::make_unique(std::move(Consumers)); } +bool GenerateModuleFromModuleMapAction::BeginSourceFileAction( + CompilerInstance &CI) { + if (!CI.getLangOpts().Modules) { + CI.getDiagnostics().Report(diag::err_module_build_requires_fmodules); + return false; + } + + return GenerateModuleAction::BeginSourceFileAction(CI); +} + std::unique_ptr GenerateModuleFromModuleMapAction::CreateOutputFile(CompilerInstance &CI, StringRef InFile) { diff --git a/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/lib/Frontend/Rewrite/RewriteModernObjC.cpp index 83290a6fbc28..38be684cec86 100644 --- a/lib/Frontend/Rewrite/RewriteModernObjC.cpp +++ b/lib/Frontend/Rewrite/RewriteModernObjC.cpp @@ -146,7 +146,7 @@ namespace { llvm::DenseMap RewrittenBlockExprs; llvm::DenseMap > ReferencedIvars; + llvm::SmallSetVector > ReferencedIvars; // ivar bitfield grouping containers llvm::DenseSet ObjCInterefaceHasBitfieldGroups; @@ -1013,7 +1013,7 @@ void RewriteModernObjC::RewritePropertyImplDecl(ObjCPropertyImplDecl *PID, Setr = "\nextern \"C\" __declspec(dllimport) " "void objc_setProperty (id, SEL, long, id, bool, bool);\n"; } - + RewriteObjCMethodDecl(OID->getContainingInterface(), PD->getSetterMethodDecl(), Setr); Setr += "{ "; @@ -3965,10 +3965,11 @@ void RewriteModernObjC::RewriteIvarOffsetSymbols(ObjCInterfaceDecl *CDecl, std::string &Result) { // write out ivar offset symbols which have been referenced in an ivar // access expression. - llvm::SmallPtrSet Ivars = ReferencedIvars[CDecl]; + llvm::SmallSetVector Ivars = ReferencedIvars[CDecl]; + if (Ivars.empty()) return; - + llvm::DenseSet > GroupSymbolOutput; for (ObjCIvarDecl *IvarDecl : Ivars) { const ObjCInterfaceDecl *IDecl = IvarDecl->getContainingInterface(); @@ -6068,7 +6069,7 @@ void RewriteModernObjC::Initialize(ASTContext &context) { Preamble += "\n#define __OFFSETOFIVAR__(TYPE, MEMBER) ((long long) &((TYPE *)0)->MEMBER)\n"; } -/// RewriteIvarOffsetComputation - This rutine synthesizes computation of +/// RewriteIvarOffsetComputation - This routine synthesizes computation of /// ivar offset. void RewriteModernObjC::RewriteIvarOffsetComputation(ObjCIvarDecl *ivar, std::string &Result) { diff --git a/lib/Frontend/Rewrite/RewriteObjC.cpp b/lib/Frontend/Rewrite/RewriteObjC.cpp index 7d809c610c86..5a1e001d65b8 100644 --- a/lib/Frontend/Rewrite/RewriteObjC.cpp +++ b/lib/Frontend/Rewrite/RewriteObjC.cpp @@ -5052,7 +5052,7 @@ void RewriteObjCFragileABI::Initialize(ASTContext &context) { Preamble += "\n#define __OFFSETOFIVAR__(TYPE, MEMBER) ((long long) &((TYPE *)0)->MEMBER)\n"; } -/// RewriteIvarOffsetComputation - This rutine synthesizes computation of +/// RewriteIvarOffsetComputation - This routine synthesizes computation of /// ivar offset. 
void RewriteObjCFragileABI::RewriteIvarOffsetComputation(ObjCIvarDecl *ivar, std::string &Result) { diff --git a/lib/Frontend/SerializedDiagnosticReader.cpp b/lib/Frontend/SerializedDiagnosticReader.cpp index 8a8161488f44..08b7087fbad6 100644 --- a/lib/Frontend/SerializedDiagnosticReader.cpp +++ b/lib/Frontend/SerializedDiagnosticReader.cpp @@ -27,6 +27,9 @@ std::error_code SerializedDiagnosticReader::readDiagnostics(StringRef File) { llvm::BitstreamCursor Stream(**Buffer); Optional BlockInfo; + if (Stream.AtEndOfStream()) + return SDError::InvalidSignature; + // Sniff for the signature. if (Stream.Read(8) != 'D' || Stream.Read(8) != 'I' || diff --git a/lib/Frontend/TextDiagnostic.cpp b/lib/Frontend/TextDiagnostic.cpp index 1e12ea5e597a..6a72b00c602b 100644 --- a/lib/Frontend/TextDiagnostic.cpp +++ b/lib/Frontend/TextDiagnostic.cpp @@ -1052,7 +1052,8 @@ static void highlightRange(const CharSourceRange &R, std::fill(CaretLine.begin()+StartColNo,CaretLine.begin()+EndColNo,'~'); } -static std::string buildFixItInsertionLine(unsigned LineNo, +static std::string buildFixItInsertionLine(FileID FID, + unsigned LineNo, const SourceColumnMap &map, ArrayRef Hints, const SourceManager &SM, @@ -1069,7 +1070,8 @@ static std::string buildFixItInsertionLine(unsigned LineNo, // code contains no newlines and is on the same line as the caret. std::pair HintLocInfo = SM.getDecomposedExpansionLoc(I->RemoveRange.getBegin()); - if (LineNo == SM.getLineNumber(HintLocInfo.first, HintLocInfo.second) && + if (FID == HintLocInfo.first && + LineNo == SM.getLineNumber(HintLocInfo.first, HintLocInfo.second) && StringRef(I->CodeToInsert).find_first_of("\n\r") == StringRef::npos) { // Insert the new code into the line just below the code // that the user wrote. @@ -1105,9 +1107,6 @@ static std::string buildFixItInsertionLine(unsigned LineNo, PrevHintEndCol = HintCol + llvm::sys::locale::columnWidth(I->CodeToInsert); - } else { - FixItInsertionLine.clear(); - break; } } } @@ -1222,7 +1221,7 @@ void TextDiagnostic::emitSnippetAndCaret( } std::string FixItInsertionLine = buildFixItInsertionLine( - LineNo, sourceColMap, Hints, SM, DiagOpts.get()); + FID, LineNo, sourceColMap, Hints, SM, DiagOpts.get()); // If the source line is too long for our terminal, select only the // "interesting" source region within that line. diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h index 361e5f720ea1..e812a1632b91 100644 --- a/lib/Headers/bmiintrin.h +++ b/lib/Headers/bmiintrin.h @@ -318,7 +318,7 @@ __blsi_u64(unsigned long long __X) /// /// \param __X /// An unsigned 64-bit integer used to create the mask. -/// \returns A unsigned 64-bit integer containing the newly created mask. +/// \returns An unsigned 64-bit integer containing the newly created mask. 
static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsmsk_u64(unsigned long long __X) { diff --git a/lib/Headers/cpuid.h b/lib/Headers/cpuid.h index 400dcfacd552..2dd0add236b8 100644 --- a/lib/Headers/cpuid.h +++ b/lib/Headers/cpuid.h @@ -79,7 +79,7 @@ #define signature_VORTEX_edx 0x36387865 #define signature_VORTEX_ecx 0x436f5320 -/* Features in %ecx for level 1 */ +/* Features in %ecx for leaf 1 */ #define bit_SSE3 0x00000001 #define bit_PCLMULQDQ 0x00000002 #define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */ @@ -114,7 +114,7 @@ #define bit_F16C 0x20000000 #define bit_RDRND 0x40000000 -/* Features in %edx for level 1 */ +/* Features in %edx for leaf 1 */ #define bit_FPU 0x00000001 #define bit_VME 0x00000002 #define bit_DE 0x00000004 @@ -147,44 +147,95 @@ #define bit_TM 0x20000000 #define bit_PBE 0x80000000 -/* Features in %ebx for level 7 sub-leaf 0 */ +/* Features in %ebx for leaf 7 sub-leaf 0 */ #define bit_FSGSBASE 0x00000001 +#define bit_SGX 0x00000004 +#define bit_BMI 0x00000008 +#define bit_HLE 0x00000010 +#define bit_AVX2 0x00000020 #define bit_SMEP 0x00000080 +#define bit_BMI2 0x00000100 #define bit_ENH_MOVSB 0x00000200 +#define bit_RTM 0x00000800 +#define bit_MPX 0x00004000 +#define bit_AVX512F 0x00010000 +#define bit_AVX512DQ 0x00020000 +#define bit_RDSEED 0x00040000 +#define bit_ADX 0x00080000 +#define bit_AVX512IFMA 0x00200000 +#define bit_CLFLUSHOPT 0x00800000 +#define bit_CLWB 0x01000000 +#define bit_AVX512PF 0x04000000 +#define bit_AVX51SER 0x08000000 +#define bit_AVX512CD 0x10000000 +#define bit_SHA 0x20000000 +#define bit_AVX512BW 0x40000000 +#define bit_AVX512VL 0x80000000 + +/* Features in %ecx for leaf 7 sub-leaf 0 */ +#define bit_PREFTCHWT1 0x00000001 +#define bit_AVX512VBMI 0x00000002 +#define bit_PKU 0x00000004 +#define bit_OSPKE 0x00000010 +#define bit_AVX512VPOPCNTDQ 0x00004000 +#define bit_RDPID 0x00400000 + +/* Features in %edx for leaf 7 sub-leaf 0 */ +#define bit_AVX5124VNNIW 0x00000004 +#define bit_AVX5124FMAPS 0x00000008 + +/* Features in %eax for leaf 13 sub-leaf 1 */ +#define bit_XSAVEOPT 0x00000001 +#define bit_XSAVEC 0x00000002 +#define bit_XSAVES 0x00000008 + +/* Features in %ecx for leaf 0x80000001 */ +#define bit_LAHF_LM 0x00000001 +#define bit_ABM 0x00000020 +#define bit_SSE4a 0x00000040 +#define bit_PRFCHW 0x00000100 +#define bit_XOP 0x00000800 +#define bit_LWP 0x00008000 +#define bit_FMA4 0x00010000 +#define bit_TBM 0x00200000 +#define bit_MWAITX 0x20000000 + +/* Features in %edx for leaf 0x80000001 */ +#define bit_MMXEXT 0x00400000 +#define bit_LM 0x20000000 +#define bit_3DNOWP 0x40000000 +#define bit_3DNOW 0x80000000 + +/* Features in %ebx for leaf 0x80000001 */ +#define bit_CLZERO 0x00000001 + #if __i386__ -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \ __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level)) + : "0"(__leaf)) -#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \ +#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \ __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level), "2"(__count)) + : "0"(__leaf), "2"(__count)) #else /* x86-64 uses %rbx as the base register, so preserve it. 
*/ -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ +#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \ __asm(" xchgq %%rbx,%q1\n" \ " cpuid\n" \ " xchgq %%rbx,%q1" \ : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level)) + : "0"(__leaf)) -#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \ +#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \ __asm(" xchgq %%rbx,%q1\n" \ " cpuid\n" \ " xchgq %%rbx,%q1" \ : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ - : "0"(__level), "2"(__count)) + : "0"(__leaf), "2"(__count)) #endif -static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, - unsigned int *__ebx, unsigned int *__ecx, - unsigned int *__edx) { - __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx); - return 1; -} - -static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig) +static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig) { unsigned int __eax, __ebx, __ecx, __edx; #if __i386__ @@ -208,8 +259,35 @@ static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig) return 0; #endif - __cpuid(__level, __eax, __ebx, __ecx, __edx); + __cpuid(__leaf, __eax, __ebx, __ecx, __edx); if (__sig) *__sig = __ebx; return __eax; } + +static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) +{ + unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0); + + if (__max_leaf == 0 || __max_leaf < __leaf) + return 0; + + __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +static __inline int __get_cpuid_count (unsigned int __leaf, + unsigned int __subleaf, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0); + + if (__max_leaf == 0 || __max_leaf < __leaf) + return 0; + + __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h index e22dd231427d..c5f25bfcb5c1 100644 --- a/lib/Headers/immintrin.h +++ b/lib/Headers/immintrin.h @@ -212,6 +212,15 @@ _rdrand32_step(unsigned int *__p) return __builtin_ia32_rdrand32_step(__p); } +#ifdef __x86_64__ +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand64_step(unsigned long long *__p) +{ + return __builtin_ia32_rdrand64_step(__p); +} +#endif +#endif /* __RDRND__ */ + /* __bit_scan_forward */ static __inline__ int __attribute__((__always_inline__, __nodebug__)) _bit_scan_forward(int __A) { @@ -224,15 +233,6 @@ _bit_scan_reverse(int __A) { return 31 - __builtin_clz(__A); } -#ifdef __x86_64__ -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand64_step(unsigned long long *__p) -{ - return __builtin_ia32_rdrand64_step(__p); -} -#endif -#endif /* __RDRND__ */ - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__) #ifdef __x86_64__ static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h index 5a7968bec842..4b38d51713d8 100644 --- a/lib/Headers/mmintrin.h +++ b/lib/Headers/mmintrin.h @@ -1289,7 +1289,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// /// \headerfile /// -/// This intrinsic corresponds to the the VXORPS / XORPS instruction. +/// This intrinsic corresponds to the VXORPS / XORPS instruction. 
/// /// \returns An initialized 64-bit integer vector with all elements set to zero. static __inline__ __m64 __DEFAULT_FN_ATTRS diff --git a/lib/Index/IndexBody.cpp b/lib/Index/IndexBody.cpp index d3632b8b9b15..6bbd38102509 100644 --- a/lib/Index/IndexBody.cpp +++ b/lib/Index/IndexBody.cpp @@ -230,7 +230,31 @@ class BodyIndexer : public RecursiveASTVisitor<BodyIndexer> { SmallVector<SymbolRelation, 2> Relations; addCallRole(Roles, Relations); Stmt *Containing = getParentStmt(); - if (E->isImplicit() || (Containing && isa<PseudoObjectExpr>(Containing))) + + auto IsImplicitProperty = [](const PseudoObjectExpr *POE) -> bool { + const auto *E = POE->getSyntacticForm(); + if (const auto *BinOp = dyn_cast<BinaryOperator>(E)) + E = BinOp->getLHS(); + const auto *PRE = dyn_cast<ObjCPropertyRefExpr>(E); + if (!PRE) + return false; + if (PRE->isExplicitProperty()) + return false; + if (const ObjCMethodDecl *Getter = PRE->getImplicitPropertyGetter()) { + // Class properties that are explicitly defined using @property + // declarations are represented implicitly as there is no ivar for + // class properties. + if (Getter->isClassMethod() && + Getter->getCanonicalDecl()->findPropertyDecl()) + return false; + } + return true; + }; + bool IsPropCall = Containing && isa<PseudoObjectExpr>(Containing); + // Implicit property message sends are not 'implicit'. + if ((E->isImplicit() || IsPropCall) && + !(IsPropCall && + IsImplicitProperty(cast<PseudoObjectExpr>(Containing)))) Roles |= (unsigned)SymbolRole::Implicit; if (isDynamic(E)) { diff --git a/lib/Index/IndexDecl.cpp b/lib/Index/IndexDecl.cpp index d1127722c8ca..c5230c0f9acf 100644 --- a/lib/Index/IndexDecl.cpp +++ b/lib/Index/IndexDecl.cpp @@ -618,6 +618,8 @@ class IndexingDeclVisitor : public ConstDeclVisitor<IndexingDeclVisitor, bool> { Template.is<ClassTemplateDecl *>() ? (Decl *)Template.get<ClassTemplateDecl *>() : Template.get<ClassTemplatePartialSpecializationDecl *>(); + if (!D->isThisDeclarationADefinition()) + IndexCtx.indexNestedNameSpecifierLoc(D->getQualifierLoc(), D); IndexCtx.indexTagDecl( D, SymbolRelation(SymbolRoleSet(SymbolRole::RelationSpecializationOf), SpecializationOf)); diff --git a/lib/Index/IndexSymbol.cpp b/lib/Index/IndexSymbol.cpp index bf358a372149..0dc3720208ca 100644 --- a/lib/Index/IndexSymbol.cpp +++ b/lib/Index/IndexSymbol.cpp @@ -69,11 +69,13 @@ bool index::isFunctionLocalSymbol(const Decl *D) { if (const NamedDecl *ND = dyn_cast<NamedDecl>(D)) { switch (ND->getFormalLinkage()) { case NoLinkage: - case VisibleNoLinkage: case InternalLinkage: return true; + case VisibleNoLinkage: case UniqueExternalLinkage: + case ModuleInternalLinkage: llvm_unreachable("Not a sema linkage"); + case ModuleLinkage: case ExternalLinkage: return false; } diff --git a/lib/Index/IndexingContext.cpp b/lib/Index/IndexingContext.cpp index 754bc84ff4b2..c4aa51d62f02 100644 --- a/lib/Index/IndexingContext.cpp +++ b/lib/Index/IndexingContext.cpp @@ -229,6 +229,12 @@ static bool isDeclADefinition(const Decl *D, const DeclContext *ContainerDC, AST return false; } +/// Whether the given NamedDecl should be skipped because it has no name.
+static bool shouldSkipNamelessDecl(const NamedDecl *ND) { + return ND->getDeclName().isEmpty() && !isa<TagDecl>(ND) && + !isa<ObjCCategoryDecl>(ND); +} + static const Decl *adjustParent(const Decl *Parent) { if (!Parent) return nullptr; @@ -243,8 +249,8 @@ static const Decl *adjustParent(const Decl *Parent) { } else if (auto RD = dyn_cast<RecordDecl>(Parent)) { if (RD->isAnonymousStructOrUnion()) continue; - } else if (auto FD = dyn_cast<FunctionDecl>(Parent)) { - if (FD->getDeclName().isEmpty()) + } else if (auto ND = dyn_cast<NamedDecl>(Parent)) { + if (shouldSkipNamelessDecl(ND)) continue; } return Parent; @@ -315,9 +321,7 @@ bool IndexingContext::handleDeclOccurrence(const Decl *D, SourceLocation Loc, const DeclContext *ContainerDC) { if (D->isImplicit() && !isa<ObjCMethodDecl>(D)) return true; - if (!isa<NamedDecl>(D) || - (cast<NamedDecl>(D)->getDeclName().isEmpty() && - !isa<TagDecl>(D) && !isa<ObjCCategoryDecl>(D))) + if (!isa<NamedDecl>(D) || shouldSkipNamelessDecl(cast<NamedDecl>(D))) return true; SourceManager &SM = Ctx->getSourceManager(); diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 012189aa6f9f..61bcef8cb760 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -2550,7 +2550,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { return true; } - if (PP->isRecordingPreamble() && !PP->isInMainFile()) { + if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { PP->setRecordedPreambleConditionalStack(ConditionalStack); ConditionalStack.clear(); } diff --git a/lib/Lex/PPLexerChange.cpp b/lib/Lex/PPLexerChange.cpp index 1c0cd5636835..5a589d6a17b3 100644 --- a/lib/Lex/PPLexerChange.cpp +++ b/lib/Lex/PPLexerChange.cpp @@ -46,12 +46,6 @@ bool Preprocessor::isInPrimaryFile() const { }); } -bool Preprocessor::isInMainFile() const { - if (IsFileLexer()) - return IncludeMacroStack.size() == 0; - return true; -} - /// getCurrentLexer - Return the current file lexer being lexed from. Note /// that this ignores any potentially active macro expansions and _Pragma /// expansions going on at the time. diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp index f9a399cd7fd7..63f39524d12a 100644 --- a/lib/Lex/Preprocessor.cpp +++ b/lib/Lex/Preprocessor.cpp @@ -535,7 +535,9 @@ void Preprocessor::EnterMainSourceFile() { // Start parsing the predefines. EnterSourceFile(FID, nullptr, SourceLocation()); +} +void Preprocessor::replayPreambleConditionalStack() { // Restore the conditional stack from the preamble, if there is one. if (PreambleConditionalStack.isReplaying()) { CurPPLexer->setConditionalLevels(PreambleConditionalStack.getStack()); diff --git a/lib/Parse/ParseCXXInlineMethods.cpp b/lib/Parse/ParseCXXInlineMethods.cpp index cc1e8850a523..b68559485a5e 100644 --- a/lib/Parse/ParseCXXInlineMethods.cpp +++ b/lib/Parse/ParseCXXInlineMethods.cpp @@ -889,7 +889,7 @@ bool Parser::ConsumeAndStoreFunctionPrologue(CachedTokens &Toks) { // If the opening brace is not preceded by one of these tokens, we are // missing the mem-initializer-id. In order to recover better, we need // to use heuristics to determine if this '{' is most likely the - // begining of a brace-init-list or the function body. + // beginning of a brace-init-list or the function body. // Check the token after the corresponding '}'. TentativeParsingAction PA(*this); if (SkipUntil(tok::r_brace) && diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp index 07054546f42f..a4610698c46d 100644 --- a/lib/Parse/ParseDecl.cpp +++ b/lib/Parse/ParseDecl.cpp @@ -6650,7 +6650,7 @@ void Parser::ParseTypeofSpecifier(DeclSpec &DS) { return; } - // If we get here, the operand to the typeof was an expresion.
+ // If we get here, the operand to the typeof was an expression. if (Operand.isInvalid()) { DS.SetTypeSpecError(); return; diff --git a/lib/Parse/ParseExpr.cpp b/lib/Parse/ParseExpr.cpp index aacb00e8be64..44b87af01abd 100644 --- a/lib/Parse/ParseExpr.cpp +++ b/lib/Parse/ParseExpr.cpp @@ -1866,7 +1866,7 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, } } - // If we get here, the operand to the typeof/sizeof/alignof was an expresion. + // If we get here, the operand to the typeof/sizeof/alignof was an expression. isCastExpr = false; return Operand; } @@ -1972,7 +1972,7 @@ ExprResult Parser::ParseUnaryExprOrTypeTraitExpression() { if (OpTok.isOneOf(tok::kw_alignof, tok::kw__Alignof)) Diag(OpTok, diag::ext_alignof_expr) << OpTok.getIdentifierInfo(); - // If we get here, the operand to the sizeof/alignof was an expresion. + // If we get here, the operand to the sizeof/alignof was an expression. if (!Operand.isInvalid()) Operand = Actions.ActOnUnaryExprOrTypeTraitExpr(OpTok.getLocation(), ExprKind, diff --git a/lib/Parse/ParseObjc.cpp b/lib/Parse/ParseObjc.cpp index caa6323d3209..f7410b8a092a 100644 --- a/lib/Parse/ParseObjc.cpp +++ b/lib/Parse/ParseObjc.cpp @@ -2255,7 +2255,7 @@ Parser::ObjCImplParsingDataRAII::~ObjCImplParsingDataRAII() { void Parser::ObjCImplParsingDataRAII::finish(SourceRange AtEnd) { assert(!Finished); - P.Actions.DefaultSynthesizeProperties(P.getCurScope(), Dcl); + P.Actions.DefaultSynthesizeProperties(P.getCurScope(), Dcl, AtEnd.getBegin()); for (size_t i = 0; i < LateParsedObjCMethods.size(); ++i) P.ParseLexedObjCMethodDefs(*LateParsedObjCMethods[i], true/*Methods*/); diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp index af29b5e9c673..4aa9a5971929 100644 --- a/lib/Parse/Parser.cpp +++ b/lib/Parse/Parser.cpp @@ -516,6 +516,8 @@ void Parser::Initialize() { // Prime the lexer look-ahead. ConsumeToken(); + + PP.replayPreambleConditionalStack(); } void Parser::LateTemplateParserCleanupCallback(void *P) { @@ -526,6 +528,8 @@ void Parser::LateTemplateParserCleanupCallback(void *P) { } bool Parser::ParseFirstTopLevelDecl(DeclGroupPtrTy &Result) { + Actions.ActOnStartOfTranslationUnit(); + // C11 6.9p1 says translation units must have at least one top-level // declaration. C++ doesn't have this restriction. 
 We also don't want to complain if we have a precompiled header, although technically if the PCH diff --git a/lib/Sema/AnalysisBasedWarnings.cpp b/lib/Sema/AnalysisBasedWarnings.cpp index fd2d07957c2b..f83baa790b49 100644 --- a/lib/Sema/AnalysisBasedWarnings.cpp +++ b/lib/Sema/AnalysisBasedWarnings.cpp @@ -394,15 +394,21 @@ static bool hasThrowOutNonThrowingFunc(SourceLocation &OpLoc, CFG *BodyCFG) { static void EmitDiagForCXXThrowInNonThrowingFunc(Sema &S, SourceLocation OpLoc, const FunctionDecl *FD) { - if (!S.getSourceManager().isInSystemHeader(OpLoc)) { + if (!S.getSourceManager().isInSystemHeader(OpLoc) && + FD->getTypeSourceInfo()) { S.Diag(OpLoc, diag::warn_throw_in_noexcept_func) << FD; if (S.getLangOpts().CPlusPlus11 && (isa<CXXDestructorDecl>(FD) || FD->getDeclName().getCXXOverloadedOperator() == OO_Delete || - FD->getDeclName().getCXXOverloadedOperator() == OO_Array_Delete)) - S.Diag(FD->getLocation(), diag::note_throw_in_dtor); - else - S.Diag(FD->getLocation(), diag::note_throw_in_function); + FD->getDeclName().getCXXOverloadedOperator() == OO_Array_Delete)) { + if (const auto *Ty = FD->getTypeSourceInfo()->getType()-> + getAs<FunctionProtoType>()) + S.Diag(FD->getLocation(), diag::note_throw_in_dtor) + << !isa<CXXDestructorDecl>(FD) << !Ty->hasExceptionSpec() + << FD->getExceptionSpecSourceRange(); + } else + S.Diag(FD->getLocation(), diag::note_throw_in_function) + << FD->getExceptionSpecSourceRange(); } } @@ -420,8 +426,7 @@ static void checkThrowInNonThrowingFunc(Sema &S, const FunctionDecl *FD, static bool isNoexcept(const FunctionDecl *FD) { const auto *FPT = FD->getType()->castAs<FunctionProtoType>(); - if (FPT->getExceptionSpecType() != EST_None && - FPT->isNothrow(FD->getASTContext())) + if (FPT->isNothrow(FD->getASTContext())) return true; return false; } diff --git a/lib/Sema/DelayedDiagnostic.cpp b/lib/Sema/DelayedDiagnostic.cpp index 2fa5718d4e9b..3d321d561e60 100644 --- a/lib/Sema/DelayedDiagnostic.cpp +++ b/lib/Sema/DelayedDiagnostic.cpp @@ -22,7 +22,8 @@ using namespace sema; DelayedDiagnostic DelayedDiagnostic::makeAvailability(AvailabilityResult AR, SourceLocation Loc, - const NamedDecl *D, + const NamedDecl *ReferringDecl, + const NamedDecl *OffendingDecl, const ObjCInterfaceDecl *UnknownObjCClass, const ObjCPropertyDecl *ObjCProperty, StringRef Msg, @@ -31,7 +32,8 @@ DelayedDiagnostic::makeAvailability(AvailabilityResult AR, DD.Kind = Availability; DD.Triggered = false; DD.Loc = Loc; - DD.AvailabilityData.Decl = D; + DD.AvailabilityData.ReferringDecl = ReferringDecl; + DD.AvailabilityData.OffendingDecl = OffendingDecl; DD.AvailabilityData.UnknownObjCClass = UnknownObjCClass; DD.AvailabilityData.ObjCProperty = ObjCProperty; char *MessageData = nullptr; diff --git a/lib/Sema/Sema.cpp b/lib/Sema/Sema.cpp index 34f5e26be810..dc9f977d41ac 100644 --- a/lib/Sema/Sema.cpp +++ b/lib/Sema/Sema.cpp @@ -705,6 +705,18 @@ void Sema::emitAndClearUnusedLocalTypedefWarnings() { UnusedLocalTypedefNameCandidates.clear(); } +/// This is called before the very first declaration in the translation unit +/// is parsed. Note that the ASTContext may have already injected some +/// declarations. +void Sema::ActOnStartOfTranslationUnit() { + if (getLangOpts().ModulesTS) { + // We start in the global module; all those declarations are implicitly + // module-private (though they do not have module linkage).
+ Context.getTranslationUnitDecl()->setModuleOwnershipKind( + Decl::ModuleOwnershipKind::ModulePrivate); + } +} + /// ActOnEndOfTranslationUnit - This is called at the very end of the /// translation unit when EOF is reached and all but the top-level scope is /// popped. diff --git a/lib/Sema/SemaCast.cpp b/lib/Sema/SemaCast.cpp index 7d534263f468..ba2049d8a606 100644 --- a/lib/Sema/SemaCast.cpp +++ b/lib/Sema/SemaCast.cpp @@ -143,6 +143,9 @@ namespace { }; } +static void DiagnoseCastQual(Sema &Self, const ExprResult &SrcExpr, + QualType DestType); + // The Try functions attempt a specific way of casting. If they succeed, they // return TC_Success. If their way of casting is not appropriate for the given // arguments, they return TC_NotApplicable and *may* set diag to a diagnostic @@ -427,6 +430,10 @@ static void diagnoseBadCast(Sema &S, unsigned msg, CastType castType, /// the same kind of pointer (plain or to-member). Unlike the Sema function, /// this one doesn't care if the two pointers-to-member don't point into the /// same class. This is because CastsAwayConstness doesn't care. +/// Additionally, it handles C++ references: if both types are references, +/// their pointee types are returned; +/// if only one of them is a reference, its pointee type is returned +/// and the other type is returned as-is. static bool UnwrapDissimilarPointerTypes(QualType& T1, QualType& T2) { const PointerType *T1PtrType = T1->getAs<PointerType>(), *T2PtrType = T2->getAs<PointerType>(); @@ -475,6 +482,26 @@ static bool UnwrapDissimilarPointerTypes(QualType& T1, QualType& T2) { return true; } + const LValueReferenceType *T1RefType = T1->getAs<LValueReferenceType>(), + *T2RefType = T2->getAs<LValueReferenceType>(); + if (T1RefType && T2RefType) { + T1 = T1RefType->getPointeeType(); + T2 = T2RefType->getPointeeType(); + return true; + } + + if (T1RefType) { + T1 = T1RefType->getPointeeType(); + // T2 = T2; + return true; + } + + if (T2RefType) { + // T1 = T1; + T2 = T2RefType->getPointeeType(); + return true; + } + return false; } @@ -503,11 +530,13 @@ CastsAwayConstness(Sema &Self, QualType SrcType, QualType DestType, // the rules are non-trivial. So first we construct Tcv *...cv* as described // in C++ 5.2.11p8. assert((SrcType->isAnyPointerType() || SrcType->isMemberPointerType() || - SrcType->isBlockPointerType()) && + SrcType->isBlockPointerType() || + DestType->isLValueReferenceType()) && "Source type is not pointer or pointer to member."); assert((DestType->isAnyPointerType() || DestType->isMemberPointerType() || - DestType->isBlockPointerType()) && - "Destination type is not pointer or pointer to member."); + DestType->isBlockPointerType() || + DestType->isLValueReferenceType()) && + "Destination type is not pointer or pointer to member, or reference."); QualType UnwrappedSrcType = Self.Context.getCanonicalType(SrcType), UnwrappedDestType = Self.Context.getCanonicalType(DestType); @@ -2177,6 +2206,8 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr, void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, bool ListInitialization) { + assert(Self.getLangOpts().CPlusPlus); + // Handle placeholders. if (isPlaceholder()) { // C-style casts can resolve __unknown_any types. @@ -2580,30 +2611,42 @@ void CastOperation::CheckCStyleCast() { if (Kind == CK_BitCast) checkCastAlign(); +} + +/// DiagnoseCastQual - Warn whenever a cast discards qualifiers, be it +/// const, volatile or both.
+static void DiagnoseCastQual(Sema &Self, const ExprResult &SrcExpr, + QualType DestType) { + if (SrcExpr.isInvalid()) + return; + + QualType SrcType = SrcExpr.get()->getType(); + if (!((SrcType->isAnyPointerType() && DestType->isAnyPointerType()) || + DestType->isLValueReferenceType())) + return; - // -Wcast-qual QualType TheOffendingSrcType, TheOffendingDestType; Qualifiers CastAwayQualifiers; - if (SrcType->isAnyPointerType() && DestType->isAnyPointerType() && - CastsAwayConstness(Self, SrcType, DestType, true, false, - &TheOffendingSrcType, &TheOffendingDestType, - &CastAwayQualifiers)) { - int qualifiers = -1; - if (CastAwayQualifiers.hasConst() && CastAwayQualifiers.hasVolatile()) { - qualifiers = 0; - } else if (CastAwayQualifiers.hasConst()) { - qualifiers = 1; - } else if (CastAwayQualifiers.hasVolatile()) { - qualifiers = 2; - } - // This is a variant of int **x; const int **y = (const int **)x; - if (qualifiers == -1) - Self.Diag(SrcExpr.get()->getLocStart(), diag::warn_cast_qual2) << - SrcType << DestType; - else - Self.Diag(SrcExpr.get()->getLocStart(), diag::warn_cast_qual) << - TheOffendingSrcType << TheOffendingDestType << qualifiers; + if (!CastsAwayConstness(Self, SrcType, DestType, true, false, + &TheOffendingSrcType, &TheOffendingDestType, + &CastAwayQualifiers)) + return; + + int qualifiers = -1; + if (CastAwayQualifiers.hasConst() && CastAwayQualifiers.hasVolatile()) { + qualifiers = 0; + } else if (CastAwayQualifiers.hasConst()) { + qualifiers = 1; + } else if (CastAwayQualifiers.hasVolatile()) { + qualifiers = 2; } + // This is a variant of int **x; const int **y = (const int **)x; + if (qualifiers == -1) + Self.Diag(SrcExpr.get()->getLocStart(), diag::warn_cast_qual2) + << SrcType << DestType; + else + Self.Diag(SrcExpr.get()->getLocStart(), diag::warn_cast_qual) + << TheOffendingSrcType << TheOffendingDestType << qualifiers; } ExprResult Sema::BuildCStyleCastExpr(SourceLocation LPLoc, @@ -2624,6 +2667,9 @@ ExprResult Sema::BuildCStyleCastExpr(SourceLocation LPLoc, if (Op.SrcExpr.isInvalid()) return ExprError(); + // -Wcast-qual + DiagnoseCastQual(Op.Self, Op.SrcExpr, Op.DestType); + return Op.complete(CStyleCastExpr::Create(Context, Op.ResultType, Op.ValueKind, Op.Kind, Op.SrcExpr.get(), &Op.BasePath, CastTypeInfo, LPLoc, RPLoc)); diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 845c4bf61b7a..8446601334ee 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -309,7 +309,8 @@ static bool SemaOpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) { Expr *BlockArg = TheCall->getArg(0); if (!isBlockPointer(BlockArg)) { S.Diag(BlockArg->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) << "block"; + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; return true; } return checkOpenCLBlockArgs(S, BlockArg); @@ -394,24 +395,24 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { // First argument always needs to be a queue_t type. if (!Arg0->getType()->isQueueT()) { S.Diag(TheCall->getArg(0)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) - << S.Context.OCLQueueTy; + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << S.Context.OCLQueueTy; return true; } // Second argument always needs to be a kernel_enqueue_flags_t enum value. if (!Arg1->getType()->isIntegerType()) { S.Diag(TheCall->getArg(1)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) - << "'kernel_enqueue_flags_t' (i.e. 
uint)"; + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "'kernel_enqueue_flags_t' (i.e. uint)"; return true; } // Third argument is always an ndrange_t type. if (Arg2->getType().getUnqualifiedType().getAsString() != "ndrange_t") { S.Diag(TheCall->getArg(2)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) - << "'ndrange_t'"; + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "'ndrange_t'"; return true; } @@ -420,8 +421,8 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { if (NumArgs == 4) { // check that the last argument is the right block type. if (!isBlockPointer(Arg3)) { - S.Diag(Arg3->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type) - << "block"; + S.Diag(Arg3->getLocStart(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; return true; } // we have a block type, check the prototype @@ -443,8 +444,8 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { // check common block argument. Expr *Arg6 = TheCall->getArg(6); if (!isBlockPointer(Arg6)) { - S.Diag(Arg6->getLocStart(), diag::err_opencl_enqueue_kernel_expected_type) - << "block"; + S.Diag(Arg6->getLocStart(), diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "block"; return true; } if (checkOpenCLBlockArgs(S, Arg6)) @@ -453,8 +454,8 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { // Forth argument has to be any integer type. if (!Arg3->getType()->isIntegerType()) { S.Diag(TheCall->getArg(3)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) - << "integer"; + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << "integer"; return true; } // check remaining common arguments. @@ -466,7 +467,8 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { Expr::NPC_ValueDependentIsNotNull) && !Arg4->getType()->getPointeeOrArrayElementType()->isClkEventT()) { S.Diag(TheCall->getArg(4)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << S.Context.getPointerType(S.Context.OCLClkEventTy); return true; } @@ -477,7 +479,8 @@ static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) { !(Arg5->getType()->isPointerType() && Arg5->getType()->getPointeeType()->isClkEventT())) { S.Diag(TheCall->getArg(5)->getLocStart(), - diag::err_opencl_enqueue_kernel_expected_type) + diag::err_opencl_builtin_expected_type) + << TheCall->getDirectCallee() << S.Context.getPointerType(S.Context.OCLClkEventTy); return true; } @@ -12094,6 +12097,8 @@ void Sema::RefersToMemberWithReducedAlignment( if (ME->isArrow()) BaseType = BaseType->getPointeeType(); RecordDecl *RD = BaseType->getAs()->getDecl(); + if (RD->isInvalidDecl()) + return; ValueDecl *MD = ME->getMemberDecl(); auto *FD = dyn_cast(MD); diff --git a/lib/Sema/SemaCoroutine.cpp b/lib/Sema/SemaCoroutine.cpp index b05c0998d3dd..dc7d8e4e9cec 100644 --- a/lib/Sema/SemaCoroutine.cpp +++ b/lib/Sema/SemaCoroutine.cpp @@ -43,9 +43,10 @@ static bool lookupMember(Sema &S, const char *Name, CXXRecordDecl *RD, /// Look up the std::coroutine_traits<...>::promise_type for the given /// function type. 
-static QualType lookupPromiseType(Sema &S, const FunctionProtoType *FnType, - SourceLocation KwLoc, - SourceLocation FuncLoc) { +static QualType lookupPromiseType(Sema &S, const FunctionDecl *FD, + SourceLocation KwLoc) { + const FunctionProtoType *FnType = FD->getType()->castAs<FunctionProtoType>(); + const SourceLocation FuncLoc = FD->getLocation(); // FIXME: Cache std::coroutine_traits once we've found it. NamespaceDecl *StdExp = S.lookupStdExperimentalNamespace(); if (!StdExp) { @@ -71,16 +72,35 @@ static QualType lookupPromiseType(Sema &S, const FunctionProtoType *FnType, return QualType(); } - // Form template argument list for coroutine_traits. + // Form template argument list for coroutine_traits according + // to [dcl.fct.def.coroutine]3 TemplateArgumentListInfo Args(KwLoc, KwLoc); - Args.addArgument(TemplateArgumentLoc( - TemplateArgument(FnType->getReturnType()), - S.Context.getTrivialTypeSourceInfo(FnType->getReturnType(), KwLoc))); - // FIXME: If the function is a non-static member function, add the type - // of the implicit object parameter before the formal parameters. - for (QualType T : FnType->getParamTypes()) + auto AddArg = [&](QualType T) { Args.addArgument(TemplateArgumentLoc( TemplateArgument(T), S.Context.getTrivialTypeSourceInfo(T, KwLoc))); + }; + AddArg(FnType->getReturnType()); + // If the function is a non-static member function, add the type + // of the implicit object parameter before the formal parameters. + if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) { + if (MD->isInstance()) { + // [over.match.funcs]4 + // For non-static member functions, the type of the implicit object + // parameter is + // -- "lvalue reference to cv X" for functions declared without a + // ref-qualifier or with the & ref-qualifier + // -- "rvalue reference to cv X" for functions declared with the && + // ref-qualifier + QualType T = + MD->getThisType(S.Context)->getAs<PointerType>()->getPointeeType(); + T = FnType->getRefQualifier() == RQ_RValue + ? S.Context.getRValueReferenceType(T) + : S.Context.getLValueReferenceType(T, /*SpelledAsLValue*/ true); + AddArg(T); + } + } + for (QualType T : FnType->getParamTypes()) + AddArg(T); // Build the template-id. QualType CoroTrait = @@ -424,12 +444,16 @@ static ExprResult buildPromiseCall(Sema &S, VarDecl *Promise, VarDecl *Sema::buildCoroutinePromise(SourceLocation Loc) { assert(isa<FunctionDecl>(CurContext) && "not in a function scope"); auto *FD = cast<FunctionDecl>(CurContext); + bool IsThisDependentType = [&] { + if (auto *MD = dyn_cast_or_null<CXXMethodDecl>(FD)) + return MD->isInstance() && MD->getThisType(Context)->isDependentType(); + else + return false; + }(); - QualType T = - FD->getType()->isDependentType() - ? Context.DependentTy - : lookupPromiseType(*this, FD->getType()->castAs<FunctionProtoType>(), - Loc, FD->getLocation()); + QualType T = FD->getType()->isDependentType() || IsThisDependentType + ? Context.DependentTy + : lookupPromiseType(*this, FD, Loc); if (T.isNull()) return nullptr; @@ -721,8 +745,6 @@ static Expr *buildStdNoThrowDeclRef(Sema &S, SourceLocation Loc) { return nullptr; } - // FIXME: Mark the variable as ODR used. This currently does not work - // likely due to the scope at in which this function is called.
auto *VD = Result.getAsSingle<VarDecl>(); if (!VD) { Result.suppressDiagnostics(); return nullptr; } diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp index ef8a408f90de..31b24f91c1d9 100644 --- a/lib/Sema/SemaDecl.cpp +++ b/lib/Sema/SemaDecl.cpp @@ -1998,8 +1998,7 @@ static void filterNonConflictingPreviousTypedefDecls(Sema &S, // If both declarations give a tag declaration a typedef name for linkage // purposes, then they declare the same entity. - if (S.getLangOpts().CPlusPlus && - OldTD->getAnonDeclWithTypedefName(/*AnyRedecl*/true) && + if (OldTD->getAnonDeclWithTypedefName(/*AnyRedecl*/true) && Decl->getAnonDeclWithTypedefName()) continue; } @@ -2117,7 +2116,7 @@ void Sema::MergeTypedefNameDecl(Scope *S, TypedefNameDecl *New, auto *OldTag = OldTD->getAnonDeclWithTypedefName(/*AnyRedecl*/true); auto *NewTag = New->getAnonDeclWithTypedefName(); NamedDecl *Hidden = nullptr; - if (getLangOpts().CPlusPlus && OldTag && NewTag && + if (OldTag && NewTag && OldTag->getCanonicalDecl() != NewTag->getCanonicalDecl() && !hasVisibleDefinition(OldTag, &Hidden)) { // There is a definition of this tag, but it is not visible. Use it @@ -16055,8 +16054,6 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, return nullptr; } - // FIXME: Create a ModuleDecl and return it. - // FIXME: Most of this work should be done by the preprocessor rather than // here, in order to support macro import. @@ -16070,6 +16067,8 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, ModuleName += Piece.first->getName(); } + // FIXME: If we've already seen a module-declaration, report an error. + // If a module name was explicitly specified on the command line, it must be // correct. if (!getLangOpts().CurrentModule.empty() && @@ -16082,6 +16081,7 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, const_cast<LangOptions &>(getLangOpts()).CurrentModule = ModuleName; auto &Map = PP.getHeaderSearchInfo().getModuleMap(); + Module *Mod; switch (MDK) { case ModuleDeclKind::Module: { @@ -16100,12 +16100,9 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, } // Create a Module for the module that we're defining. - Module *Mod = Map.createModuleForInterfaceUnit(ModuleLoc, ModuleName); + Mod = Map.createModuleForInterfaceUnit(ModuleLoc, ModuleName); assert(Mod && "module creation should not fail"); - - // Enter the semantic scope of the module. - ActOnModuleBegin(ModuleLoc, Mod); - return nullptr; + break; } case ModuleDeclKind::Partition: @@ -16115,14 +16112,26 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, case ModuleDeclKind::Implementation: std::pair<IdentifierInfo *, SourceLocation> ModuleNameLoc( PP.getIdentifierInfo(ModuleName), Path[0].second); - - DeclResult Import = ActOnModuleImport(ModuleLoc, ModuleLoc, ModuleNameLoc); - if (Import.isInvalid()) + Mod = getModuleLoader().loadModule(ModuleLoc, Path, Module::AllVisible, + /*IsIncludeDirective=*/false); + if (!Mod) return nullptr; - return ConvertDeclToDeclGroup(Import.get()); + break; } - llvm_unreachable("unexpected module decl kind"); + // Enter the semantic scope of the module. + ModuleScopes.push_back({}); + ModuleScopes.back().Module = Mod; + ModuleScopes.back().OuterVisibleModules = std::move(VisibleModules); + VisibleModules.setVisible(Mod, ModuleLoc); + + // From now on, we have an owning module for all declarations we see. + // However, those declarations are module-private unless explicitly + // exported. + Context.getTranslationUnitDecl()->setLocalOwningModule(Mod); + + // FIXME: Create a ModuleDecl.
+ return nullptr; } DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, @@ -16311,6 +16320,7 @@ Decl *Sema::ActOnStartExportDecl(Scope *S, SourceLocation ExportLoc, CurContext->addDecl(D); PushDeclContext(S, D); + D->setModuleOwnershipKind(Decl::ModuleOwnershipKind::VisibleWhenImported); return D; } diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp index 1929bc539188..5fb79a6bf630 100644 --- a/lib/Sema/SemaDeclAttr.cpp +++ b/lib/Sema/SemaDeclAttr.cpp @@ -6929,8 +6929,34 @@ shouldDiagnoseAvailabilityByDefault(const ASTContext &Context, DeclVersion >= ForceAvailabilityFromVersion; } +static NamedDecl *findEnclosingDeclToAnnotate(Decl *OrigCtx) { + for (Decl *Ctx = OrigCtx; Ctx; + Ctx = cast_or_null<Decl>(Ctx->getDeclContext())) { + if (isa<TagDecl>(Ctx) || isa<FunctionDecl>(Ctx) || isa<ObjCMethodDecl>(Ctx)) + return cast<NamedDecl>(Ctx); + if (auto *CD = dyn_cast<ObjCContainerDecl>(Ctx)) { + if (auto *Imp = dyn_cast<ObjCImplDecl>(Ctx)) + return Imp->getClassInterface(); + return CD; + } + } + + return dyn_cast<NamedDecl>(OrigCtx); +} + +/// Actually emit an availability diagnostic for a reference to an unavailable +/// decl. +/// +/// \param Ctx The context that the reference occurred in +/// \param ReferringDecl The exact declaration that was referenced. +/// \param OffendingDecl A related decl to \c ReferringDecl that has an +/// availability attribute corresponding to \c K attached to it. Note that this +/// may not be the same as ReferringDecl, e.g. if an EnumDecl is annotated and +/// we refer to a member EnumConstantDecl, ReferringDecl is the EnumConstantDecl +/// and OffendingDecl is the EnumDecl. static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, - Decl *Ctx, const NamedDecl *D, + Decl *Ctx, const NamedDecl *ReferringDecl, + const NamedDecl *OffendingDecl, StringRef Message, SourceLocation Loc, const ObjCInterfaceDecl *UnknownObjCClass, const ObjCPropertyDecl *ObjCProperty, @@ -6938,7 +6964,7 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, // Diagnostics for deprecated or unavailable. unsigned diag, diag_message, diag_fwdclass_message; unsigned diag_available_here = diag::note_availability_specified_here; - SourceLocation NoteLocation = D->getLocation(); + SourceLocation NoteLocation = OffendingDecl->getLocation(); // Matches 'diag::note_property_attribute' options. unsigned property_note_select; @@ -6947,7 +6973,7 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, unsigned available_here_select_kind; VersionTuple DeclVersion; - if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, D)) + if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, OffendingDecl)) DeclVersion = AA->getIntroduced(); if (!ShouldDiagnoseAvailabilityInContext(S, K, DeclVersion, Ctx)) @@ -6961,7 +6987,7 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, diag_fwdclass_message = diag::warn_deprecated_fwdclass_message; property_note_select = /* deprecated */ 0; available_here_select_kind = /* deprecated */ 2; - if (const auto *attr = D->getAttr<DeprecatedAttr>()) + if (const auto *attr = OffendingDecl->getAttr<DeprecatedAttr>()) NoteLocation = attr->getLocation(); break; @@ -6973,13 +6999,14 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, property_note_select = /* unavailable */ 1; available_here_select_kind = /* unavailable */ 0; - if (auto attr = D->getAttr<UnavailableAttr>()) { + if (auto attr = OffendingDecl->getAttr<UnavailableAttr>()) { if (attr->isImplicit() && attr->getImplicitReason()) { // Most of these failures are due to extra restrictions in ARC; // reflect that in the primary diagnostic when applicable.
auto flagARCError = [&] { if (S.getLangOpts().ObjCAutoRefCount && - S.getSourceManager().isInSystemHeader(D->getLocation())) + S.getSourceManager().isInSystemHeader( + OffendingDecl->getLocation())) diag = diag::err_unavailable_in_arc; }; @@ -7022,7 +7049,8 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, // not specified for deployment targets >= to iOS 11 or equivalent or // for declarations that were introduced in iOS 11 (macOS 10.13, ...) or // later. - const AvailabilityAttr *AA = getAttrForPlatform(S.getASTContext(), D); + const AvailabilityAttr *AA = + getAttrForPlatform(S.getASTContext(), OffendingDecl); VersionTuple Introduced = AA->getIntroduced(); bool NewWarning = shouldDiagnoseAvailabilityByDefault( S.Context, S.Context.getTargetInfo().getPlatformMinVersion(), @@ -7045,9 +7073,9 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, CharSourceRange UseRange; StringRef Replacement; if (K == AR_Deprecated) { - if (auto attr = D->getAttr()) + if (auto attr = OffendingDecl->getAttr()) Replacement = attr->getReplacement(); - if (auto attr = getAttrForPlatform(S.Context, D)) + if (auto attr = getAttrForPlatform(S.Context, OffendingDecl)) Replacement = attr->getReplacement(); if (!Replacement.empty()) @@ -7056,21 +7084,21 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, } if (!Message.empty()) { - S.Diag(Loc, diag_message) << D << Message + S.Diag(Loc, diag_message) << ReferringDecl << Message << (UseRange.isValid() ? FixItHint::CreateReplacement(UseRange, Replacement) : FixItHint()); if (ObjCProperty) S.Diag(ObjCProperty->getLocation(), diag::note_property_attribute) << ObjCProperty->getDeclName() << property_note_select; } else if (!UnknownObjCClass) { - S.Diag(Loc, diag) << D + S.Diag(Loc, diag) << ReferringDecl << (UseRange.isValid() ? FixItHint::CreateReplacement(UseRange, Replacement) : FixItHint()); if (ObjCProperty) S.Diag(ObjCProperty->getLocation(), diag::note_property_attribute) << ObjCProperty->getDeclName() << property_note_select; } else { - S.Diag(Loc, diag_fwdclass_message) << D + S.Diag(Loc, diag_fwdclass_message) << ReferringDecl << (UseRange.isValid() ? FixItHint::CreateReplacement(UseRange, Replacement) : FixItHint()); S.Diag(UnknownObjCClass->getLocation(), diag::note_forward_class); @@ -7078,16 +7106,16 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, // The declaration can have multiple availability attributes, we are looking // at one of them. - const AvailabilityAttr *A = getAttrForPlatform(S.Context, D); + const AvailabilityAttr *A = getAttrForPlatform(S.Context, OffendingDecl); if (A && A->isInherited()) { - for (const Decl *Redecl = D->getMostRecentDecl(); Redecl; + for (const Decl *Redecl = OffendingDecl->getMostRecentDecl(); Redecl; Redecl = Redecl->getPreviousDecl()) { const AvailabilityAttr *AForRedecl = getAttrForPlatform(S.Context, Redecl); if (AForRedecl && !AForRedecl->isInherited()) { // If D is a declaration with inherited attributes, the note should // point to the declaration with actual attributes. 
- S.Diag(Redecl->getLocation(), diag_available_here) << D + S.Diag(Redecl->getLocation(), diag_available_here) << OffendingDecl << available_here_select_kind; break; } @@ -7095,10 +7123,19 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K, } else S.Diag(NoteLocation, diag_available_here) - << D << available_here_select_kind; + << OffendingDecl << available_here_select_kind; if (K == AR_NotYetIntroduced) - S.Diag(Loc, diag::note_partial_availability_silence) << D; + if (const auto *Enclosing = findEnclosingDeclToAnnotate(Ctx)) { + if (auto *TD = dyn_cast(Enclosing)) + if (TD->getDeclName().isEmpty()) { + S.Diag(TD->getLocation(), diag::note_partial_availability_silence) + << /*Anonymous*/1 << TD->getKindName(); + return; + } + S.Diag(Enclosing->getLocation(), diag::note_partial_availability_silence) + << /*Named*/0 << Enclosing; + } } static void handleDelayedAvailabilityCheck(Sema &S, DelayedDiagnostic &DD, @@ -7108,9 +7145,9 @@ static void handleDelayedAvailabilityCheck(Sema &S, DelayedDiagnostic &DD, DD.Triggered = true; DoEmitAvailabilityWarning( - S, DD.getAvailabilityResult(), Ctx, DD.getAvailabilityDecl(), - DD.getAvailabilityMessage(), DD.Loc, DD.getUnknownObjCClass(), - DD.getObjCProperty(), false); + S, DD.getAvailabilityResult(), Ctx, DD.getAvailabilityReferringDecl(), + DD.getAvailabilityOffendingDecl(), DD.getAvailabilityMessage(), DD.Loc, + DD.getUnknownObjCClass(), DD.getObjCProperty(), false); } void Sema::PopParsingDeclaration(ParsingDeclState state, Decl *decl) { @@ -7169,22 +7206,25 @@ void Sema::redelayDiagnostics(DelayedDiagnosticPool &pool) { } void Sema::EmitAvailabilityWarning(AvailabilityResult AR, - NamedDecl *D, StringRef Message, - SourceLocation Loc, + const NamedDecl *ReferringDecl, + const NamedDecl *OffendingDecl, + StringRef Message, SourceLocation Loc, const ObjCInterfaceDecl *UnknownObjCClass, - const ObjCPropertyDecl *ObjCProperty, + const ObjCPropertyDecl *ObjCProperty, bool ObjCPropertyAccess) { // Delay if we're currently parsing a declaration. if (DelayedDiagnostics.shouldDelayDiagnostics()) { - DelayedDiagnostics.add(DelayedDiagnostic::makeAvailability( - AR, Loc, D, UnknownObjCClass, ObjCProperty, Message, - ObjCPropertyAccess)); + DelayedDiagnostics.add( + DelayedDiagnostic::makeAvailability( + AR, Loc, ReferringDecl, OffendingDecl, UnknownObjCClass, + ObjCProperty, Message, ObjCPropertyAccess)); return; } Decl *Ctx = cast(getCurLexicalContext()); - DoEmitAvailabilityWarning(*this, AR, Ctx, D, Message, Loc, UnknownObjCClass, - ObjCProperty, ObjCPropertyAccess); + DoEmitAvailabilityWarning(*this, AR, Ctx, ReferringDecl, OffendingDecl, + Message, Loc, UnknownObjCClass, ObjCProperty, + ObjCPropertyAccess); } namespace { @@ -7336,19 +7376,21 @@ class DiagnoseUnguardedAvailability void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability( NamedDecl *D, SourceRange Range) { - - VersionTuple ContextVersion = AvailabilityStack.back(); - if (AvailabilityResult Result = - SemaRef.ShouldDiagnoseAvailabilityOfDecl(D, nullptr)) { + AvailabilityResult Result; + const NamedDecl *OffendingDecl; + std::tie(Result, OffendingDecl) = + SemaRef.ShouldDiagnoseAvailabilityOfDecl(D, nullptr); + if (Result != AR_Available) { // All other diagnostic kinds have already been handled in // DiagnoseAvailabilityOfDecl. 
if (Result != AR_NotYetIntroduced) return; - const AvailabilityAttr *AA = getAttrForPlatform(SemaRef.getASTContext(), D); + const AvailabilityAttr *AA = + getAttrForPlatform(SemaRef.getASTContext(), OffendingDecl); VersionTuple Introduced = AA->getIntroduced(); - if (ContextVersion >= Introduced) + if (AvailabilityStack.back() >= Introduced) return; // If the context of this function is less available than D, we should not @@ -7373,8 +7415,9 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability( SemaRef.getASTContext().getTargetInfo().getPlatformName()) << Introduced.getAsString(); - SemaRef.Diag(D->getLocation(), diag::note_availability_specified_here) - << D << /* partial */ 3; + SemaRef.Diag(OffendingDecl->getLocation(), + diag::note_availability_specified_here) + << OffendingDecl << /* partial */ 3; auto FixitDiag = SemaRef.Diag(Range.getBegin(), diag::note_unguarded_available_silence) diff --git a/lib/Sema/SemaDeclObjC.cpp b/lib/Sema/SemaDeclObjC.cpp index 2c8080dbf02b..778b8062f68c 100644 --- a/lib/Sema/SemaDeclObjC.cpp +++ b/lib/Sema/SemaDeclObjC.cpp @@ -458,7 +458,10 @@ static void diagnoseUseOfProtocols(Sema &TheSema, // Diagnose availability in the context of the ObjC container. Sema::ContextRAII SavedContext(TheSema, CD); for (unsigned i = 0; i < NumProtoRefs; ++i) { - (void)TheSema.DiagnoseUseOfDecl(ProtoRefs[i], ProtoLocs[i]); + (void)TheSema.DiagnoseUseOfDecl(ProtoRefs[i], ProtoLocs[i], + /*UnknownObjCClass=*/nullptr, + /*ObjCPropertyAccess=*/false, + /*AvoidPartialAvailabilityChecks=*/true); } } diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp index f49df6b3216d..8016bf99889f 100644 --- a/lib/Sema/SemaExpr.cpp +++ b/lib/Sema/SemaExpr.cpp @@ -87,24 +87,9 @@ static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) { } } -static bool HasRedeclarationWithoutAvailabilityInCategory(const Decl *D) { - const auto *OMD = dyn_cast<ObjCMethodDecl>(D); - if (!OMD) - return false; - const ObjCInterfaceDecl *OID = OMD->getClassInterface(); - if (!OID) - return false; - - for (const ObjCCategoryDecl *Cat : OID->visible_categories()) - if (ObjCMethodDecl *CatMeth = - Cat->getMethod(OMD->getSelector(), OMD->isInstanceMethod())) - if (!CatMeth->hasAttr<AvailabilityAttr>()) - return true; - return false; -} - -AvailabilityResult -Sema::ShouldDiagnoseAvailabilityOfDecl(NamedDecl *&D, std::string *Message) { +std::pair<AvailabilityResult, const NamedDecl *> +Sema::ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D, + std::string *Message) { AvailabilityResult Result = D->getAvailability(Message); // For typedefs, if the typedef declaration appears available look @@ -121,78 +106,61 @@ Sema::ShouldDiagnoseAvailabilityOfDecl(NamedDecl *&D, std::string *Message) { } // Forward class declarations get their attributes from their definition. - if (ObjCInterfaceDecl *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) { + if (const ObjCInterfaceDecl *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) { if (IDecl->getDefinition()) { D = IDecl->getDefinition(); Result = D->getAvailability(Message); } } - if (const EnumConstantDecl *ECD = dyn_cast<EnumConstantDecl>(D)) + if (const auto *ECD = dyn_cast<EnumConstantDecl>(D)) if (Result == AR_Available) { const DeclContext *DC = ECD->getDeclContext(); - if (const EnumDecl *TheEnumDecl = dyn_cast<EnumDecl>(DC)) + if (const auto *TheEnumDecl = dyn_cast<EnumDecl>(DC)) { Result = TheEnumDecl->getAvailability(Message); + D = TheEnumDecl; + } } - if (Result == AR_NotYetIntroduced) { - // Don't do this for enums, they can't be redeclared.
- if (isa<EnumConstantDecl>(D) || isa<EnumDecl>(D)) - return AR_Available; - - bool Warn = !D->getAttr<AvailabilityAttr>()->isInherited(); - // Objective-C method declarations in categories are not modelled as - // redeclarations, so manually look for a redeclaration in a category - // if necessary. - if (Warn && HasRedeclarationWithoutAvailabilityInCategory(D)) - Warn = false; - // In general, D will point to the most recent redeclaration. However, - // for `@class A;` decls, this isn't true -- manually go through the - // redecl chain in that case. - if (Warn && isa<ObjCInterfaceDecl>(D)) - for (Decl *Redecl = D->getMostRecentDecl(); Redecl && Warn; - Redecl = Redecl->getPreviousDecl()) - if (!Redecl->hasAttr<AvailabilityAttr>() || - Redecl->getAttr<AvailabilityAttr>()->isInherited()) - Warn = false; - - return Warn ? AR_NotYetIntroduced : AR_Available; - } - - return Result; + return {Result, D}; } static void DiagnoseAvailabilityOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc, const ObjCInterfaceDecl *UnknownObjCClass, - bool ObjCPropertyAccess) { + bool ObjCPropertyAccess, + bool AvoidPartialAvailabilityChecks = false) { std::string Message; + AvailabilityResult Result; + const NamedDecl* OffendingDecl; // See if this declaration is unavailable, deprecated, or partial. - if (AvailabilityResult Result = - S.ShouldDiagnoseAvailabilityOfDecl(D, &Message)) { + std::tie(Result, OffendingDecl) = S.ShouldDiagnoseAvailabilityOfDecl(D, &Message); + if (Result == AR_Available) + return; - if (Result == AR_NotYetIntroduced) { - if (S.getCurFunctionOrMethodDecl()) { - S.getEnclosingFunction()->HasPotentialAvailabilityViolations = true; - return; - } else if (S.getCurBlock() || S.getCurLambda()) { - S.getCurFunction()->HasPotentialAvailabilityViolations = true; - return; - } + if (Result == AR_NotYetIntroduced) { + if (AvoidPartialAvailabilityChecks) + return; + if (S.getCurFunctionOrMethodDecl()) { + S.getEnclosingFunction()->HasPotentialAvailabilityViolations = true; + return; + } else if (S.getCurBlock() || S.getCurLambda()) { + S.getCurFunction()->HasPotentialAvailabilityViolations = true; + return; } - - const ObjCPropertyDecl *ObjCPDecl = nullptr; - if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) { - if (const ObjCPropertyDecl *PD = MD->findPropertyDecl()) { - AvailabilityResult PDeclResult = PD->getAvailability(nullptr); - if (PDeclResult == Result) - ObjCPDecl = PD; - } - } - - S.EmitAvailabilityWarning(Result, D, Message, Loc, UnknownObjCClass, - ObjCPDecl, ObjCPropertyAccess); } + + const ObjCPropertyDecl *ObjCPDecl = nullptr; + if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) { + if (const ObjCPropertyDecl *PD = MD->findPropertyDecl()) { + AvailabilityResult PDeclResult = PD->getAvailability(nullptr); + if (PDeclResult == Result) + ObjCPDecl = PD; + } + } + + S.EmitAvailabilityWarning(Result, D, OffendingDecl, Message, Loc, + UnknownObjCClass, ObjCPDecl, ObjCPropertyAccess); } /// \brief Emit a note explaining that this function is deleted. @@ -310,7 +278,8 @@ void Sema::MaybeSuggestAddingStaticToDecl(const FunctionDecl *Cur) { /// bool Sema::DiagnoseUseOfDecl(NamedDecl *D, SourceLocation Loc, const ObjCInterfaceDecl *UnknownObjCClass, - bool ObjCPropertyAccess) { + bool ObjCPropertyAccess, + bool AvoidPartialAvailabilityChecks) { if (getLangOpts().CPlusPlus && isa<FunctionDecl>(D)) { // If there were any diagnostics suppressed by template argument deduction, // emit them now.
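The rework above changes Sema::ShouldDiagnoseAvailabilityOfDecl to return both the availability result and the declaration that actually carries the availability attribute, which callers unpack with std::tie. A minimal sketch of that pattern, using simplified stand-in types rather than the real clang classes (shouldDiagnose, Avail, and EnclosingEnum are hypothetical names):

#include <tuple>
#include <utility>

enum AvailabilityResult { AR_Available, AR_Deprecated, AR_NotYetIntroduced };

struct NamedDecl {
  AvailabilityResult Avail;
  const NamedDecl *EnclosingEnum; // e.g. the EnumDecl of an enumerator
};

// Walk from the referenced decl to the decl carrying the attribute and
// return both pieces of information at once.
std::pair<AvailabilityResult, const NamedDecl *>
shouldDiagnose(const NamedDecl *D) {
  AvailabilityResult Result = D->Avail;
  if (Result == AR_Available && D->EnclosingEnum) {
    D = D->EnclosingEnum; // an enumerator inherits its enum's availability
    Result = D->Avail;
  }
  return {Result, D};
}

void diagnose(const NamedDecl *Referring) {
  AvailabilityResult Result;
  const NamedDecl *Offending;
  std::tie(Result, Offending) = shouldDiagnose(Referring);
  if (Result == AR_Available)
    return;
  // 'Referring' is what the user wrote and names the diagnostic; 'Offending'
  // carries the attribute and anchors the "specified here" note.
}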
@@ -395,7 +364,8 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, SourceLocation Loc, } DiagnoseAvailabilityOfDecl(*this, D, Loc, UnknownObjCClass, - ObjCPropertyAccess); + ObjCPropertyAccess, + AvoidPartialAvailabilityChecks); DiagnoseUnusedOfDecl(*this, D, Loc); @@ -14695,24 +14665,24 @@ static void MarkExprReferenced(Sema &SemaRef, SourceLocation Loc, ME->performsVirtualDispatch(SemaRef.getLangOpts()); if (!IsVirtualCall) return; - const Expr *Base = ME->getBase(); - const CXXRecordDecl *MostDerivedClassDecl = Base->getBestDynamicClassType(); - if (!MostDerivedClassDecl) - return; - CXXMethodDecl *DM = MD->getCorrespondingMethodInClass(MostDerivedClassDecl); - if (!DM || DM->isPure()) - return; - SemaRef.MarkAnyDeclReferenced(Loc, DM, MightBeOdrUse); + + // If it's possible to devirtualize the call, mark the called function + // referenced. + CXXMethodDecl *DM = MD->getDevirtualizedMethod( + ME->getBase(), SemaRef.getLangOpts().AppleKext); + if (DM) + SemaRef.MarkAnyDeclReferenced(Loc, DM, MightBeOdrUse); } /// \brief Perform reference-marking and odr-use handling for a DeclRefExpr. -void Sema::MarkDeclRefReferenced(DeclRefExpr *E) { +void Sema::MarkDeclRefReferenced(DeclRefExpr *E, const Expr *Base) { // TODO: update this with DR# once a defect report is filed. // C++11 defect. The address of a pure member should not be an ODR use, even // if it's a qualified reference. bool OdrUse = true; - if (CXXMethodDecl *Method = dyn_cast(E->getDecl())) - if (Method->isVirtual()) + if (const CXXMethodDecl *Method = dyn_cast(E->getDecl())) + if (Method->isVirtual() && + !Method->getDevirtualizedMethod(Base, getLangOpts().AppleKext)) OdrUse = false; MarkExprReferenced(*this, E->getLocation(), E->getDecl(), E, OdrUse); } diff --git a/lib/Sema/SemaExprMember.cpp b/lib/Sema/SemaExprMember.cpp index b18de7e94686..c3d0e2db76b6 100644 --- a/lib/Sema/SemaExprMember.cpp +++ b/lib/Sema/SemaExprMember.cpp @@ -1842,10 +1842,6 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow, FoundDecl, Field); if (Base.isInvalid()) return ExprError(); - MemberExpr *ME = - BuildMemberExpr(*this, Context, Base.get(), IsArrow, OpLoc, SS, - /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl, - MemberNameInfo, MemberType, VK, OK); // Build a reference to a private copy for non-static data members in // non-static member functions, privatized by OpenMP constructs. @@ -1855,7 +1851,10 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow, if (auto *PrivateCopy = IsOpenMPCapturedDecl(Field)) return getOpenMPCapturedExpr(PrivateCopy, VK, OK, OpLoc); } - return ME; + + return BuildMemberExpr(*this, Context, Base.get(), IsArrow, OpLoc, SS, + /*TemplateKWLoc=*/SourceLocation(), Field, FoundDecl, + MemberNameInfo, MemberType, VK, OK); } /// Builds an implicit member access expression. The current context diff --git a/lib/Sema/SemaLambda.cpp b/lib/Sema/SemaLambda.cpp index d6b70610d461..46f2ba376006 100644 --- a/lib/Sema/SemaLambda.cpp +++ b/lib/Sema/SemaLambda.cpp @@ -1595,7 +1595,7 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc, ContainsUnexpandedParameterPack); // If the lambda expression's call operator is not explicitly marked constexpr // and we are not in a dependent context, analyze the call operator to infer - // its constexpr-ness, supressing diagnostics while doing so. + // its constexpr-ness, suppressing diagnostics while doing so. 
if (getLangOpts().CPlusPlus1z && !CallOperator->isInvalidDecl() && !CallOperator->isConstexpr() && !isa(CallOperator->getBody()) && diff --git a/lib/Sema/SemaLookup.cpp b/lib/Sema/SemaLookup.cpp index 2e7fb875a276..85596ed52e9d 100644 --- a/lib/Sema/SemaLookup.cpp +++ b/lib/Sema/SemaLookup.cpp @@ -1395,6 +1395,20 @@ bool Sema::hasVisibleMergedDefinition(NamedDecl *Def) { return false; } +bool Sema::hasMergedDefinitionInCurrentModule(NamedDecl *Def) { + // FIXME: When not in local visibility mode, we can't tell the difference + // between a declaration being visible because we merged a local copy of + // the same declaration into it, and it being visible because its owning + // module is visible. + if (Def->getModuleOwnershipKind() == Decl::ModuleOwnershipKind::Visible && + getLangOpts().ModulesLocalVisibility) + return true; + for (Module *Merged : Context.getModulesWithMergedDefinition(Def)) + if (Merged->getTopLevelModuleName() == getLangOpts().CurrentModule) + return true; + return false; +} + template static bool hasVisibleDefaultArgument(Sema &S, const ParmDecl *D, @@ -1495,23 +1509,40 @@ bool LookupResult::isVisibleSlow(Sema &SemaRef, NamedDecl *D) { assert(D->isHidden() && "should not call this: not in slow case"); Module *DeclModule = SemaRef.getOwningModule(D); - assert(DeclModule && "hidden decl not from a module"); + if (!DeclModule) { + // A module-private declaration with no owning module means this is in the + // global module in the C++ Modules TS. This is visible within the same + // translation unit only. + // FIXME: Don't assume that "same translation unit" means the same thing + // as "not from an AST file". + assert(D->isModulePrivate() && "hidden decl has no module"); + if (!D->isFromASTFile() || SemaRef.hasMergedDefinitionInCurrentModule(D)) + return true; + } else { + // If the owning module is visible, and the decl is not module private, + // then the decl is visible too. (Module private is ignored within the same + // top-level module.) + if (D->isModulePrivate() + ? DeclModule->getTopLevelModuleName() == + SemaRef.getLangOpts().CurrentModule || + SemaRef.hasMergedDefinitionInCurrentModule(D) + : SemaRef.isModuleVisible(DeclModule) || + SemaRef.hasVisibleMergedDefinition(D)) + return true; + } - // If the owning module is visible, and the decl is not module private, - // then the decl is visible too. (Module private is ignored within the same - // top-level module.) - // FIXME: Check the owning module for module-private declarations rather than - // assuming "same AST file" is the same thing as "same module". - if ((!D->isFromASTFile() || !D->isModulePrivate()) && - (SemaRef.isModuleVisible(DeclModule) || - SemaRef.hasVisibleMergedDefinition(D))) - return true; + // Determine whether a decl context is a file context for the purpose of + // visibility. This looks through some (export and linkage spec) transparent + // contexts, but not others (enums). + auto IsEffectivelyFileContext = [](const DeclContext *DC) { + return DC->isFileContext() || isa(DC) || + isa(DC); + }; - // If this declaration is not at namespace scope nor module-private, + // If this declaration is not at namespace scope // then it is visible if its lexical parent has a visible definition. 
DeclContext *DC = D->getLexicalDeclContext(); - if (!D->isModulePrivate() && DC && !DC->isFileContext() && - !isa(DC) && !isa(DC)) { + if (DC && !IsEffectivelyFileContext(DC)) { // For a parameter, check whether our current template declaration's // lexical context is visible, not whether there's some other visible // definition of it, because parameters aren't "within" the definition. @@ -1519,32 +1550,45 @@ bool LookupResult::isVisibleSlow(Sema &SemaRef, NamedDecl *D) { // In C++ we need to check for a visible definition due to ODR merging, // and in C we must not because each declaration of a function gets its own // set of declarations for tags in prototype scope. - if ((D->isTemplateParameter() || isa(D) - || (isa(DC) && !SemaRef.getLangOpts().CPlusPlus)) - ? isVisible(SemaRef, cast(DC)) - : SemaRef.hasVisibleDefinition(cast(DC))) { - if (SemaRef.CodeSynthesisContexts.empty() && - // FIXME: Do something better in this case. - !SemaRef.getLangOpts().ModulesLocalVisibility) { - // Cache the fact that this declaration is implicitly visible because - // its parent has a visible definition. - D->setVisibleDespiteOwningModule(); - } - return true; + bool VisibleWithinParent; + if (D->isTemplateParameter() || isa(D) || + (isa(DC) && !SemaRef.getLangOpts().CPlusPlus)) + VisibleWithinParent = isVisible(SemaRef, cast(DC)); + else if (D->isModulePrivate()) { + // A module-private declaration is only visible if an enclosing lexical + // parent was merged with another definition in the current module. + VisibleWithinParent = false; + do { + if (SemaRef.hasMergedDefinitionInCurrentModule(cast(DC))) { + VisibleWithinParent = true; + break; + } + DC = DC->getLexicalParent(); + } while (!IsEffectivelyFileContext(DC)); + } else { + VisibleWithinParent = SemaRef.hasVisibleDefinition(cast(DC)); } - return false; + + if (VisibleWithinParent && SemaRef.CodeSynthesisContexts.empty() && + // FIXME: Do something better in this case. + !SemaRef.getLangOpts().ModulesLocalVisibility) { + // Cache the fact that this declaration is implicitly visible because + // its parent has a visible definition. + D->setVisibleDespiteOwningModule(); + } + return VisibleWithinParent; } + // FIXME: All uses of DeclModule below this point should also check merged + // modules. + if (!DeclModule) + return false; + // Find the extra places where we need to look. llvm::DenseSet &LookupModules = SemaRef.getLookupModules(); if (LookupModules.empty()) return false; - if (!DeclModule) { - DeclModule = SemaRef.getOwningModule(D); - assert(DeclModule && "hidden decl not from a module"); - } - // If our lookup set contains the decl's module, it's visible. if (LookupModules.count(DeclModule)) return true; diff --git a/lib/Sema/SemaObjCProperty.cpp b/lib/Sema/SemaObjCProperty.cpp index 6c5716454874..62a771bcffa0 100644 --- a/lib/Sema/SemaObjCProperty.cpp +++ b/lib/Sema/SemaObjCProperty.cpp @@ -1676,8 +1676,9 @@ static bool SuperClassImplementsProperty(ObjCInterfaceDecl *IDecl, /// \brief Default synthesizes all properties which must be synthesized /// in class's \@implementation. 
-void Sema::DefaultSynthesizeProperties(Scope *S, ObjCImplDecl* IMPDecl, - ObjCInterfaceDecl *IDecl) { +void Sema::DefaultSynthesizeProperties(Scope *S, ObjCImplDecl *IMPDecl, + ObjCInterfaceDecl *IDecl, + SourceLocation AtEnd) { ObjCInterfaceDecl::PropertyMap PropMap; ObjCInterfaceDecl::PropertyDeclOrder PropertyOrder; IDecl->collectPropertiesToImplement(PropMap, PropertyOrder); @@ -1725,6 +1726,10 @@ void Sema::DefaultSynthesizeProperties(Scope *S, ObjCImplDecl* IMPDecl, diag::warn_auto_synthesizing_protocol_property) << Prop << Proto; Diag(Prop->getLocation(), diag::note_property_declare); + std::string FixIt = + (Twine("@synthesize ") + Prop->getName() + ";\n\n").str(); + Diag(AtEnd, diag::note_add_synthesize_directive) + << FixItHint::CreateInsertion(AtEnd, FixIt); } continue; } @@ -1764,7 +1769,8 @@ void Sema::DefaultSynthesizeProperties(Scope *S, ObjCImplDecl* IMPDecl, } } -void Sema::DefaultSynthesizeProperties(Scope *S, Decl *D) { +void Sema::DefaultSynthesizeProperties(Scope *S, Decl *D, + SourceLocation AtEnd) { if (!LangOpts.ObjCDefaultSynthProperties || LangOpts.ObjCRuntime.isFragile()) return; ObjCImplementationDecl *IC=dyn_cast_or_null<ObjCImplementationDecl>(D); if (!IC) return; if (ObjCInterfaceDecl* IDecl = IC->getClassInterface()) if (!IDecl->isObjCRequiresPropertyDefs()) - DefaultSynthesizeProperties(S, IC, IDecl); + DefaultSynthesizeProperties(S, IC, IDecl, AtEnd); } static void DiagnoseUnimplementedAccessor( diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp index 49da0e499771..1e0b6c158348 100644 --- a/lib/Sema/SemaOpenMP.cpp +++ b/lib/Sema/SemaOpenMP.cpp @@ -6106,6 +6106,33 @@ static bool checkGrainsizeNumTasksClauses(Sema &S, return ErrorFound; } +static bool checkReductionClauseWithNogroup(Sema &S, + ArrayRef<OMPClause *> Clauses) { + OMPClause *ReductionClause = nullptr; + OMPClause *NogroupClause = nullptr; + for (auto *C : Clauses) { + if (C->getClauseKind() == OMPC_reduction) { + ReductionClause = C; + if (NogroupClause) + break; + continue; + } + if (C->getClauseKind() == OMPC_nogroup) { + NogroupClause = C; + if (ReductionClause) + break; + continue; + } + } + if (ReductionClause && NogroupClause) { + S.Diag(ReductionClause->getLocStart(), diag::err_omp_reduction_with_nogroup) + << SourceRange(NogroupClause->getLocStart(), + NogroupClause->getLocEnd()); + return true; + } + return false; +} + StmtResult Sema::ActOnOpenMPTaskLoopDirective( ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, @@ -6132,6 +6159,11 @@ StmtResult Sema::ActOnOpenMPTaskLoopDirective( // not appear on the same taskloop directive. if (checkGrainsizeNumTasksClauses(*this, Clauses)) return StmtError(); + // OpenMP, [2.9.2 taskloop Construct, Restrictions] + // If a reduction clause is present on the taskloop directive, the nogroup + // clause must not be specified. + if (checkReductionClauseWithNogroup(*this, Clauses)) + return StmtError(); getCurFunction()->setHasBranchProtectedScope(); return OMPTaskLoopDirective::Create(Context, StartLoc, EndLoc, @@ -6175,6 +6207,11 @@ StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( // not appear on the same taskloop directive. if (checkGrainsizeNumTasksClauses(*this, Clauses)) return StmtError(); + // OpenMP, [2.9.2 taskloop Construct, Restrictions] + // If a reduction clause is present on the taskloop directive, the nogroup + // clause must not be specified.
@@ -9399,6 +9436,12 @@ OMPClause *Sema::ActOnOpenMPReductionClause(
             SimpleRefExpr, RefRes.get());
         if (!PostUpdateRes.isUsable())
           continue;
+        if (isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
+          Diag(RefExpr->getExprLoc(),
+               diag::err_omp_reduction_non_addressable_expression)
+              << RefExpr->getSourceRange();
+          continue;
+        }
         ExprPostUpdates.push_back(
             IgnoredValueConversions(PostUpdateRes.get()).get());
       }
diff --git a/lib/Sema/SemaOverload.cpp b/lib/Sema/SemaOverload.cpp
index 5cc13f391d11..36f24fd9c463 100644
--- a/lib/Sema/SemaOverload.cpp
+++ b/lib/Sema/SemaOverload.cpp
@@ -48,7 +48,7 @@ static bool functionHasPassObjectSizeParams(const FunctionDecl *FD) {
 /// A convenience routine for creating a decayed reference to a function.
 static ExprResult
 CreateFunctionRefExpr(Sema &S, FunctionDecl *Fn, NamedDecl *FoundDecl,
-                      bool HadMultipleCandidates,
+                      const Expr *Base, bool HadMultipleCandidates,
                       SourceLocation Loc = SourceLocation(),
                       const DeclarationNameLoc &LocInfo = DeclarationNameLoc()){
   if (S.DiagnoseUseOfDecl(FoundDecl, Loc))
@@ -68,7 +68,7 @@ CreateFunctionRefExpr(Sema &S, FunctionDecl *Fn, NamedDecl *FoundDecl,
   if (HadMultipleCandidates)
     DRE->setHadMultipleCandidates(true);
 
-  S.MarkDeclRefReferenced(DRE);
+  S.MarkDeclRefReferenced(DRE, Base);
   return S.ImpCastExprToType(DRE, S.Context.getPointerType(DRE->getType()),
                              CK_FunctionToPointerDecay);
 }
@@ -9830,6 +9830,15 @@ static void DiagnoseBadDeduction(Sema &S, NamedDecl *Found, Decl *Templated,
     return;
   }
 
+  // We found a specific requirement that disabled the enable_if.
+  if (PDiag && PDiag->second.getDiagID() ==
+          diag::err_typename_nested_not_found_requirement) {
+    S.Diag(Templated->getLocation(),
+           diag::note_ovl_candidate_disabled_by_requirement)
+        << PDiag->second.getStringArg(0) << TemplateArgString;
+    return;
+  }
+
   // Format the SFINAE diagnostic into the argument string.
   // FIXME: Add a general mechanism to include a PartialDiagnostic *'s
   // formatted message in another diagnostic.
@@ -11937,6 +11946,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
   FunctionDecl *FnDecl = Best->Function;
 
   if (FnDecl) {
+    Expr *Base = nullptr;
     // We matched an overloaded operator. Build a call to that
     // operator.
 
@@ -11949,7 +11959,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
                                               Best->FoundDecl, Method);
       if (InputRes.isInvalid())
         return ExprError();
-      Input = InputRes.get();
+      Base = Input = InputRes.get();
     } else {
       // Convert the arguments.
       ExprResult InputInit
@@ -11965,7 +11975,8 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
 
     // Build the actual expression node.
     ExprResult FnExpr = CreateFunctionRefExpr(*this, FnDecl, Best->FoundDecl,
-                                              HadMultipleCandidates, OpLoc);
+                                              Base, HadMultipleCandidates,
+                                              OpLoc);
     if (FnExpr.isInvalid())
       return ExprError();
 
@@ -12150,6 +12161,7 @@ Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
   FunctionDecl *FnDecl = Best->Function;
 
   if (FnDecl) {
+    Expr *Base = nullptr;
     // We matched an overloaded operator. Build a call to that
     // operator.
 
@@ -12171,7 +12183,7 @@ Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
                                                 Best->FoundDecl, Method);
         if (Arg0.isInvalid())
           return ExprError();
-        Args[0] = Arg0.getAs<Expr>();
+        Base = Args[0] = Arg0.getAs<Expr>();
         Args[1] = RHS = Arg1.getAs<Expr>();
       } else {
         // Convert the arguments.
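The Base parameter threaded through CreateFunctionRefExpr above carries the object argument of a member operator call so that MarkDeclRefReferenced can see it. A sketch of the affected shape of code (illustrative only):

    struct Str {
      Str operator+(const Str &) const;
    };
    Str add(Str a, Str b) {
      return a + b;  // the converted 'a' is the Base for the operator+ reference
    }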
@@ -12195,7 +12207,7 @@ Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
 
         // Build the actual expression node.
         ExprResult FnExpr = CreateFunctionRefExpr(*this, FnDecl,
-                                                  Best->FoundDecl,
+                                                  Best->FoundDecl, Base,
                                                   HadMultipleCandidates, OpLoc);
         if (FnExpr.isInvalid())
           return ExprError();
@@ -12417,6 +12429,7 @@ Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc,
       OpLocInfo.setCXXOperatorNameRange(SourceRange(LLoc, RLoc));
       ExprResult FnExpr = CreateFunctionRefExpr(*this, FnDecl,
                                                 Best->FoundDecl,
+                                                Base,
                                                 HadMultipleCandidates,
                                                 OpLocInfo.getLoc(),
                                                 OpLocInfo.getInfo());
@@ -12975,7 +12988,7 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
           Context.DeclarationNames.getCXXOperatorName(OO_Call), LParenLoc);
   OpLocInfo.setCXXOperatorNameRange(SourceRange(LParenLoc, RParenLoc));
   ExprResult NewFn = CreateFunctionRefExpr(*this, Method, Best->FoundDecl,
-                                           HadMultipleCandidates,
+                                           Obj, HadMultipleCandidates,
                                            OpLocInfo.getLoc(),
                                            OpLocInfo.getInfo());
   if (NewFn.isInvalid())
@@ -13166,7 +13179,7 @@ Sema::BuildOverloadedArrowExpr(Scope *S, Expr *Base, SourceLocation OpLoc,
 
   // Build the operator call.
   ExprResult FnExpr = CreateFunctionRefExpr(*this, Method, Best->FoundDecl,
-                                            HadMultipleCandidates, OpLoc);
+                                            Base, HadMultipleCandidates, OpLoc);
   if (FnExpr.isInvalid())
     return ExprError();
 
@@ -13225,7 +13238,7 @@ ExprResult Sema::BuildLiteralOperatorCall(LookupResult &R,
   FunctionDecl *FD = Best->Function;
   ExprResult Fn = CreateFunctionRefExpr(*this, FD, Best->FoundDecl,
-                                        HadMultipleCandidates,
+                                        nullptr, HadMultipleCandidates,
                                         SuffixInfo.getLoc(),
                                         SuffixInfo.getInfo());
   if (Fn.isInvalid())
diff --git a/lib/Sema/SemaPseudoObject.cpp b/lib/Sema/SemaPseudoObject.cpp
index b6b429d1f25c..d159172a6990 100644
--- a/lib/Sema/SemaPseudoObject.cpp
+++ b/lib/Sema/SemaPseudoObject.cpp
@@ -1176,8 +1176,6 @@ bool ObjCSubscriptOpBuilder::findAtIndexGetter() {
   AtIndexGetter = S.LookupMethodInObjectType(AtIndexGetterSelector,
                                              ResultType, true /*instance*/);
 
-  bool receiverIdType = (BaseT->isObjCIdType() ||
-                         BaseT->isObjCQualifiedIdType());
 
   if (!AtIndexGetter && S.getLangOpts().DebuggerObjCLiteral) {
     AtIndexGetter = ObjCMethodDecl::Create(S.Context, SourceLocation(),
@@ -1203,7 +1201,7 @@ bool ObjCSubscriptOpBuilder::findAtIndexGetter() {
   }
 
   if (!AtIndexGetter) {
-    if (!receiverIdType) {
+    if (!BaseT->isObjCIdType()) {
       S.Diag(BaseExpr->getExprLoc(), diag::err_objc_subscript_method_not_found)
       << BaseExpr->getType() << 0 << arrayRef;
       return false;
@@ -1284,9 +1282,6 @@ bool ObjCSubscriptOpBuilder::findAtIndexSetter() {
   }
   AtIndexSetter = S.LookupMethodInObjectType(AtIndexSetterSelector,
                                              ResultType, true /*instance*/);
-
-  bool receiverIdType = (BaseT->isObjCIdType() ||
-                         BaseT->isObjCQualifiedIdType());
 
   if (!AtIndexSetter && S.getLangOpts().DebuggerObjCLiteral) {
     TypeSourceInfo *ReturnTInfo = nullptr;
@@ -1321,7 +1316,7 @@ bool ObjCSubscriptOpBuilder::findAtIndexSetter() {
   }
 
   if (!AtIndexSetter) {
-    if (!receiverIdType) {
+    if (!BaseT->isObjCIdType()) {
       S.Diag(BaseExpr->getExprLoc(), diag::err_objc_subscript_method_not_found)
       << BaseExpr->getType() << 1 << arrayRef;
diff --git a/lib/Sema/SemaStmt.cpp b/lib/Sema/SemaStmt.cpp
index eed10b077eb8..2a38a1f8e1d8 100644
--- a/lib/Sema/SemaStmt.cpp
+++ b/lib/Sema/SemaStmt.cpp
@@ -1544,23 +1544,78 @@ namespace {
   // A visitor to determine if a continue or break statement is a
   // subexpression.
-  class BreakContinueFinder : public EvaluatedExprVisitor<BreakContinueFinder> {
+  class BreakContinueFinder : public ConstEvaluatedExprVisitor<BreakContinueFinder> {
     SourceLocation BreakLoc;
     SourceLocation ContinueLoc;
+    bool InSwitch = false;
+
   public:
-    BreakContinueFinder(Sema &S, Stmt* Body) :
+    BreakContinueFinder(Sema &S, const Stmt* Body) :
         Inherited(S.Context) {
       Visit(Body);
     }
 
-    typedef EvaluatedExprVisitor<BreakContinueFinder> Inherited;
+    typedef ConstEvaluatedExprVisitor<BreakContinueFinder> Inherited;
 
-    void VisitContinueStmt(ContinueStmt* E) {
+    void VisitContinueStmt(const ContinueStmt* E) {
       ContinueLoc = E->getContinueLoc();
     }
 
-    void VisitBreakStmt(BreakStmt* E) {
-      BreakLoc = E->getBreakLoc();
+    void VisitBreakStmt(const BreakStmt* E) {
+      if (!InSwitch)
+        BreakLoc = E->getBreakLoc();
+    }
+
+    void VisitSwitchStmt(const SwitchStmt* S) {
+      if (const Stmt *Init = S->getInit())
+        Visit(Init);
+      if (const Stmt *CondVar = S->getConditionVariableDeclStmt())
+        Visit(CondVar);
+      if (const Stmt *Cond = S->getCond())
+        Visit(Cond);
+
+      // Don't return break statements from the body of a switch.
+      InSwitch = true;
+      if (const Stmt *Body = S->getBody())
+        Visit(Body);
+      InSwitch = false;
+    }
+
+    void VisitForStmt(const ForStmt *S) {
+      // Only visit the init statement of a for loop; the body
+      // has a different break/continue scope.
+      if (const Stmt *Init = S->getInit())
+        Visit(Init);
+    }
+
+    void VisitWhileStmt(const WhileStmt *) {
+      // Do nothing; the children of a while loop have a different
+      // break/continue scope.
+    }
+
+    void VisitDoStmt(const DoStmt *) {
+      // Do nothing; the children of a while loop have a different
+      // break/continue scope.
+    }
+
+    void VisitCXXForRangeStmt(const CXXForRangeStmt *S) {
+      // Only visit the initialization of a for loop; the body
+      // has a different break/continue scope.
+      if (const Stmt *Range = S->getRangeStmt())
+        Visit(Range);
+      if (const Stmt *Begin = S->getBeginStmt())
+        Visit(Begin);
+      if (const Stmt *End = S->getEndStmt())
+        Visit(End);
+    }
+
+    void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) {
+      // Only visit the initialization of a for loop; the body
+      // has a different break/continue scope.
+      if (const Stmt *Element = S->getElement())
+        Visit(Element);
+      if (const Stmt *Collection = S->getCollection())
+        Visit(Collection);
     }
 
     bool ContinueFound() { return ContinueLoc.isValid(); }
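A sketch of what the new InSwitch handling changes (illustrative): the break below belongs to the switch, not to the loop, so the finder no longer reports it as breaking out of the while loop:

    void count_down(int n) {
      while (n > 0) {
        switch (n) {
        case 1:
          break;   // consumed by the switch; ignored by BreakContinueFinder
        default:
          --n;
        }
      }
    }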
diff --git a/lib/Sema/SemaTemplate.cpp b/lib/Sema/SemaTemplate.cpp
index a8923ce9e27d..e9b38551683c 100644
--- a/lib/Sema/SemaTemplate.cpp
+++ b/lib/Sema/SemaTemplate.cpp
@@ -2806,6 +2806,101 @@ checkBuiltinTemplateIdType(Sema &SemaRef, BuiltinTemplateDecl *BTD,
   llvm_unreachable("unexpected BuiltinTemplateDecl!");
 }
 
+/// Determine whether this alias template is "enable_if_t".
+static bool isEnableIfAliasTemplate(TypeAliasTemplateDecl *AliasTemplate) {
+  return AliasTemplate->getName().equals("enable_if_t");
+}
+
+/// Collect all of the separable terms in the given condition, which
+/// might be a conjunction.
+///
+/// FIXME: The right answer is to convert the logical expression into
+/// disjunctive normal form, so we can find the first failed term
+/// within each possible clause.
+static void collectConjunctionTerms(Expr *Clause,
+                                    SmallVectorImpl<Expr *> &Terms) {
+  if (auto BinOp = dyn_cast<BinaryOperator>(Clause->IgnoreParenImpCasts())) {
+    if (BinOp->getOpcode() == BO_LAnd) {
+      collectConjunctionTerms(BinOp->getLHS(), Terms);
+      collectConjunctionTerms(BinOp->getRHS(), Terms);
+    }
+
+    return;
+  }
+
+  Terms.push_back(Clause);
+}
+
+// The ranges-v3 library uses an odd pattern of a top-level "||" with
+// a left-hand side that is value-dependent but never true. Identify
+// the idiom and ignore that term.
+static Expr *lookThroughRangesV3Condition(Preprocessor &PP, Expr *Cond) {
+  // Top-level '||'.
+  auto *BinOp = dyn_cast<BinaryOperator>(Cond->IgnoreParenImpCasts());
+  if (!BinOp) return Cond;
+
+  if (BinOp->getOpcode() != BO_LOr) return Cond;
+
+  // With an inner '==' that has a literal on the right-hand side.
+  Expr *LHS = BinOp->getLHS();
+  auto *InnerBinOp = dyn_cast<BinaryOperator>(LHS->IgnoreParenImpCasts());
+  if (!InnerBinOp) return Cond;
+
+  if (InnerBinOp->getOpcode() != BO_EQ ||
+      !isa<IntegerLiteral>(InnerBinOp->getRHS()))
+    return Cond;
+
+  // If the inner binary operation came from a macro expansion named
+  // CONCEPT_REQUIRES or CONCEPT_REQUIRES_, return the right-hand side
+  // of the '||', which is the real, user-provided condition.
+  SourceLocation Loc = InnerBinOp->getExprLoc();
+  if (!Loc.isMacroID()) return Cond;
+
+  StringRef MacroName = PP.getImmediateMacroName(Loc);
+  if (MacroName == "CONCEPT_REQUIRES" || MacroName == "CONCEPT_REQUIRES_")
+    return BinOp->getRHS();
+
+  return Cond;
+}
+
+/// Find the failed subexpression within enable_if, and describe it
+/// with a string.
+static std::pair<Expr *, std::string>
+findFailedEnableIfCondition(Sema &S, Expr *Cond) {
+  Cond = lookThroughRangesV3Condition(S.PP, Cond);
+
+  // Separate out all of the terms in a conjunction.
+  SmallVector<Expr *, 4> Terms;
+  collectConjunctionTerms(Cond, Terms);
+
+  // Determine which term failed.
+  Expr *FailedCond = nullptr;
+  for (Expr *Term : Terms) {
+    // The initialization of the parameter from the argument is
+    // a constant-evaluated context.
+    EnterExpressionEvaluationContext ConstantEvaluated(
+        S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+    bool Succeeded;
+    if (Term->EvaluateAsBooleanCondition(Succeeded, S.Context) &&
+        !Succeeded) {
+      FailedCond = Term->IgnoreParenImpCasts();
+      break;
+    }
+  }
+
+  if (!FailedCond)
+    FailedCond = Cond->IgnoreParenImpCasts();
+
+  std::string Description;
+  {
+    llvm::raw_string_ostream Out(Description);
+    FailedCond->printPretty(Out, nullptr,
+                            PrintingPolicy(S.Context.getLangOpts()));
+  }
+  return { FailedCond, Description };
+}
+
 QualType Sema::CheckTemplateIdType(TemplateName Name,
                                    SourceLocation TemplateLoc,
                                    TemplateArgumentListInfo &TemplateArgs) {
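What findFailedEnableIfCondition enables, sketched with a local enable_if_t (the helper above matches by name, so a minimal local definition suffices for illustration): for a conjunction, the diagnostic can now name just the first failing term rather than the whole condition:

    template <bool B, class T = void> struct enable_if {};
    template <class T> struct enable_if<true, T> { using type = T; };
    template <bool B, class T = void>
    using enable_if_t = typename enable_if<B, T>::type;

    template <class T, class = enable_if_t<(sizeof(T) > 1) && (sizeof(T) < 16)>>
    void medium_sized(T) {}

    // medium_sized('x');  // the note can now point at 'sizeof(T) > 1'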
@@ -2852,12 +2947,12 @@ QualType Sema::CheckTemplateIdType(TemplateName Name,
     if (Pattern->isInvalidDecl())
       return QualType();
 
-    TemplateArgumentList TemplateArgs(TemplateArgumentList::OnStack,
-                                      Converted);
+    TemplateArgumentList StackTemplateArgs(TemplateArgumentList::OnStack,
+                                           Converted);
 
     // Only substitute for the innermost template argument list.
     MultiLevelTemplateArgumentList TemplateArgLists;
-    TemplateArgLists.addOuterTemplateArguments(&TemplateArgs);
+    TemplateArgLists.addOuterTemplateArguments(&StackTemplateArgs);
     unsigned Depth = AliasTemplate->getTemplateParameters()->getDepth();
     for (unsigned I = 0; I < Depth; ++I)
       TemplateArgLists.addOuterTemplateArguments(None);
@@ -2870,8 +2965,42 @@ QualType Sema::CheckTemplateIdType(TemplateName Name,
     CanonType = SubstType(Pattern->getUnderlyingType(),
                           TemplateArgLists, AliasTemplate->getLocation(),
                           AliasTemplate->getDeclName());
-    if (CanonType.isNull())
+    if (CanonType.isNull()) {
+      // If this was enable_if and we failed to find the nested type
+      // within enable_if in a SFINAE context, dig out the specific
+      // enable_if condition that failed and present that instead.
+      if (isEnableIfAliasTemplate(AliasTemplate)) {
+        if (auto DeductionInfo = isSFINAEContext()) {
+          if (*DeductionInfo &&
+              (*DeductionInfo)->hasSFINAEDiagnostic() &&
+              (*DeductionInfo)->peekSFINAEDiagnostic().second.getDiagID() ==
+                diag::err_typename_nested_not_found_enable_if &&
+              TemplateArgs[0].getArgument().getKind()
+                == TemplateArgument::Expression) {
+            Expr *FailedCond;
+            std::string FailedDescription;
+            std::tie(FailedCond, FailedDescription) =
+              findFailedEnableIfCondition(
+                  *this, TemplateArgs[0].getSourceExpression());
+
+            // Remove the old SFINAE diagnostic.
+            PartialDiagnosticAt OldDiag =
+              {SourceLocation(), PartialDiagnostic::NullDiagnostic()};
+            (*DeductionInfo)->takeSFINAEDiagnostic(OldDiag);
+
+            // Add a new SFINAE diagnostic specifying which condition
+            // failed.
+            (*DeductionInfo)->addSFINAEDiagnostic(
+                OldDiag.first,
+                PDiag(diag::err_typename_nested_not_found_requirement)
+                  << FailedDescription
+                  << FailedCond->getSourceRange());
+          }
+        }
+      }
+
       return QualType();
+    }
   } else if (Name.isDependent() ||
              TemplateSpecializationType::anyDependentTemplateArguments(
                TemplateArgs, InstantiationDependent)) {
@@ -5190,10 +5319,16 @@ enum NullPointerValueKind {
 /// value of the appropriate type.
 static NullPointerValueKind
 isNullPointerValueTemplateArgument(Sema &S, NonTypeTemplateParmDecl *Param,
-                                   QualType ParamType, Expr *Arg) {
+                                   QualType ParamType, Expr *Arg,
+                                   Decl *Entity = nullptr) {
   if (Arg->isValueDependent() || Arg->isTypeDependent())
     return NPV_NotNullPointer;
 
+  // dllimport'd entities aren't constant but are available inside of template
+  // arguments.
+  if (Entity && Entity->hasAttr<DLLImportAttr>())
+    return NPV_NotNullPointer;
+
   if (!S.isCompleteType(Arg->getExprLoc(), ParamType))
     llvm_unreachable(
         "Incomplete parameter type in isNullPointerValueTemplateArgument!");
@@ -5437,14 +5572,8 @@ CheckTemplateArgumentAddressOfObjectOrFunction(Sema &S,
 
   // If our parameter has pointer type, check for a null template value.
   if (ParamType->isPointerType() || ParamType->isNullPtrType()) {
-    NullPointerValueKind NPV;
-    // dllimport'd entities aren't constant but are available inside of template
-    // arguments.
-    if (Entity && Entity->hasAttr<DLLImportAttr>())
-      NPV = NPV_NotNullPointer;
-    else
-      NPV = isNullPointerValueTemplateArgument(S, Param, ParamType, ArgIn);
-    switch (NPV) {
+    switch (isNullPointerValueTemplateArgument(S, Param, ParamType, ArgIn,
+                                               Entity)) {
     case NPV_NullPointer:
       S.Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
       Converted = TemplateArgument(S.Context.getCanonicalType(ParamType),
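A sketch of the MS-extension case the new Entity parameter serves (illustrative): the address of a dllimport'd function is not a compile-time constant, yet it remains usable as a non-type template argument, so it must not be classified as a null pointer:

    __declspec(dllimport) void imported();
    template <void (*FP)()> struct Caller {};
    Caller<&imported> c;  // the null-pointer check now sees the dllimport entity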
@@ -5636,39 +5765,8 @@ static bool CheckTemplateArgumentPointerToMember(Sema &S,
                                                  TemplateArgument &Converted) {
   bool Invalid = false;
 
-  // Check for a null pointer value.
   Expr *Arg = ResultArg;
-  switch (isNullPointerValueTemplateArgument(S, Param, ParamType, Arg)) {
-  case NPV_Error:
-    return true;
-  case NPV_NullPointer:
-    S.Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
-    Converted = TemplateArgument(S.Context.getCanonicalType(ParamType),
-                                 /*isNullPtr*/true);
-    return false;
-  case NPV_NotNullPointer:
-    break;
-  }
-
   bool ObjCLifetimeConversion;
-  if (S.IsQualificationConversion(Arg->getType(),
-                                  ParamType.getNonReferenceType(),
-                                  false, ObjCLifetimeConversion)) {
-    Arg = S.ImpCastExprToType(Arg, ParamType, CK_NoOp,
-                              Arg->getValueKind()).get();
-    ResultArg = Arg;
-  } else if (!S.Context.hasSameUnqualifiedType(Arg->getType(),
-                                      ParamType.getNonReferenceType())) {
-    // We can't perform this conversion.
-    S.Diag(Arg->getLocStart(), diag::err_template_arg_not_convertible)
-      << Arg->getType() << ParamType << Arg->getSourceRange();
-    S.Diag(Param->getLocation(), diag::note_template_param_here);
-    return true;
-  }
-
-  // See through any implicit casts we added to fix the type.
-  while (ImplicitCastExpr *Cast = dyn_cast<ImplicitCastExpr>(Arg))
-    Arg = Cast->getSubExpr();
 
   // C++ [temp.arg.nontype]p1:
   //
@@ -5725,6 +5823,37 @@ static bool CheckTemplateArgumentPointerToMember(Sema &S,
     DRE = nullptr;
   }
 
+  ValueDecl *Entity = DRE ? DRE->getDecl() : nullptr;
+
+  // Check for a null pointer value.
+  switch (isNullPointerValueTemplateArgument(S, Param, ParamType, ResultArg,
+                                             Entity)) {
+  case NPV_Error:
+    return true;
+  case NPV_NullPointer:
+    S.Diag(ResultArg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null);
+    Converted = TemplateArgument(S.Context.getCanonicalType(ParamType),
+                                 /*isNullPtr*/true);
+    return false;
+  case NPV_NotNullPointer:
+    break;
+  }
+
+  if (S.IsQualificationConversion(ResultArg->getType(),
+                                  ParamType.getNonReferenceType(), false,
+                                  ObjCLifetimeConversion)) {
+    ResultArg = S.ImpCastExprToType(ResultArg, ParamType, CK_NoOp,
+                                    ResultArg->getValueKind())
+                    .get();
+  } else if (!S.Context.hasSameUnqualifiedType(
+                 ResultArg->getType(), ParamType.getNonReferenceType())) {
+    // We can't perform this conversion.
+    S.Diag(ResultArg->getLocStart(), diag::err_template_arg_not_convertible)
+        << ResultArg->getType() << ParamType << ResultArg->getSourceRange();
+    S.Diag(Param->getLocation(), diag::note_template_param_here);
+    return true;
+  }
+
   if (!DRE)
     return S.Diag(Arg->getLocStart(),
                   diag::err_template_arg_not_pointer_to_member_form)
@@ -9290,7 +9419,7 @@ Sema::ActOnTypenameType(Scope *S,
 /// Determine whether this failed name lookup should be treated as being
 /// disabled by a usage of std::enable_if.
 static bool isEnableIf(NestedNameSpecifierLoc NNS, const IdentifierInfo &II,
-                       SourceRange &CondRange) {
+                       SourceRange &CondRange, Expr *&Cond) {
   // We must be looking for a ::type...
   if (!II.isStr("type"))
     return false;
@@ -9320,6 +9449,19 @@ static bool isEnableIf(NestedNameSpecifierLoc NNS, const IdentifierInfo &II,
 
   // Assume the first template argument is the condition.
   CondRange = EnableIfTSTLoc.getArgLoc(0).getSourceRange();
+
+  // Dig out the condition.
+  Cond = nullptr;
+  if (EnableIfTSTLoc.getArgLoc(0).getArgument().getKind()
+        != TemplateArgument::Expression)
+    return true;
+
+  Cond = EnableIfTSTLoc.getArgLoc(0).getSourceExpression();
+
+  // Ignore Boolean literals; they add no value.
+  if (isa<CXXBoolLiteralExpr>(Cond->IgnoreParenCasts()))
+    Cond = nullptr;
+
   return true;
 }
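How the sharper enable_if diagnostic reads from the user side (a sketch; the code above looks for a member named 'type' inside a template named 'enable_if'):

    #include <type_traits>

    template <class T>
    typename std::enable_if<std::is_integral<T>::value &&
                            !std::is_same<T, bool>::value, T>::type
    increment(T v) { return v + 1; }

    // increment(1.5);  // the diagnostic can now name the failed term,
    //                  // e.g. 'std::is_integral<T>::value'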
@@ -9363,9 +9505,25 @@ Sema::CheckTypenameType(ElaboratedTypeKeyword Keyword,
     // If we're looking up 'type' within a template named 'enable_if', produce
     // a more specific diagnostic.
     SourceRange CondRange;
-    if (isEnableIf(QualifierLoc, II, CondRange)) {
+    Expr *Cond = nullptr;
+    if (isEnableIf(QualifierLoc, II, CondRange, Cond)) {
+      // If we have a condition, narrow it down to the specific failed
+      // condition.
+      if (Cond) {
+        Expr *FailedCond;
+        std::string FailedDescription;
+        std::tie(FailedCond, FailedDescription) =
+          findFailedEnableIfCondition(*this, Cond);
+
+        Diag(FailedCond->getExprLoc(),
+             diag::err_typename_nested_not_found_requirement)
+          << FailedDescription
+          << FailedCond->getSourceRange();
+        return QualType();
+      }
+
       Diag(CondRange.getBegin(), diag::err_typename_nested_not_found_enable_if)
-        << Ctx << CondRange;
+          << Ctx << CondRange;
       return QualType();
     }
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index 3aee3c04001d..678ecfc9a3d9 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -9314,6 +9314,7 @@ void ASTReader::diagnoseOdrViolations() {
       TypeAlias,
       TypeDef,
       Var,
+      Friend,
       Other
     } FirstDiffType = Other,
       SecondDiffType = Other;
@@ -9347,6 +9348,8 @@ void ASTReader::diagnoseOdrViolations() {
         return TypeDef;
       case Decl::Var:
         return Var;
+      case Decl::Friend:
+        return Friend;
       }
     };
@@ -9463,6 +9466,9 @@ void ASTReader::diagnoseOdrViolations() {
       VarSingleInitializer,
      VarDifferentInitializer,
       VarConstexpr,
+      FriendTypeFunction,
+      FriendType,
+      FriendFunction,
     };
 
     // These lambdas have the common portions of the ODR diagnostics.  This
@@ -9973,6 +9979,53 @@ void ASTReader::diagnoseOdrViolations() {
       }
       break;
     }
+    case Friend: {
+      FriendDecl *FirstFriend = cast<FriendDecl>(FirstDecl);
+      FriendDecl *SecondFriend = cast<FriendDecl>(SecondDecl);
+
+      NamedDecl *FirstND = FirstFriend->getFriendDecl();
+      NamedDecl *SecondND = SecondFriend->getFriendDecl();
+
+      TypeSourceInfo *FirstTSI = FirstFriend->getFriendType();
+      TypeSourceInfo *SecondTSI = SecondFriend->getFriendType();
+
+      if (FirstND && SecondND) {
+        ODRDiagError(FirstFriend->getFriendLoc(),
+                     FirstFriend->getSourceRange(), FriendFunction)
+            << FirstND;
+        ODRDiagNote(SecondFriend->getFriendLoc(),
+                    SecondFriend->getSourceRange(), FriendFunction)
+            << SecondND;
+
+        Diagnosed = true;
+        break;
+      }
+
+      if (FirstTSI && SecondTSI) {
+        QualType FirstFriendType = FirstTSI->getType();
+        QualType SecondFriendType = SecondTSI->getType();
+        assert(ComputeQualTypeODRHash(FirstFriendType) !=
+               ComputeQualTypeODRHash(SecondFriendType));
+        ODRDiagError(FirstFriend->getFriendLoc(),
+                     FirstFriend->getSourceRange(), FriendType)
+            << FirstFriendType;
+        ODRDiagNote(SecondFriend->getFriendLoc(),
+                    SecondFriend->getSourceRange(), FriendType)
+            << SecondFriendType;
+        Diagnosed = true;
+        break;
+      }
+
+      ODRDiagError(FirstFriend->getFriendLoc(), FirstFriend->getSourceRange(),
+                   FriendTypeFunction)
+          << (FirstTSI == nullptr);
+      ODRDiagNote(SecondFriend->getFriendLoc(),
+                  SecondFriend->getSourceRange(), FriendTypeFunction)
+          << (SecondTSI == nullptr);
+
+      Diagnosed = true;
+      break;
+    }
     }
 
     if (Diagnosed == true)
diff --git a/lib/Serialization/ASTReaderDecl.cpp b/lib/Serialization/ASTReaderDecl.cpp
index 4d9ddd2ff506..abed2586561a 100644
--- a/lib/Serialization/ASTReaderDecl.cpp
+++ b/lib/Serialization/ASTReaderDecl.cpp
@@ -573,6 +573,8 @@ void ASTDeclReader::VisitDecl(Decl *D) {
       else
         Reader.HiddenNamesMap[Owner].push_back(D);
     }
+  } else if (ModulePrivate) {
+    D->setModuleOwnershipKind(Decl::ModuleOwnershipKind::ModulePrivate);
   }
 }
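A sketch of the mismatch the new Friend case diagnoses (the two views of X would come from different modules; shown side by side in comments for illustration):

    struct X {
      friend void f();    // one module declares a friend function
      // friend class Y;  // the other declares a friend type; the reader now
      //                  // reports this via the FriendTypeFunction/FriendType/
      //                  // FriendFunction diagnostics instead of staying silent
    };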
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index c6129d326cb6..f7a49e41009d 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -1462,7 +1462,7 @@ void ASTWriter::WriteControlBlock(Preprocessor &PP, ASTContext &Context,
   }
 
   // Module map file
-  if (WritingModule) {
+  if (WritingModule && WritingModule->Kind == Module::ModuleMapModule) {
     Record.clear();
 
     auto &Map = PP.getHeaderSearchInfo().getModuleMap();
diff --git a/lib/Serialization/ASTWriterDecl.cpp b/lib/Serialization/ASTWriterDecl.cpp
index 2d648cb103cb..ec21ca2276c1 100644
--- a/lib/Serialization/ASTWriterDecl.cpp
+++ b/lib/Serialization/ASTWriterDecl.cpp
@@ -2233,8 +2233,18 @@ void ASTRecordWriter::AddFunctionDefinition(const FunctionDecl *FD) {
   Writer->ClearSwitchCaseIDs();
 
   assert(FD->doesThisDeclarationHaveABody());
-  bool ModulesCodegen = Writer->Context->getLangOpts().ModulesCodegen &&
-                        Writer->WritingModule && !FD->isDependentContext();
+  bool ModulesCodegen = false;
+  if (Writer->WritingModule && !FD->isDependentContext()) {
+    // Under -fmodules-codegen, codegen is performed for all defined functions.
+    // When building a C++ Modules TS module interface unit, a strong definition
+    // in the module interface is provided by the compilation of that module
+    // interface unit, not by its users. (Inline functions are still emitted
+    // in module users.)
+    ModulesCodegen =
+        Writer->Context->getLangOpts().ModulesCodegen ||
+        (Writer->WritingModule->Kind == Module::ModuleInterfaceUnit &&
+         Writer->Context->GetGVALinkageForFunction(FD) == GVA_StrongExternal);
+  }
   Record->push_back(ModulesCodegen);
   if (ModulesCodegen)
     Writer->ModularCodegenDecls.push_back(Writer->GetDeclRef(FD));
diff --git a/lib/StaticAnalyzer/Core/AnalysisManager.cpp b/lib/StaticAnalyzer/Core/AnalysisManager.cpp
index 54634fdffeb5..83e67662e614 100644
--- a/lib/StaticAnalyzer/Core/AnalysisManager.cpp
+++ b/lib/StaticAnalyzer/Core/AnalysisManager.cpp
@@ -23,9 +23,10 @@ AnalysisManager::AnalysisManager(ASTContext &ctx, DiagnosticsEngine &diags,
                                  AnalyzerOptions &Options,
                                  CodeInjector *injector)
     : AnaCtxMgr(Options.UnoptimizedCFG,
-                /*AddImplicitDtors=*/true,
+                Options.includeImplicitDtorsInCFG(),
                 /*AddInitializers=*/true,
                 Options.includeTemporaryDtorsInCFG(),
+                Options.includeLifetimeInCFG(),
                 Options.shouldSynthesizeBodies(),
                 Options.shouldConditionalizeStaticInitializers(),
                 /*addCXXNewAllocator=*/true,
diff --git a/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp b/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
index 11b9f8c4f725..6f48fcb9e20c 100644
--- a/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
+++ b/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp
@@ -172,6 +172,17 @@ bool AnalyzerOptions::includeTemporaryDtorsInCFG() {
                           /* Default = */ false);
 }
 
+bool AnalyzerOptions::includeImplicitDtorsInCFG() {
+  return getBooleanOption(IncludeImplicitDtorsInCFG,
+                          "cfg-implicit-dtors",
+                          /* Default = */ true);
+}
+
+bool AnalyzerOptions::includeLifetimeInCFG() {
+  return getBooleanOption(IncludeLifetimeInCFG, "cfg-lifetime",
+                          /* Default = */ false);
+}
+
 bool AnalyzerOptions::mayInlineCXXStandardLibrary() {
   return getBooleanOption(InlineCXXStandardLibrary,
                           "c++-stdlib-inlining",
diff --git a/lib/StaticAnalyzer/Core/ExprEngine.cpp b/lib/StaticAnalyzer/Core/ExprEngine.cpp
index f84c0ee800a3..eee5400fe177 100644
--- a/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -362,6 +362,8 @@ void ExprEngine::processCFGElement(const CFGElement E, ExplodedNode *Pred,
     case CFGElement::TemporaryDtor:
       ProcessImplicitDtor(E.castAs<CFGImplicitDtor>(), Pred);
       return;
+    case CFGElement::LifetimeEnds:
+      return;
   }
 }
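The two options above are plumbed in from -analyzer-config; the test added later in this patch drives them roughly like this (see its RUN line):

    // clang -cc1 -analyze -analyzer-checker=debug.DumpCFG \
    //   -analyzer-config cfg-lifetime=true \
    //   -analyzer-config cfg-implicit-dtors=false file.cpp
    void f() {
      int i;  // with cfg-lifetime=true, the CFG dump gains elements like
              //   [B1.1] (Lifetime ends)
    }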
diff --git a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
index 6aa6da560e60..d91786f74919 100644
--- a/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
+++ b/lib/StaticAnalyzer/Core/PathDiagnostic.cpp
@@ -578,6 +578,7 @@ getLocationForCaller(const StackFrameContext *SFC,
   }
   case CFGElement::TemporaryDtor:
   case CFGElement::NewAllocator:
+  case CFGElement::LifetimeEnds:
     llvm_unreachable("not yet implemented!");
   }
 
diff --git a/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
index 9c28457b2139..f09f9696f5ad 100644
--- a/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
+++ b/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp
@@ -71,18 +71,15 @@ SVal SimpleSValBuilder::dispatchCast(SVal Val, QualType CastTy) {
 }
 
 SVal SimpleSValBuilder::evalCastFromNonLoc(NonLoc val, QualType castTy) {
-
   bool isLocType = Loc::isLocType(castTy);
-
   if (val.getAs<nonloc::PointerToMember>())
     return val;
 
   if (Optional<nonloc::LocAsInteger> LI = val.getAs<nonloc::LocAsInteger>()) {
     if (isLocType)
       return LI->getLoc();
-
     // FIXME: Correctly support promotions/truncations.
-    unsigned castSize = Context.getTypeSize(castTy);
+    unsigned castSize = Context.getIntWidth(castTy);
     if (castSize == LI->getNumBits())
       return val;
     return makeLocAsInteger(LI->getLoc(), castSize);
@@ -173,7 +170,7 @@ SVal SimpleSValBuilder::evalCastFromLoc(Loc val, QualType castTy) {
   }
 
   if (castTy->isIntegralOrEnumerationType()) {
-    unsigned BitWidth = Context.getTypeSize(castTy);
+    unsigned BitWidth = Context.getIntWidth(castTy);
 
     if (!val.getAs<loc::ConcreteInt>())
      return makeLocAsInteger(val, BitWidth);
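Why getIntWidth rather than getTypeSize in the two hunks above (illustration): for bool the storage size is 8 bits but the value width is 1 bit, so a pointer cast to a bool-based type now yields a LocAsInteger of the right width:

    void *p = nullptr;
    bool b = static_cast<bool>(p);  // modeled as a 1-bit value, not 8 bits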
diff --git a/lib/Tooling/ArgumentsAdjusters.cpp b/lib/Tooling/ArgumentsAdjusters.cpp
index 48b925c698a7..ac9fd3c5cade 100644
--- a/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/lib/Tooling/ArgumentsAdjusters.cpp
@@ -42,7 +42,7 @@ ArgumentsAdjuster getClangStripOutputAdjuster() {
         AdjustedArgs.push_back(Args[i]);
 
       if (Arg == "-o") {
-        // Output is specified as -o foo. Skip the next argument also.
+        // Output is specified as -o foo. Skip the next argument too.
         ++i;
       }
       // Else, the output is specified as -ofoo. Just do nothing.
@@ -51,6 +51,26 @@ ArgumentsAdjuster getClangStripOutputAdjuster() {
   };
 }
 
+ArgumentsAdjuster getClangStripDependencyFileAdjuster() {
+  return [](const CommandLineArguments &Args, StringRef /*unused*/) {
+    CommandLineArguments AdjustedArgs;
+    for (size_t i = 0, e = Args.size(); i < e; ++i) {
+      StringRef Arg = Args[i];
+      // All dependency-file options begin with -M. These include -MM,
+      // -MF, -MG, -MP, -MT, -MQ, -MD, and -MMD.
+      if (!Arg.startswith("-M"))
+        AdjustedArgs.push_back(Args[i]);
+
+      if ((Arg == "-MF") || (Arg == "-MT") || (Arg == "-MQ") ||
+          (Arg == "-MD") || (Arg == "-MMD")) {
+        // Output is specified as -MX foo. Skip the next argument also.
+        ++i;
+      }
+    }
+    return AdjustedArgs;
+  };
+}
+
 ArgumentsAdjuster getInsertArgumentAdjuster(const CommandLineArguments &Extra,
                                             ArgumentInsertPosition Pos) {
   return [Extra, Pos](const CommandLineArguments &Args, StringRef /*unused*/) {
@@ -83,4 +103,3 @@ ArgumentsAdjuster combineAdjusters(ArgumentsAdjuster First,
 
 } // end namespace tooling
 } // end namespace clang
-
diff --git a/lib/Tooling/Core/Diagnostic.cpp b/lib/Tooling/Core/Diagnostic.cpp
index 3bbc2b901e38..9e4833f2eff4 100644
--- a/lib/Tooling/Core/Diagnostic.cpp
+++ b/lib/Tooling/Core/Diagnostic.cpp
@@ -35,9 +35,9 @@ Diagnostic::Diagnostic(llvm::StringRef DiagnosticName,
       BuildDirectory(BuildDirectory) {}
 
 Diagnostic::Diagnostic(llvm::StringRef DiagnosticName,
-                       DiagnosticMessage &Message,
-                       llvm::StringMap<Replacements> &Fix,
-                       SmallVector<DiagnosticMessage, 1> &Notes,
+                       const DiagnosticMessage &Message,
+                       const llvm::StringMap<Replacements> &Fix,
+                       const SmallVector<DiagnosticMessage, 1> &Notes,
                        Level DiagLevel, llvm::StringRef BuildDirectory)
     : DiagnosticName(DiagnosticName), Message(Message), Fix(Fix), Notes(Notes),
       DiagLevel(DiagLevel), BuildDirectory(BuildDirectory) {}
diff --git a/lib/Tooling/Tooling.cpp b/lib/Tooling/Tooling.cpp
index 2e093dd9afc8..c84fbf473753 100644
--- a/lib/Tooling/Tooling.cpp
+++ b/lib/Tooling/Tooling.cpp
@@ -100,7 +100,6 @@ clang::CompilerInvocation *newInvocation(
       *Diagnostics);
   Invocation->getFrontendOpts().DisableFree = false;
   Invocation->getCodeGenOpts().DisableFree = false;
-  Invocation->getDependencyOutputOpts() = DependencyOutputOptions();
   return Invocation;
 }
 
@@ -140,9 +139,11 @@ bool runToolOnCodeWithArgs(
   OverlayFileSystem->pushOverlay(InMemoryFileSystem);
   llvm::IntrusiveRefCntPtr<FileManager> Files(
       new FileManager(FileSystemOptions(), OverlayFileSystem));
-  ToolInvocation Invocation(getSyntaxOnlyToolArgs(ToolName, Args, FileNameRef),
-                            ToolAction, Files.get(),
-                            std::move(PCHContainerOps));
+  ArgumentsAdjuster Adjuster = getClangStripDependencyFileAdjuster();
+  ToolInvocation Invocation(
+      getSyntaxOnlyToolArgs(ToolName, Adjuster(Args, FileNameRef), FileNameRef),
+      ToolAction, Files.get(),
+      std::move(PCHContainerOps));
 
   SmallString<1024> CodeStorage;
   InMemoryFileSystem->addFile(FileNameRef, 0,
@@ -510,7 +511,8 @@ buildASTFromCode(const Twine &Code, const Twine &FileName,
 std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
     const Twine &Code, const std::vector<std::string> &Args,
     const Twine &FileName, const Twine &ToolName,
-    std::shared_ptr<PCHContainerOperations> PCHContainerOps) {
+    std::shared_ptr<PCHContainerOperations> PCHContainerOps,
+    ArgumentsAdjuster Adjuster) {
   SmallString<16> FileNameStorage;
   StringRef FileNameRef = FileName.toNullTerminatedStringRef(FileNameStorage);
 
@@ -523,8 +525,10 @@ std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
   OverlayFileSystem->pushOverlay(InMemoryFileSystem);
   llvm::IntrusiveRefCntPtr<FileManager> Files(
      new FileManager(FileSystemOptions(), OverlayFileSystem));
-  ToolInvocation Invocation(getSyntaxOnlyToolArgs(ToolName, Args, FileNameRef),
-                            &Action, Files.get(), std::move(PCHContainerOps));
+
+  ToolInvocation Invocation(
+      getSyntaxOnlyToolArgs(ToolName, Adjuster(Args, FileNameRef), FileNameRef),
+      &Action, Files.get(), std::move(PCHContainerOps));
 
   SmallString<1024> CodeStorage;
   InMemoryFileSystem->addFile(FileNameRef, 0,
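Typical use of the new adjuster outside runToolOnCodeWithArgs (a sketch; ClangTool is the usual consumer):

    #include "clang/Tooling/ArgumentsAdjusters.h"
    #include "clang/Tooling/Tooling.h"

    void configure(clang::tooling::ClangTool &Tool) {
      // Drop -M family flags (and the separate argument of -MF/-MT/-MQ/
      // -MD/-MMD) coming from the compilation database, so running the
      // tool does not clobber the build's dependency files.
      Tool.appendArgumentsAdjuster(
          clang::tooling::getClangStripDependencyFileAdjuster());
    }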
diff --git a/test/Analysis/analyzer-config.c b/test/Analysis/analyzer-config.c
index 70521c63fbad..0d8d34fbf6b0 100644
--- a/test/Analysis/analyzer-config.c
+++ b/test/Analysis/analyzer-config.c
@@ -12,6 +12,8 @@ void foo() {
 // CHECK: [config]
 // CHECK-NEXT: cfg-conditional-static-initializers = true
+// CHECK-NEXT: cfg-implicit-dtors = true
+// CHECK-NEXT: cfg-lifetime = false
 // CHECK-NEXT: cfg-temporary-dtors = false
 // CHECK-NEXT: faux-bodies = true
 // CHECK-NEXT: graph-trim-interval = 1000
@@ -27,5 +29,4 @@ void foo() {
 // CHECK-NEXT: region-store-small-struct-limit = 2
 // CHECK-NEXT: widen-loops = false
 // CHECK-NEXT: [stats]
-// CHECK-NEXT: num-entries = 15
-
+// CHECK-NEXT: num-entries = 17
diff --git a/test/Analysis/analyzer-config.cpp b/test/Analysis/analyzer-config.cpp
index 60c03c1e497b..4166a730cf53 100644
--- a/test/Analysis/analyzer-config.cpp
+++ b/test/Analysis/analyzer-config.cpp
@@ -23,6 +23,8 @@ class Foo {
 // CHECK-NEXT: c++-stdlib-inlining = true
 // CHECK-NEXT: c++-template-inlining = true
 // CHECK-NEXT: cfg-conditional-static-initializers = true
+// CHECK-NEXT: cfg-implicit-dtors = true
+// CHECK-NEXT: cfg-lifetime = false
 // CHECK-NEXT: cfg-temporary-dtors = false
 // CHECK-NEXT: faux-bodies = true
 // CHECK-NEXT: graph-trim-interval = 1000
@@ -38,4 +40,4 @@ class Foo {
 // CHECK-NEXT: region-store-small-struct-limit = 2
 // CHECK-NEXT: widen-loops = false
 // CHECK-NEXT: [stats]
-// CHECK-NEXT: num-entries = 20
+// CHECK-NEXT: num-entries = 22
diff --git a/test/Analysis/enum.cpp b/test/Analysis/enum.cpp
index b561e65ac86e..96408473cede 100644
--- a/test/Analysis/enum.cpp
+++ b/test/Analysis/enum.cpp
@@ -37,3 +37,33 @@ bool testNoCrashOnSwitchEnumBool(EnumBool E) {
   }
   return true;
 }
+
+bool testNoCrashOnSwitchEnumBoolConstant() {
+  EnumBool E = EnumBool::F;
+  switch (E) {
+  case EnumBool::F:
+    return false;
+  }
+  return true;
+}
+
+typedef __INTPTR_TYPE__ intptr_t;
+bool testNoCrashOnSwitchEnumBoolConstantCastedFromNullptr() {
+  EnumBool E = static_cast<EnumBool>((intptr_t)nullptr);
+  switch (E) {
+  case EnumBool::F:
+    return false;
+  }
+  return true;
+}
+
+bool testNoCrashOnSwitchEnumBoolConstantCastedFromPtr() {
+  int X;
+  intptr_t P = (intptr_t)&X;
+  EnumBool E = static_cast<EnumBool>(P);
+  switch (E) {
+  case EnumBool::F:
+    return false;
+  }
+  return true;
+}
diff --git a/test/Analysis/lifetime-cfg-output.cpp b/test/Analysis/lifetime-cfg-output.cpp
new file mode 100644
index 000000000000..1e6f56df6a47
--- /dev/null
+++ b/test/Analysis/lifetime-cfg-output.cpp
@@ -0,0 +1,783 @@
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -analyze -analyzer-checker=debug.DumpCFG -analyzer-config cfg-lifetime=true -analyzer-config cfg-implicit-dtors=false %s > %t 2>&1
+// RUN: FileCheck --input-file=%t %s
+
+extern bool UV;
+class A {
+public:
+  // CHECK: [B2 (ENTRY)]
+  // CHECK-NEXT: Succs (1): B1
+  // CHECK: [B1]
+  // CHECK-NEXT: 1: true
+  // CHECK-NEXT: 2: UV
+  // CHECK-NEXT: 3: [B1.2] = [B1.1]
+  // CHECK-NEXT: Preds (1): B2
+  // CHECK-NEXT: Succs (1): B0
+  // CHECK: [B0 (EXIT)]
+  // CHECK-NEXT: Preds (1): B1
+  A() {
+    UV = true;
+  }
+  // CHECK: [B3 (ENTRY)]
+  // CHECK-NEXT: Succs (1): B2
+  // CHECK: [B1]
+  // CHECK-NEXT: 1: 0
+  // CHECK-NEXT: 2: this
+  // CHECK-NEXT: 3: [B1.2]->p
+  // CHECK-NEXT: 4: [B1.3] (ImplicitCastExpr, LValueToRValue, int *)
+  // CHECK-NEXT: 5: *[B1.4]
+  // CHECK-NEXT: 6: [B1.5] = [B1.1]
+  // CHECK-NEXT: Preds (1): B2
+  // CHECK-NEXT: Succs (1): B0
+  // CHECK: [B2]
+  // CHECK-NEXT: 1: this
+  // CHECK-NEXT: 2: [B2.1]->p
+  // CHECK-NEXT: 3: [B2.2] (ImplicitCastExpr, LValueToRValue, int *)
+  // CHECK-NEXT: 4: [B2.3] (ImplicitCastExpr, PointerToBoolean, _Bool)
+  // CHECK-NEXT: T: if [B2.4]
+  // CHECK-NEXT: Preds (1): B3
+  // CHECK-NEXT: Succs (2): B1 B0
+  // CHECK: [B0 (EXIT)]
+  // CHECK-NEXT: Preds (2): B1 B2
+  ~A() {
+    if (p)
+      *p = 0;
+  }
+  // CHECK: [B2 (ENTRY)]
+  // CHECK-NEXT: Succs (1): B1
+  // CHECK: [B1]
+  // CHECK-NEXT: 1: 1
+  // CHECK-NEXT: 2: return
[B1.1]; + // CHECK-NEXT: Preds (1): B2 + // CHECK-NEXT: Succs (1): B0 + // CHECK: [B0 (EXIT)] + // CHECK-NEXT: Preds (1): B1 + operator int() const { return 1; } + int *p; +}; + +// CHECK: [B2 (ENTRY)] +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: a +// CHECK-NEXT: 4: [B1.3] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 5: const A &b = a; +// CHECK-NEXT: 6: A() (CXXConstructExpr, class A) +// CHECK-NEXT: 7: [B1.6] (BindTemporary) +// CHECK-NEXT: 8: [B1.7] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 9: [B1.8] +// CHECK-NEXT: 10: const A &c = A(); +// CHECK-NEXT: 11: [B1.10] (Lifetime ends) +// CHECK-NEXT: 12: [B1.2] (Lifetime ends) +// CHECK-NEXT: 13: [B1.5] (Lifetime ends) +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_const_ref() { + A a; + const A &b = a; + const A &c = A(); +} + +// CHECK: [B2 (ENTRY)] +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A [2]) +// CHECK-NEXT: 2: A a[2]; +// CHECK-NEXT: 3: (CXXConstructExpr, class A [0]) +// CHECK-NEXT: 4: A b[0]; +// lifetime of a ends when its destructors are run +// CHECK-NEXT: 5: [B1.2] (Lifetime ends) +// lifetime of b ends when its storage duration ends +// CHECK-NEXT: 6: [B1.4] (Lifetime ends) +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_array() { + A a[2]; + A b[0]; +} + +// CHECK: [B2 (ENTRY)] +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A c; +// CHECK-NEXT: 5: (CXXConstructExpr, class A) +// CHECK-NEXT: 6: A d; +// CHECK-NEXT: 7: [B1.6] (Lifetime ends) +// CHECK-NEXT: 8: [B1.4] (Lifetime ends) +// CHECK-NEXT: 9: (CXXConstructExpr, class A) +// CHECK-NEXT: 10: A b; +// CHECK-NEXT: 11: [B1.10] (Lifetime ends) +// CHECK-NEXT: 12: [B1.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_scope() { + A a; + { + A c; + A d; + } + A b; +} + +// CHECK: [B4 (ENTRY)] +// CHECK-NEXT: Succs (1): B3 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: [B1.2] (Lifetime ends) +// CHECK-NEXT: 4: [B3.4] (Lifetime ends) +// CHECK-NEXT: 5: [B3.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B3 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B3.4] (Lifetime ends) +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B3 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A b; +// CHECK-NEXT: 5: UV +// CHECK-NEXT: 6: [B3.5] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B3.6] +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (2): B2 B1 + +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (2): B1 B2 +void test_return() { + A a; + A b; + if (UV) + return; + A c; +} + +// CHECK: [B5 (ENTRY)] +// CHECK-NEXT: Succs (1): B4 +// CHECK: [B1] +// CHECK-NEXT: 1: [B4.6] (Lifetime ends) +// CHECK-NEXT: 2: [B4.2] (Lifetime ends) +// CHECK-NEXT: Preds (2): B2 B3 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// 
CHECK-NEXT: 3: [B2.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B4] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: a +// CHECK-NEXT: 4: [B4.3] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 5: [B4.4] (CXXConstructExpr, class A) +// CHECK-NEXT: 6: A b = a; +// CHECK-NEXT: 7: b +// CHECK-NEXT: 8: [B4.7] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 9: [B4.8].operator int +// CHECK-NEXT: 10: [B4.8] +// CHECK-NEXT: 11: [B4.10] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 12: [B4.11] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: if [B4.12] +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (2): B3 B2 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_if_implicit_scope() { + A a; + if (A b = a) + A c; + else + A c; +} + +// CHECK: [B9 (ENTRY)] +// CHECK-NEXT: Succs (1): B8 +// CHECK: [B1] +// CHECK-NEXT: 1: [B8.6] (Lifetime ends) +// CHECK-NEXT: 2: (CXXConstructExpr, class A) +// CHECK-NEXT: 3: A e; +// CHECK-NEXT: 4: [B1.3] (Lifetime ends) +// CHECK-NEXT: 5: [B8.2] (Lifetime ends) +// CHECK-NEXT: Preds (2): B2 B5 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A d; +// CHECK-NEXT: 3: [B2.2] (Lifetime ends) +// CHECK-NEXT: 4: [B4.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B3] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B4.2] (Lifetime ends) +// CHECK-NEXT: 3: [B8.6] (Lifetime ends) +// CHECK-NEXT: 4: [B8.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B4] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: UV +// CHECK-NEXT: 4: [B4.3] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B4.4] +// CHECK-NEXT: Preds (1): B8 +// CHECK-NEXT: Succs (2): B3 B2 +// CHECK: [B5] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A d; +// CHECK-NEXT: 3: [B5.2] (Lifetime ends) +// CHECK-NEXT: 4: [B7.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B6] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B7.2] (Lifetime ends) +// CHECK-NEXT: 3: [B8.6] (Lifetime ends) +// CHECK-NEXT: 4: [B8.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B7] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: UV +// CHECK-NEXT: 4: [B7.3] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B7.4] +// CHECK-NEXT: Preds (1): B8 +// CHECK-NEXT: Succs (2): B6 B5 +// CHECK: [B8] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: a +// CHECK-NEXT: 4: [B8.3] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 5: [B8.4] (CXXConstructExpr, class A) +// CHECK-NEXT: 6: A b = a; +// CHECK-NEXT: 7: b +// CHECK-NEXT: 8: [B8.7] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 9: [B8.8].operator int +// CHECK-NEXT: 10: [B8.8] +// CHECK-NEXT: 11: [B8.10] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 12: [B8.11] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: if [B8.12] +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (2): B7 B4 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds 
(3): B1 B3 B6 +void test_if_jumps() { + A a; + if (A b = a) { + A c; + if (UV) + return; + A d; + } else { + A c; + if (UV) + return; + A d; + } + A e; +} + +// CHECK: [B6 (ENTRY)] +// CHECK-NEXT: Succs (1): B5 +// CHECK: [B1] +// CHECK-NEXT: 1: [B4.4] (Lifetime ends) +// CHECK-NEXT: 2: [B5.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: Preds (1): B3 +// CHECK-NEXT: Succs (1): B4 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: 4: [B4.4] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B4] +// CHECK-NEXT: 1: a +// CHECK-NEXT: 2: [B4.1] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 3: [B4.2] (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A b = a; +// CHECK-NEXT: 5: b +// CHECK-NEXT: 6: [B4.5] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 7: [B4.6].operator int +// CHECK-NEXT: 8: [B4.6] +// CHECK-NEXT: 9: [B4.8] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 10: [B4.9] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: while [B4.10] +// CHECK-NEXT: Preds (2): B2 B5 +// CHECK-NEXT: Succs (2): B3 B1 +// CHECK: [B5] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: Preds (1): B6 +// CHECK-NEXT: Succs (1): B4 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_while_implicit_scope() { + A a; + while (A b = a) + A c; +} + +// CHECK: [B12 (ENTRY)] +// CHECK-NEXT: Succs (1): B11 +// CHECK: [B1] +// CHECK-NEXT: 1: [B10.4] (Lifetime ends) +// CHECK-NEXT: 2: (CXXConstructExpr, class A) +// CHECK-NEXT: 3: A e; +// CHECK-NEXT: 4: [B1.3] (Lifetime ends) +// CHECK-NEXT: 5: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (2): B8 B10 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: Preds (2): B3 B6 +// CHECK-NEXT: Succs (1): B10 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A d; +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: 4: [B9.2] (Lifetime ends) +// CHECK-NEXT: 5: [B10.4] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B4] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B9.2] (Lifetime ends) +// CHECK-NEXT: 3: [B10.4] (Lifetime ends) +// CHECK-NEXT: 4: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B5] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B5.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B5.2] +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (2): B4 B3 +// CHECK: [B6] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: 2: [B10.4] (Lifetime ends) +// CHECK-NEXT: T: continue; +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B7] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B7.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B7.2] +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (2): B6 B5 +// CHECK: [B8] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: T: break; +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B9] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: UV +// CHECK-NEXT: 4: [B9.3] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B9.4] +// CHECK-NEXT: Preds (1): B10 +// CHECK-NEXT: Succs (2): B8 B7 +// CHECK: [B10] +// CHECK-NEXT: 1: a +// CHECK-NEXT: 2: [B10.1] (ImplicitCastExpr, NoOp, const class 
A) +// CHECK-NEXT: 3: [B10.2] (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A b = a; +// CHECK-NEXT: 5: b +// CHECK-NEXT: 6: [B10.5] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 7: [B10.6].operator int +// CHECK-NEXT: 8: [B10.6] +// CHECK-NEXT: 9: [B10.8] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 10: [B10.9] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: while [B10.10] +// CHECK-NEXT: Preds (2): B2 B11 +// CHECK-NEXT: Succs (2): B9 B1 +// CHECK: [B11] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: Preds (1): B12 +// CHECK-NEXT: Succs (1): B10 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (2): B1 B4 +void test_while_jumps() { + A a; + while (A b = a) { + A c; + if (UV) + break; + if (UV) + continue; + if (UV) + return; + A d; + } + A e; +} + +// CHECK: [B12 (ENTRY)] +// CHECK-NEXT: Succs (1): B11 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A d; +// CHECK-NEXT: 3: [B1.2] (Lifetime ends) +// CHECK-NEXT: 4: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (2): B8 B2 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B2.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: do ... while [B2.2] +// CHECK-NEXT: Preds (2): B3 B6 +// CHECK-NEXT: Succs (2): B10 B1 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: 4: [B9.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B4] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B9.2] (Lifetime ends) +// CHECK-NEXT: 3: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B5] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B5.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B5.2] +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (2): B4 B3 +// CHECK: [B6] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: T: continue; +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B7] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B7.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B7.2] +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (2): B6 B5 +// CHECK: [B8] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: T: break; +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B9] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A b; +// CHECK-NEXT: 3: UV +// CHECK-NEXT: 4: [B9.3] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B9.4] +// CHECK-NEXT: Preds (2): B10 B11 +// CHECK-NEXT: Succs (2): B8 B7 +// CHECK: [B10] +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B9 +// CHECK: [B11] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: Preds (1): B12 +// CHECK-NEXT: Succs (1): B9 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (2): B1 B4 +void test_do_jumps() { + A a; + do { + A b; + if (UV) + break; + if (UV) + continue; + if (UV) + return; + A c; + } while (UV); + A d; +} + +// CHECK: [B6 (ENTRY)] +// CHECK-NEXT: Succs (1): B5 +// CHECK: [B1] +// CHECK-NEXT: 1: [B4.4] (Lifetime ends) +// CHECK-NEXT: 2: [B5.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: Preds (1): B3 +// CHECK-NEXT: Succs (1): B4 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A c; +// CHECK-NEXT: 3: [B3.2] 
(Lifetime ends) +// CHECK-NEXT: 4: [B4.4] (Lifetime ends) +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B4] +// CHECK-NEXT: 1: a +// CHECK-NEXT: 2: [B4.1] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 3: [B4.2] (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A b = a; +// CHECK-NEXT: 5: b +// CHECK-NEXT: 6: [B4.5] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 7: [B4.6].operator int +// CHECK-NEXT: 8: [B4.6] +// CHECK-NEXT: 9: [B4.8] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 10: [B4.9] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: for (...; [B4.10]; ) +// CHECK-NEXT: Preds (2): B2 B5 +// CHECK-NEXT: Succs (2): B3 B1 +// CHECK: [B5] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: Preds (1): B6 +// CHECK-NEXT: Succs (1): B4 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_for_implicit_scope() { + for (A a; A b = a;) + A c; +} + +// CHECK: [B12 (ENTRY)] +// CHECK-NEXT: Succs (1): B11 +// CHECK: [B1] +// CHECK-NEXT: 1: [B10.4] (Lifetime ends) +// CHECK-NEXT: 2: [B11.4] (Lifetime ends) +// CHECK-NEXT: 3: (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A f; +// CHECK-NEXT: 5: [B1.4] (Lifetime ends) +// CHECK-NEXT: 6: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (2): B8 B10 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: Preds (2): B3 B6 +// CHECK-NEXT: Succs (1): B10 +// CHECK: [B3] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A e; +// CHECK-NEXT: 3: [B3.2] (Lifetime ends) +// CHECK-NEXT: 4: [B9.2] (Lifetime ends) +// CHECK-NEXT: 5: [B10.4] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B4] +// CHECK-NEXT: 1: return; +// CHECK-NEXT: 2: [B9.2] (Lifetime ends) +// CHECK-NEXT: 3: [B10.4] (Lifetime ends) +// CHECK-NEXT: 4: [B11.4] (Lifetime ends) +// CHECK-NEXT: 5: [B11.2] (Lifetime ends) +// CHECK-NEXT: Preds (1): B5 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B5] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B5.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B5.2] +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (2): B4 B3 +// CHECK: [B6] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: T: continue; +// CHECK-NEXT: Preds (1): B7 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B7] +// CHECK-NEXT: 1: UV +// CHECK-NEXT: 2: [B7.1] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B7.2] +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (2): B6 B5 +// CHECK: [B8] +// CHECK-NEXT: 1: [B9.2] (Lifetime ends) +// CHECK-NEXT: T: break; +// CHECK-NEXT: Preds (1): B9 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B9] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A d; +// CHECK-NEXT: 3: UV +// CHECK-NEXT: 4: [B9.3] (ImplicitCastExpr, LValueToRValue, _Bool) +// CHECK-NEXT: T: if [B9.4] +// CHECK-NEXT: Preds (1): B10 +// CHECK-NEXT: Succs (2): B8 B7 +// CHECK: [B10] +// CHECK-NEXT: 1: b +// CHECK-NEXT: 2: [B10.1] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 3: [B10.2] (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A c = b; +// CHECK-NEXT: 5: c +// CHECK-NEXT: 6: [B10.5] (ImplicitCastExpr, NoOp, const class A) +// CHECK-NEXT: 7: [B10.6].operator int +// CHECK-NEXT: 8: [B10.6] +// CHECK-NEXT: 9: [B10.8] (ImplicitCastExpr, UserDefinedConversion, int) +// CHECK-NEXT: 10: [B10.9] (ImplicitCastExpr, IntegralToBoolean, _Bool) +// CHECK-NEXT: T: for (...; [B10.10]; ) +// CHECK-NEXT: Preds (2): B2 B11 +// CHECK-NEXT: Succs (2): B9 B1 +// CHECK: [B11] 
+// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: (CXXConstructExpr, class A) +// CHECK-NEXT: 4: A b; +// CHECK-NEXT: Preds (1): B12 +// CHECK-NEXT: Succs (1): B10 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (2): B1 B4 +void test_for_jumps() { + A a; + for (A b; A c = b;) { + A d; + if (UV) + break; + if (UV) + continue; + if (UV) + return; + A e; + } + A f; +} + +// CHECK: [B2 (ENTRY)] +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B1] +// CHECK-NEXT: 1: (CXXConstructExpr, class A) +// CHECK-NEXT: 2: A a; +// CHECK-NEXT: 3: int n; +// CHECK-NEXT: 4: n +// CHECK-NEXT: 5: &[B1.4] +// CHECK-NEXT: 6: a +// CHECK-NEXT: 7: [B1.6].p +// CHECK-NEXT: 8: [B1.7] = [B1.5] +// CHECK-NEXT: 9: [B1.2] (Lifetime ends) +// CHECK-NEXT: 10: [B1.3] (Lifetime ends) +// CHECK-NEXT: Preds (1): B2 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void test_trivial_vs_non_trivial_order() { + A a; + int n; + a.p = &n; +} + +// CHECK: [B4 (ENTRY)] +// CHECK-NEXT: Succs (1): B3 +// CHECK: [B1] +// CHECK-NEXT: a: +// CHECK-NEXT: 1: 1 +// CHECK-NEXT: 2: i +// CHECK-NEXT: 3: [B1.2] = [B1.1] +// CHECK-NEXT: 4: [B2.1] (Lifetime ends) +// CHECK-NEXT: Preds (2): B2 B3 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: int i; +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B3] +// CHECK-NEXT: T: goto a; +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void goto_past_declaration() { + goto a; + int i; +a: + i = 1; +} + +// CHECK: [B4 (ENTRY)] +// CHECK-NEXT: Succs (1): B3 +// CHECK: [B1] +// CHECK-NEXT: a: +// CHECK-NEXT: 1: 1 +// CHECK-NEXT: 2: k +// CHECK-NEXT: 3: [B1.2] = [B1.1] +// CHECK-NEXT: 4: [B2.4] (Lifetime ends) +// CHECK-NEXT: Preds (2): B2 B3 +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: 1: int j; +// CHECK-NEXT: 2: [B2.1] (Lifetime ends) +// CHECK-NEXT: 3: [B3.1] (Lifetime ends) +// CHECK-NEXT: 4: int k; +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B3] +// CHECK-NEXT: 1: int i; +// CHECK-NEXT: 2: [B3.1] (Lifetime ends) +// CHECK-NEXT: T: goto a; +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B1 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +void goto_past_declaration2() { + { + int i; + goto a; + int j; + } + { + int k; + a: + k = 1; + } +} + +struct B { + ~B(); +}; + +// CHECK: [B4 (ENTRY)] +// CHECK-NEXT: Succs (1): B3 +// CHECK: [B1] +// CHECK-NEXT: 1: i +// CHECK-NEXT: 2: [B1.1]++ +// CHECK-NEXT: 3: [B2.2] (Lifetime ends) +// CHECK-NEXT: 4: [B3.1] (Lifetime ends) +// CHECK-NEXT: Succs (1): B0 +// CHECK: [B2] +// CHECK-NEXT: label: +// CHECK-NEXT: 1: (CXXConstructExpr, struct B) +// CHECK-NEXT: 2: B b; +// CHECK-NEXT: 3: [B2.2] (Lifetime ends) +// CHECK-NEXT: T: goto label; +// CHECK-NEXT: Preds (2): B3 B2 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B3] +// CHECK-NEXT: 1: int i; +// CHECK-NEXT: Preds (1): B4 +// CHECK-NEXT: Succs (1): B2 +// CHECK: [B0 (EXIT)] +// CHECK-NEXT: Preds (1): B1 +int backpatched_goto() { + int i; +label: + B b; + goto label; + i++; +} diff --git a/test/CXX/except/except.spec/p11.cpp b/test/CXX/except/except.spec/p11.cpp index 196f84c557ea..b7b9fcb23144 100644 --- a/test/CXX/except/except.spec/p11.cpp +++ b/test/CXX/except/except.spec/p11.cpp @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -std=c++11 -fexceptions -fcxx-exceptions -fsyntax-only -verify %s // This is the "let the user shoot themselves in the foot" clause. 
-void f() noexcept { // expected-note {{non-throwing function declare here}}
+void f() noexcept { // expected-note {{function declared non-throwing here}}
  throw 0; // expected-warning {{has a non-throwing exception specification but}}
}
-void g() throw() { // expected-note {{non-throwing function declare here}}
+void g() throw() { // expected-note {{function declared non-throwing here}}
  throw 0; // expected-warning {{has a non-throwing exception specification but}}
}
void h() throw(int) {
diff --git a/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cpp b/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cpp
new file mode 100644
index 000000000000..dc6a3635a8ee
--- /dev/null
+++ b/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -fmodules-ts %S/module.cppm -triple %itanium_abi_triple -emit-module-interface -o %t
+// RUN: %clang_cc1 -fmodules-ts %s -triple %itanium_abi_triple -fmodule-file=%t -emit-llvm -o - | FileCheck %s --implicit-check-not=unused --implicit-check-not=global_module
+
+// CHECK-DAG: @extern_var_exported = external global
+// FIXME: Should this be 'external global'?
+// CHECK-DAG: @inline_var_exported = linkonce_odr global
+// CHECK-DAG: @_ZL19static_var_exported = external global
+// CHECK-DAG: @const_var_exported = external constant
+//
+// FIXME: The module name should be mangled into all of these.
+// CHECK-DAG: @extern_var_module_linkage = external global
+// FIXME: Should this be 'external global'?
+// CHECK-DAG: @inline_var_module_linkage = linkonce_odr global
+// CHECK-DAG: @_ZL25static_var_module_linkage = external global
+// CHECK-DAG: @_ZL24const_var_module_linkage = external constant
+
+module Module;
+
+void use() {
+  // CHECK: define linkonce_odr {{.*}}@_Z20used_inline_exportedv
+  used_inline_exported();
+  // CHECK: declare {{.*}}@_Z18noninline_exportedv
+  noninline_exported();
+
+  (void)&extern_var_exported;
+  (void)&inline_var_exported;
+  (void)&static_var_exported; // FIXME: Should not be exported.
+  (void)&const_var_exported;
+
+  // FIXME: This symbol should not be visible here.
+  // CHECK: declare {{.*}}@_ZL26used_static_module_linkagev
+  used_static_module_linkage();
+
+  // FIXME: The module name should be mangled into the name of this function.
+  // CHECK: define linkonce_odr {{.*}}@_Z26used_inline_module_linkagev
+  used_inline_module_linkage();
+
+  // FIXME: The module name should be mangled into the name of this function.
+  // CHECK: declare {{.*}}@_Z24noninline_module_linkagev
+  noninline_module_linkage();
+
+  (void)&extern_var_module_linkage;
+  (void)&inline_var_module_linkage;
+  (void)&static_var_module_linkage; // FIXME: Should not be visible here.
+  (void)&const_var_module_linkage;
+}
diff --git a/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cppm b/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cppm
new file mode 100644
index 000000000000..d452f741a0fb
--- /dev/null
+++ b/test/CXX/modules-ts/basic/basic.def.odr/p4/module.cppm
@@ -0,0 +1,118 @@
+// RUN: %clang_cc1 -fmodules-ts %s -triple %itanium_abi_triple -emit-llvm -o - | FileCheck %s --implicit-check-not unused_inline --implicit-check-not unused_static_global_module
+
+// CHECK-DAG: @extern_var_global_module = external global
+// CHECK-DAG: @inline_var_global_module = linkonce_odr global
+// CHECK-DAG: @_ZL24static_var_global_module = internal global
+// CHECK-DAG: @_ZL23const_var_global_module = internal constant
+//
+// For ABI compatibility, these symbols do not include the module name.
+// CHECK-DAG: @extern_var_exported = external global +// FIXME: Should this be 'weak_odr global'? Presumably it must be, since we +// can discard this global and its initializer (if any), and other TUs are not +// permitted to run the initializer for this variable. +// CHECK-DAG: @inline_var_exported = linkonce_odr global +// CHECK-DAG: @_ZL19static_var_exported = global +// CHECK-DAG: @const_var_exported = constant +// +// FIXME: The module name should be mangled into all of these. +// CHECK-DAG: @extern_var_module_linkage = external global +// FIXME: Should this be 'weak_odr global'? Presumably it must be, since we +// can discard this global and its initializer (if any), and other TUs are not +// permitted to run the initializer for this variable. +// CHECK-DAG: @inline_var_module_linkage = linkonce_odr global +// CHECK-DAG: @_ZL25static_var_module_linkage = global +// CHECK-DAG: @_ZL24const_var_module_linkage = constant + +static void unused_static_global_module() {} +static void used_static_global_module() {} + +inline void unused_inline_global_module() {} +inline void used_inline_global_module() {} + +extern int extern_var_global_module; +inline int inline_var_global_module; +static int static_var_global_module; +const int const_var_global_module = 3; + +// CHECK: define void {{.*}}@_Z23noninline_global_modulev +void noninline_global_module() { + // FIXME: This should be promoted to module linkage and given a + // module-mangled name, if it's called from an inline function within + // the module interface. + // (We should try to avoid this when it's not reachable from outside + // the module interface unit.) + // CHECK: define internal {{.*}}@_ZL25used_static_global_modulev + used_static_global_module(); + // CHECK: define linkonce_odr {{.*}}@_Z25used_inline_global_modulev + used_inline_global_module(); + + (void)&extern_var_global_module; + (void)&inline_var_global_module; + (void)&static_var_global_module; + (void)&const_var_global_module; +} + +export module Module; + +export { + // FIXME: These should be ill-formed: you can't export an internal linkage + // symbol, per [dcl.module.interface]p2. + // CHECK: define void {{.*}}@_ZL22unused_static_exportedv + static void unused_static_exported() {} + // CHECK: define void {{.*}}@_ZL20used_static_exportedv + static void used_static_exported() {} + + inline void unused_inline_exported() {} + inline void used_inline_exported() {} + + extern int extern_var_exported; + inline int inline_var_exported; + // FIXME: This should be ill-formed: you can't export an internal linkage + // symbol. + static int static_var_exported; + const int const_var_exported = 3; + + // CHECK: define void {{.*}}@_Z18noninline_exportedv + void noninline_exported() { + used_static_exported(); + // CHECK: define linkonce_odr {{.*}}@_Z20used_inline_exportedv + used_inline_exported(); + + (void)&extern_var_exported; + (void)&inline_var_exported; + (void)&static_var_exported; + (void)&const_var_exported; + } +} + +// FIXME: Ideally we wouldn't emit this as its name is not visible outside this +// TU, but this module interface might contain a template that can use this +// function so we conservatively emit it for now. +// FIXME: The module name should be mangled into the name of this function. +// CHECK: define void {{.*}}@_ZL28unused_static_module_linkagev +static void unused_static_module_linkage() {} +// FIXME: The module name should be mangled into the name of this function. 
+// CHECK: define void {{.*}}@_ZL26used_static_module_linkagev +static void used_static_module_linkage() {} + +inline void unused_inline_module_linkage() {} +inline void used_inline_module_linkage() {} + +extern int extern_var_module_linkage; +inline int inline_var_module_linkage; +static int static_var_module_linkage; +const int const_var_module_linkage = 3; + +// FIXME: The module name should be mangled into the name of this function. +// CHECK: define void {{.*}}@_Z24noninline_module_linkagev +void noninline_module_linkage() { + used_static_module_linkage(); + // FIXME: The module name should be mangled into the name of this function. + // CHECK: define linkonce_odr {{.*}}@_Z26used_inline_module_linkagev + used_inline_module_linkage(); + + (void)&extern_var_module_linkage; + (void)&inline_var_module_linkage; + (void)&static_var_module_linkage; + (void)&const_var_module_linkage; +} diff --git a/test/CXX/modules-ts/basic/basic.def.odr/p4/user.cpp b/test/CXX/modules-ts/basic/basic.def.odr/p4/user.cpp new file mode 100644 index 000000000000..f6e0238c6b4b --- /dev/null +++ b/test/CXX/modules-ts/basic/basic.def.odr/p4/user.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -fmodules-ts %S/module.cppm -triple %itanium_abi_triple -emit-module-interface -o %t +// RUN: %clang_cc1 -fmodules-ts %s -triple %itanium_abi_triple -fmodule-file=%t -emit-llvm -o - | FileCheck %s --implicit-check-not=unused --implicit-check-not=global_module + +// CHECK-DAG: @extern_var_exported = external global +// FIXME: Should this be 'external global'? +// CHECK-DAG: @inline_var_exported = linkonce_odr global +// FIXME: These should be 'extern global' and 'extern constant'. +// CHECK-DAG: @_ZL19static_var_exported = global +// CHECK-DAG: @const_var_exported = constant + +import Module; + +void use() { + // CHECK: define linkonce_odr {{.*}}@_Z20used_inline_exportedv + used_inline_exported(); + // CHECK: declare {{.*}}@_Z18noninline_exportedv + noninline_exported(); + + (void)&extern_var_exported; + (void)&inline_var_exported; + (void)&static_var_exported; + (void)&const_var_exported; + + // Module-linkage declarations are not visible here. +} diff --git a/test/CXX/modules-ts/basic/basic.link/p2/module.cpp b/test/CXX/modules-ts/basic/basic.link/p2/module.cpp new file mode 100644 index 000000000000..3fc6044d8e94 --- /dev/null +++ b/test/CXX/modules-ts/basic/basic.link/p2/module.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -std=c++1z -fmodules-ts %S/module.cppm -emit-module-interface -o %t +// RUN: %clang_cc1 -std=c++1z -fmodules-ts -fmodule-file=%t %s -verify +// expected-no-diagnostics +module M; + +// FIXME: Use of internal linkage entities should be rejected. 
+void use_from_module_impl() { + external_linkage_fn(); + module_linkage_fn(); + internal_linkage_fn(); + (void)external_linkage_class{}; + (void)module_linkage_class{}; + (void)internal_linkage_class{}; + (void)external_linkage_var; + (void)module_linkage_var; + (void)internal_linkage_var; +} diff --git a/test/CXX/modules-ts/basic/basic.link/p2/module.cppm b/test/CXX/modules-ts/basic/basic.link/p2/module.cppm new file mode 100644 index 000000000000..bb261700db84 --- /dev/null +++ b/test/CXX/modules-ts/basic/basic.link/p2/module.cppm @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -std=c++1z -fmodules-ts %s -verify +// expected-no-diagnostics +export module M; + +export int external_linkage_var; +int module_linkage_var; +static int internal_linkage_var; + +export void external_linkage_fn() {} +void module_linkage_fn() {} +static void internal_linkage_fn() {} + +export struct external_linkage_class {}; +struct module_linkage_class {}; +namespace { + struct internal_linkage_class {}; +} + +void use() { + external_linkage_fn(); + module_linkage_fn(); + internal_linkage_fn(); + (void)external_linkage_class{}; + (void)module_linkage_class{}; + (void)internal_linkage_class{}; + (void)external_linkage_var; + (void)module_linkage_var; + (void)internal_linkage_var; +} diff --git a/test/CXX/modules-ts/basic/basic.link/p2/other.cpp b/test/CXX/modules-ts/basic/basic.link/p2/other.cpp new file mode 100644 index 000000000000..8370777e7ed4 --- /dev/null +++ b/test/CXX/modules-ts/basic/basic.link/p2/other.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -std=c++1z -fmodules-ts %S/module.cppm -emit-module-interface -o %t +// RUN: %clang_cc1 -std=c++1z -fmodules-ts -fmodule-file=%t %s -verify +import M; + +void use_from_module_impl() { + external_linkage_fn(); + module_linkage_fn(); // expected-error {{undeclared identifier}} + internal_linkage_fn(); // expected-error {{undeclared identifier}} + (void)external_linkage_class{}; + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + // expected-note@module.cppm:9 {{here}} + (void)external_linkage_var; + (void)module_linkage_var; // expected-error {{undeclared identifier}} + (void)internal_linkage_var; // expected-error {{undeclared identifier}} +} diff --git a/test/CXX/modules-ts/dcl.dcl/dcl.module/dcl.module.import/p1.cpp b/test/CXX/modules-ts/dcl.dcl/dcl.module/dcl.module.import/p1.cpp index aaf43d6584a4..aad31b4b46d1 100644 --- a/test/CXX/modules-ts/dcl.dcl/dcl.module/dcl.module.import/p1.cpp +++ b/test/CXX/modules-ts/dcl.dcl/dcl.module/dcl.module.import/p1.cpp @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t -// RUN: echo 'export module x; int a, b;' > %t/x.cppm -// RUN: echo 'export module x.y; int c;' > %t/x.y.cppm +// RUN: echo 'export module x; export int a, b;' > %t/x.cppm +// RUN: echo 'export module x.y; export int c;' > %t/x.y.cppm // // RUN: %clang_cc1 -std=c++1z -fmodules-ts -emit-module-interface %t/x.cppm -o %t/x.pcm // RUN: %clang_cc1 -std=c++1z -fmodules-ts -emit-module-interface -fmodule-file=%t/x.pcm %t/x.y.cppm -o %t/x.y.pcm diff --git a/test/CXX/modules-ts/dcl.dcl/dcl.module/p5.cpp b/test/CXX/modules-ts/dcl.dcl/dcl.module/p5.cpp new file mode 100644 index 000000000000..734b89173df9 --- /dev/null +++ b/test/CXX/modules-ts/dcl.dcl/dcl.module/p5.cpp @@ -0,0 +1,33 @@ +// RUN: rm -f %t +// RUN: %clang_cc1 -std=c++1z -fmodules-ts -emit-module-interface %s -o %t -DINTERFACE +// RUN: %clang_cc1 -std=c++1z 
-fmodules-ts -fmodule-file=%t %s -verify -DIMPLEMENTATION
+// RUN: %clang_cc1 -std=c++1z -fmodules-ts -fmodule-file=%t %s -verify -DEARLY_IMPLEMENTATION
+// RUN: %clang_cc1 -std=c++1z -fmodules-ts -fmodule-file=%t %s -verify -DUSER
+
+// expected-no-diagnostics
+
+#ifdef USER
+import Foo;
+#endif
+
+#ifdef EARLY_IMPLEMENTATION
+module Foo;
+#endif
+
+template<typename T> struct type_template {
+  typedef T type;
+  void f(type);
+};
+
+template<typename T> void type_template<T>::f(type) {}
+
+template<int = 0, typename = int, template<typename> class = type_template>
+struct default_template_args {};
+
+#ifdef INTERFACE
+export module Foo;
+#endif
+
+#ifdef IMPLEMENTATION
+module Foo;
+#endif
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index cbc2e72fcbac..bcb680c4b518 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -9037,9 +9037,10 @@ int64x2_t test_vld1q_s64(int64_t const *a) {
// CHECK-LABEL: @test_vld1q_f16(
// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK: [[TMP2:%.*]] = load <8 x half>, <8 x half>* [[TMP1]]
-// CHECK: ret <8 x half> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP3]]
float16x8_t test_vld1q_f16(float16_t const *a) {
return vld1q_f16(a);
}
@@ -9151,9 +9152,10 @@ int64x1_t test_vld1_s64(int64_t const *a) {
// CHECK-LABEL: @test_vld1_f16(
// CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK: [[TMP2:%.*]] = load <4 x half>, <4 x half>* [[TMP1]]
-// CHECK: ret <4 x half> [[TMP2]]
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP3]]
float16x4_t test_vld1_f16(float16_t const *a) {
return vld1_f16(a);
}
@@ -9340,10 +9342,10 @@ int64x2x2_t test_vld2q_s64(int64_t const *a) {
// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x half>*
-// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half> }*
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
+// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
@@ -9571,10 +9573,10 @@ int64x1x2_t test_vld2_s64(int64_t const *a) {
// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP2:%.*]] = bitcast i8*
[[TMP1]] to <4 x half>* -// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) @@ -9802,10 +9804,10 @@ int64x2x3_t test_vld3q_s64(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x half>* -// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) @@ -10033,10 +10035,10 @@ int64x1x3_t test_vld3_s64(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x half>* -// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) @@ -10264,10 +10266,10 @@ int64x2x4_t test_vld4q_s64(int64_t const *a) { // CHECK: 
[[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x half>* -// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) @@ -10495,10 +10497,10 @@ int64x1x4_t test_vld4_s64(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x half>* -// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) @@ -10664,9 +10666,9 @@ void test_vst1q_s64(int64_t *a, int64x2_t b) { // CHECK-LABEL: @test_vst1q_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x half>* -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: store <8 x half> [[TMP3]], <8 x half>* [[TMP2]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]] // CHECK: ret void void test_vst1q_f16(float16_t *a, float16x8_t b) { vst1q_f16(a, b); @@ -10798,9 +10800,9 @@ void test_vst1_s64(int64_t *a, int64x1_t b) { // CHECK-LABEL: @test_vst1_f16( // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> 
-// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x half>* -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: store <4 x half> [[TMP3]], <4 x half>* [[TMP2]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>* +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]] // CHECK: ret void void test_vst1_f16(float16_t *a, float16x4_t b) { vst1_f16(a, b); @@ -11054,9 +11056,9 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st2.v8f16.p0i8(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i8* [[TMP2]]) +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]]) // CHECK: ret void void test_vst2q_f16(float16_t *a, float16x8x2_t b) { vst2q_f16(a, b); @@ -11364,9 +11366,9 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st2.v4f16.p0i8(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i8* [[TMP2]]) +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]]) // CHECK: ret void void test_vst2_f16(float16_t *a, float16x4x2_t b) { vst2_f16(a, b); @@ -11714,10 +11716,10 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st3.v8f16.p0i8(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i8* [[TMP2]]) +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]]) // CHECK: ret void void test_vst3q_f16(float16_t *a, float16x8x3_t b) { vst3q_f16(a, b); @@ -12083,10 +12085,10 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = 
load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st3.v4f16.p0i8(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i8* [[TMP2]]) +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]]) // CHECK: ret void void test_vst3_f16(float16_t *a, float16x4x3_t b) { vst3_f16(a, b); @@ -12492,11 +12494,11 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st4.v8f16.p0i8(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i8* [[TMP2]]) +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]]) // CHECK: ret void void test_vst4q_f16(float16_t *a, float16x8x4_t b) { vst4q_f16(a, b); @@ -12920,11 +12922,11 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 // CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st4.v4f16.p0i8(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i8* [[TMP2]]) +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]]) // CHECK: ret void void test_vst4_f16(float16_t *a, float16x4x4_t b) { vst4_f16(a, b); @@ -13206,10 +13208,10 @@ int64x2x2_t test_vld1q_s64_x2(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 // CHECK: 
[[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) @@ -13452,10 +13454,10 @@ int64x1x2_t test_vld1_s64_x2(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) @@ -13698,10 +13700,10 @@ int64x2x3_t test_vld1q_s64_x3(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) @@ -13944,10 +13946,10 @@ int64x1x3_t test_vld1_s64_x3(int64_t const *a) 
{ // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) @@ -14190,10 +14192,10 @@ int64x2x4_t test_vld1q_s64_x4(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) @@ -14436,10 +14438,10 @@ int64x1x4_t test_vld1_s64_x4(int64_t const *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD1XN:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: 
store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) @@ -14750,10 +14752,10 @@ void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x2.v8f16.p0f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], half* [[TMP9]]) +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]]) // CHECK: ret void void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) { vst1q_f16_x2(a, b); @@ -15096,10 +15098,10 @@ void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x2.v4f16.p0f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], half* [[TMP9]]) +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]]) // CHECK: ret void void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) { vst1_f16_x2(a, b); @@ -15482,11 +15484,11 @@ void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x3.v8f16.p0f16(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], half* [[TMP12]]) +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* 
[[TMP12]]) // CHECK: ret void void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) { vst1q_f16_x3(a, b); @@ -15892,11 +15894,11 @@ void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x3.v4f16.p0f16(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], half* [[TMP12]]) +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]]) // CHECK: ret void void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) { vst1_f16_x3(a, b); @@ -16342,12 +16344,12 @@ void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x4.v8f16.p0f16(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], half* [[TMP15]]) +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]]) // CHECK: ret void void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) { vst1q_f16_x4(a, b); @@ -16816,12 +16818,12 @@ void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 // CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to half* -// CHECK: call void @llvm.aarch64.neon.st1x4.v4f16.p0f16(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], half* [[TMP15]]) +// CHECK: 
[[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16* +// CHECK: call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]]) // CHECK: ret void void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) { vst1_f16_x4(a, b); diff --git a/test/CodeGen/aarch64-neon-ldst-one.c b/test/CodeGen/aarch64-neon-ldst-one.c index a3c5b140a0d2..9bd9ab1cb61b 100644 --- a/test/CodeGen/aarch64-neon-ldst-one.c +++ b/test/CodeGen/aarch64-neon-ldst-one.c @@ -90,11 +90,12 @@ int64x2_t test_vld1q_dup_s64(int64_t *a) { // CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]] -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x half> [[LANE]] +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP4]] float16x8_t test_vld1q_dup_f16(float16_t *a) { return vld1q_dup_f16(a); } @@ -238,11 +239,12 @@ int64x1_t test_vld1_dup_s64(int64_t *a) { // CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]] -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x half> [[LANE]] +// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]] +// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0 +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half> +// CHECK: ret <4 x half> [[TMP4]] float16x4_t test_vld1_dup_f16(float16_t *a) { return vld1_dup_f16(a); } @@ -445,10 +447,10 @@ int64x2x2_t test_vld2q_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }* +// 
CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false) @@ -691,10 +693,10 @@ int64x1x2_t test_vld2_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false) @@ -945,10 +947,10 @@ int64x2x3_t test_vld3q_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false) @@ -1205,10 +1207,10 @@ int64x1x3_t test_vld3_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]]) 
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false) @@ -1457,10 +1459,10 @@ int64x2x4_t test_vld4q_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false) @@ -1703,10 +1705,10 @@ int64x1x4_t test_vld4_dup_s64(int64_t *a) { // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* -// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to half* -// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]] +// CHECK: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16* +// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]]) +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]] // CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* // CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false) @@ -1895,11 +1897,12 @@ int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) { // CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: 
[[TMP4:%.*]] = load half, half* [[TMP3]] -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7 -// CHECK: ret <8 x half> [[VLD1_LANE]] +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 +// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half> +// CHECK: ret <8 x half> [[TMP5]] float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); } @@ -2051,11 +2054,12 @@ int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) { // CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]] -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3 -// CHECK: ret <4 x half> [[VLD1_LANE]] +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]] +// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 +// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half> +// CHECK: ret <4 x half> [[TMP5]] float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) { return vld1_lane_f16(a, b, 3); } @@ -2491,11 +2495,11 @@ int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0i8(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, i8* [[TMP3]]) -// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]] +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]] // CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8* // CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false) @@ -2923,11 +2927,11 @@ int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = 
bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0i8(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, i8* [[TMP3]]) -// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]] +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]] // CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8* // CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false) @@ -3360,12 +3364,12 @@ int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0i8(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, i8* [[TMP3]]) -// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]] +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]] // CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8* // CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false) @@ -3885,12 +3889,12 @@ int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, 
<4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0i8(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, i8* [[TMP3]]) -// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]] +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]] // CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8* // CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false) @@ -4450,13 +4454,13 @@ int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half> -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0i8(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, i8* [[TMP3]]) -// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]] +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* +// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]] // CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8* // CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false) @@ -5039,13 +5043,13 @@ int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* 
[[ARRAYIDX6]], align 8 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0i8(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, i8* [[TMP3]]) -// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]] +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> +// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]]) +// CHECK: [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* +// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]] // CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8* // CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false) @@ -5357,10 +5361,10 @@ void test_vst1q_lane_s64(int64_t *a, int64x2_t b) { // CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7 -// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: store half [[TMP3]], half* [[TMP4]] +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] // CHECK: ret void void test_vst1q_lane_f16(float16_t *a, float16x8_t b) { vst1q_lane_f16(a, b, 7); @@ -5513,10 +5517,10 @@ void test_vst1_lane_s64(int64_t *a, int64x1_t b) { // CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 { // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3 -// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half* -// CHECK: store half [[TMP3]], half* [[TMP4]] +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16* +// CHECK: store i16 [[TMP3]], i16* [[TMP4]] // CHECK: ret void void test_vst1_lane_f16(float16_t *a, float16x4_t b) { vst1_lane_f16(a, b, 3); @@ -5785,9 +5789,9 @@ void test_vst2q_lane_s64(int64_t *a, int64x2x2_t 
b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16 // CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0i8(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, i8* [[TMP2]]) +// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]]) // CHECK: ret void void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) { vst2q_lane_f16(a, b, 7); @@ -6120,9 +6124,9 @@ void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) { // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1 // CHECK: [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8 // CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0i8(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, i8* [[TMP2]]) +// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]]) // CHECK: ret void void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); @@ -6495,10 +6499,10 @@ void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0i8(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, i8* [[TMP2]]) +// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]]) // CHECK: ret void void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) { vst3q_lane_f16(a, b, 7); @@ -6894,10 +6898,10 @@ void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) { // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2 // CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0i8(<4 x half> [[TMP9]], <4 x half> 
[[TMP10]], <4 x half> [[TMP11]], i64 3, i8* [[TMP2]]) +// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]]) // CHECK: ret void void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); @@ -7333,11 +7337,11 @@ void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0i8(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, i8* [[TMP2]]) +// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]]) // CHECK: ret void void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) { vst4q_lane_f16(a, b, 7); @@ -7796,11 +7800,11 @@ void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) { // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3 // CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8 // CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0i8(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, i8* [[TMP2]]) +// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]]) // CHECK: ret void void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) { vst4_lane_f16(a, b, 3); diff --git a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c deleted file mode 100644 index 3f61238b64fb..000000000000 --- a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ /dev/null @@ -1,1633 +0,0 @@ -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\ -// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - 
%s \ -// RUN: | opt -S -mem2reg \ -// RUN: | FileCheck %s - -// REQUIRES: aarch64-registered-target - -#include - -// CHECK-LABEL: test_vabs_f16 -// CHECK: [[ABS:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[ABS]] -float16x4_t test_vabs_f16(float16x4_t a) { - return vabs_f16(a); -} - -// CHECK-LABEL: test_vabsq_f16 -// CHECK: [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[ABS]] -float16x8_t test_vabsq_f16(float16x8_t a) { - return vabsq_f16(a); -} - -// CHECK-LABEL: test_vceqz_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vceqz_f16(float16x4_t a) { - return vceqz_f16(a); -} - -// CHECK-LABEL: test_vceqzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vceqzq_f16(float16x8_t a) { - return vceqzq_f16(a); -} - -// CHECK-LABEL: test_vcgez_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcgez_f16(float16x4_t a) { - return vcgez_f16(a); -} - -// CHECK-LABEL: test_vcgezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcgezq_f16(float16x8_t a) { - return vcgezq_f16(a); -} - -// CHECK-LABEL: test_vcgtz_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcgtz_f16(float16x4_t a) { - return vcgtz_f16(a); -} - -// CHECK-LABEL: test_vcgtzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcgtzq_f16(float16x8_t a) { - return vcgtzq_f16(a); -} - -// CHECK-LABEL: test_vclez_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vclez_f16(float16x4_t a) { - return vclez_f16(a); -} - -// CHECK-LABEL: test_vclezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vclezq_f16(float16x8_t a) { - return vclezq_f16(a); -} - -// CHECK-LABEL: test_vcltz_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcltz_f16(float16x4_t a) { - return vcltz_f16(a); -} - -// CHECK-LABEL: test_vcltzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcltzq_f16(float16x8_t a) { - return vcltzq_f16(a); -} - -// CHECK-LABEL: test_vcvt_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] -float16x4_t test_vcvt_f16_s16 (int16x4_t a) { - return vcvt_f16_s16(a); -} - -// CHECK-LABEL: test_vcvtq_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x 
half> [[VCVT]] -float16x8_t test_vcvtq_f16_s16 (int16x8_t a) { - return vcvtq_f16_s16(a); -} - -// CHECK-LABEL: test_vcvt_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] -float16x4_t test_vcvt_f16_u16 (uint16x4_t a) { - return vcvt_f16_u16(a); -} - -// CHECK-LABEL: test_vcvtq_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[VCVT]] -float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) { - return vcvtq_f16_u16(a); -} - -// CHECK-LABEL: test_vcvt_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvt_s16_f16 (float16x4_t a) { - return vcvt_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtq_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { - return vcvtq_s16_f16(a); -} - -// CHECK-LABEL: test_vcvt_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvt_u16_f16 (float16x4_t a) { - return vcvt_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtq_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtq_u16_f16 (float16x8_t a) { - return vcvtq_u16_f16(a); -} - -// CHECK-LABEL: test_vcvta_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvta_s16_f16 (float16x4_t a) { - return vcvta_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtaq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) { - return vcvtaq_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtm_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvtm_s16_f16 (float16x4_t a) { - return vcvtm_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtmq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) { - return vcvtmq_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtm_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) { - return vcvtm_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtmq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) { - return vcvtmq_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtn_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvtn_s16_f16 (float16x4_t a) { - return vcvtn_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtnq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) { - return vcvtnq_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtn_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) { - return 
vcvtn_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtnq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) { - return vcvtnq_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtp_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -int16x4_t test_vcvtp_s16_f16 (float16x4_t a) { - return vcvtp_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtpq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) { - return vcvtpq_s16_f16(a); -} - -// CHECK-LABEL: test_vcvtp_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] -uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) { - return vcvtp_u16_f16(a); -} - -// CHECK-LABEL: test_vcvtpq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] -uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) { - return vcvtpq_u16_f16(a); -} - -// FIXME: Fix the zero constant when fp16 non-storage-only type becomes available. -// CHECK-LABEL: test_vneg_f16 -// CHECK: [[NEG:%.*]] = fsub <4 x half> , %a -// CHECK: ret <4 x half> [[NEG]] -float16x4_t test_vneg_f16(float16x4_t a) { - return vneg_f16(a); -} - -// CHECK-LABEL: test_vnegq_f16 -// CHECK: [[NEG:%.*]] = fsub <8 x half> , %a -// CHECK: ret <8 x half> [[NEG]] -float16x8_t test_vnegq_f16(float16x8_t a) { - return vnegq_f16(a); -} - -// CHECK-LABEL: test_vrecpe_f16 -// CHECK: [[RCP:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RCP]] -float16x4_t test_vrecpe_f16(float16x4_t a) { - return vrecpe_f16(a); -} - -// CHECK-LABEL: test_vrecpeq_f16 -// CHECK: [[RCP:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RCP]] -float16x8_t test_vrecpeq_f16(float16x8_t a) { - return vrecpeq_f16(a); -} - -// CHECK-LABEL: test_vrnd_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrnd_f16(float16x4_t a) { - return vrnd_f16(a); -} - -// CHECK-LABEL: test_vrndq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndq_f16(float16x8_t a) { - return vrndq_f16(a); -} - -// CHECK-LABEL: test_vrnda_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrnda_f16(float16x4_t a) { - return vrnda_f16(a); -} - -// CHECK-LABEL: test_vrndaq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndaq_f16(float16x8_t a) { - return vrndaq_f16(a); -} - -// CHECK-LABEL: test_vrndi_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrndi_f16(float16x4_t a) { - return vrndi_f16(a); -} - -// CHECK-LABEL: test_vrndiq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndiq_f16(float16x8_t a) { - return vrndiq_f16(a); -} - -// CHECK-LABEL: test_vrndm_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x 
half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrndm_f16(float16x4_t a) { - return vrndm_f16(a); -} - -// CHECK-LABEL: test_vrndmq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndmq_f16(float16x8_t a) { - return vrndmq_f16(a); -} - -// CHECK-LABEL: test_vrndn_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.aarch64.neon.frintn.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrndn_f16(float16x4_t a) { - return vrndn_f16(a); -} - -// CHECK-LABEL: test_vrndnq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.aarch64.neon.frintn.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndnq_f16(float16x8_t a) { - return vrndnq_f16(a); -} - -// CHECK-LABEL: test_vrndp_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrndp_f16(float16x4_t a) { - return vrndp_f16(a); -} - -// CHECK-LABEL: test_vrndpq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndpq_f16(float16x8_t a) { - return vrndpq_f16(a); -} - -// CHECK-LABEL: test_vrndx_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrndx_f16(float16x4_t a) { - return vrndx_f16(a); -} - -// CHECK-LABEL: test_vrndxq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrndxq_f16(float16x8_t a) { - return vrndxq_f16(a); -} - -// CHECK-LABEL: test_vrsqrte_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] -float16x4_t test_vrsqrte_f16(float16x4_t a) { - return vrsqrte_f16(a); -} - -// CHECK-LABEL: test_vrsqrteq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] -float16x8_t test_vrsqrteq_f16(float16x8_t a) { - return vrsqrteq_f16(a); -} - -// CHECK-LABEL: test_vsqrt_f16 -// CHECK: [[SQR:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[SQR]] -float16x4_t test_vsqrt_f16(float16x4_t a) { - return vsqrt_f16(a); -} - -// CHECK-LABEL: test_vsqrtq_f16 -// CHECK: [[SQR:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[SQR]] -float16x8_t test_vsqrtq_f16(float16x8_t a) { - return vsqrtq_f16(a); -} - -// CHECK-LABEL: test_vadd_f16 -// CHECK: [[ADD:%.*]] = fadd <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] -float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) { - return vadd_f16(a, b); -} - -// CHECK-LABEL: test_vaddq_f16 -// CHECK: [[ADD:%.*]] = fadd <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] -float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) { - return vaddq_f16(a, b); -} - -// CHECK-LABEL: test_vabd_f16 -// CHECK: [[ABD:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ABD]] -float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) { - return vabd_f16(a, b); -} - -// CHECK-LABEL: test_vabdq_f16 -// CHECK: [[ABD:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[ABD]] -float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) { - return vabdq_f16(a, b); -} - -// CHECK-LABEL: test_vcage_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] -uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) { - return vcage_f16(a, b); -} - -// CHECK-LABEL: test_vcageq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] -uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) { - return vcageq_f16(a, b); -} - -// CHECK-LABEL: test_vcagt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] -uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) { - return vcagt_f16(a, b); -} - -// CHECK-LABEL: test_vcagtq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] -uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) { - return vcagtq_f16(a, b); -} - -// CHECK-LABEL: test_vcale_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> %b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] -uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) { - return vcale_f16(a, b); -} - -// CHECK-LABEL: test_vcaleq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] -uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) { - return vcaleq_f16(a, b); -} - -// CHECK-LABEL: test_vcalt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] -uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) { - return vcalt_f16(a, b); -} - -// CHECK-LABEL: test_vcaltq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] -uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) { - return vcaltq_f16(a, b); -} - -// CHECK-LABEL: test_vceq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) { - return vceq_f16(a, b); -} - -// CHECK-LABEL: test_vceqq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) { - return vceqq_f16(a, b); -} - -// CHECK-LABEL: test_vcge_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) { - return vcge_f16(a, b); -} - -// CHECK-LABEL: test_vcgeq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) { - return vcgeq_f16(a, b); -} - -// CHECK-LABEL: test_vcgt_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) { - return vcgt_f16(a, b); -} - -// CHECK-LABEL: test_vcgtq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: 
ret <8 x i16> [[TMP2]] -uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) { - return vcgtq_f16(a, b); -} - -// CHECK-LABEL: test_vcle_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) { - return vcle_f16(a, b); -} - -// CHECK-LABEL: test_vcleq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) { - return vcleq_f16(a, b); -} - -// CHECK-LABEL: test_vclt_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] -uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) { - return vclt_f16(a, b); -} - -// CHECK-LABEL: test_vcltq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] -uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) { - return vcltq_f16(a, b); -} - -// CHECK-LABEL: test_vcvt_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] -float16x4_t test_vcvt_n_f16_s16(int16x4_t a) { - return vcvt_n_f16_s16(a, 2); -} - -// CHECK-LABEL: test_vcvtq_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] -float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) { - return vcvtq_n_f16_s16(a, 2); -} - -// CHECK-LABEL: test_vcvt_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] -float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) { - return vcvt_n_f16_u16(a, 2); -} - -// CHECK-LABEL: test_vcvtq_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] -float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) { - return vcvtq_n_f16_u16(a, 2); -} - -// CHECK-LABEL: test_vcvt_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] -int16x4_t test_vcvt_n_s16_f16(float16x4_t a) { - return vcvt_n_s16_f16(a, 2); -} - -// CHECK-LABEL: test_vcvtq_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] -int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) { - return vcvtq_n_s16_f16(a, 2); -} - -// CHECK-LABEL: test_vcvt_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] -uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) { - return vcvt_n_u16_f16(a, 2); -} - -// CHECK-LABEL: test_vcvtq_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] -uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) { - return vcvtq_n_u16_f16(a, 2); -} - -// CHECK-LABEL: test_vdiv_f16 -// CHECK: [[DIV:%.*]] = fdiv <4 x half> %a, %b -// CHECK: ret <4 x half> [[DIV]] -float16x4_t test_vdiv_f16(float16x4_t a, float16x4_t b) { - return vdiv_f16(a, b); -} - -// CHECK-LABEL: 
test_vdivq_f16 -// CHECK: [[DIV:%.*]] = fdiv <8 x half> %a, %b -// CHECK: ret <8 x half> [[DIV]] -float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) { - return vdivq_f16(a, b); -} - -// CHECK-LABEL: test_vmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] -float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) { - return vmax_f16(a, b); -} - -// CHECK-LABEL: test_vmaxq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] -float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) { - return vmaxq_f16(a, b); -} - -// CHECK-LABEL: test_vmaxnm_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] -float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) { - return vmaxnm_f16(a, b); -} - -// CHECK-LABEL: test_vmaxnmq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] -float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) { - return vmaxnmq_f16(a, b); -} - -// CHECK-LABEL: test_vmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) { - return vmin_f16(a, b); -} - -// CHECK-LABEL: test_vminq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) { - return vminq_f16(a, b); -} - -// CHECK-LABEL: test_vminnm_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) { - return vminnm_f16(a, b); -} - -// CHECK-LABEL: test_vminnmq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) { - return vminnmq_f16(a, b); -} - -// CHECK-LABEL: test_vmul_f16 -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, %b -// CHECK: ret <4 x half> [[MUL]] -float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) { - return vmul_f16(a, b); -} - -// CHECK-LABEL: test_vmulq_f16 -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, %b -// CHECK: ret <8 x half> [[MUL]] -float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) { - return vmulq_f16(a, b); -} - -// CHECK-LABEL: test_vmulx_f16 -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MUL]] -float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) { - return vmulx_f16(a, b); -} - -// CHECK-LABEL: test_vmulxq_f16 -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MUL]] -float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) { - return vmulxq_f16(a, b); -} - -// CHECK-LABEL: test_vpadd_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.addp.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ADD]] -float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { - return vpadd_f16(a, b); -} - -// CHECK-LABEL: test_vpaddq_f16 -// CHECK: [[ADD:%.*]] = call <8 x half> 
@llvm.aarch64.neon.addp.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[ADD]] -float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) { - return vpaddq_f16(a, b); -} - -// CHECK-LABEL: test_vpmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] -float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) { - return vpmax_f16(a, b); -} - -// CHECK-LABEL: test_vpmaxq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] -float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) { - return vpmaxq_f16(a, b); -} - -// CHECK-LABEL: test_vpmaxnm_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] -float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) { - return vpmaxnm_f16(a, b); -} - -// CHECK-LABEL: test_vpmaxnmq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] -float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) { - return vpmaxnmq_f16(a, b); -} - -// CHECK-LABEL: test_vpmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) { - return vpmin_f16(a, b); -} - -// CHECK-LABEL: test_vpminq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) { - return vpminq_f16(a, b); -} - -// CHECK-LABEL: test_vpminnm_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) { - return vpminnm_f16(a, b); -} - -// CHECK-LABEL: test_vpminnmq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) { - return vpminnmq_f16(a, b); -} - -// CHECK-LABEL: test_vrecps_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) { - return vrecps_f16(a, b); -} - -// CHECK-LABEL: test_vrecpsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) { - return vrecpsq_f16(a, b); -} - -// CHECK-LABEL: test_vrsqrts_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] -float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) { - return vrsqrts_f16(a, b); -} - -// CHECK-LABEL: test_vrsqrtsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] -float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) { - return vrsqrtsq_f16(a, b); -} - -// CHECK-LABEL: test_vsub_f16 -// CHECK: [[ADD:%.*]] = fsub <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] -float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) { - return 
vsub_f16(a, b); -} - -// CHECK-LABEL: test_vsubq_f16 -// CHECK: [[ADD:%.*]] = fsub <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] -float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) { - return vsubq_f16(a, b); -} - -// CHECK-LABEL: test_vfma_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] -float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { - return vfma_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmaq_f16 -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a) -// CHECK: ret <8 x half> [[ADD]] -float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmaq_f16(a, b, c); -} - -// CHECK-LABEL: test_vfms_f16 -// CHECK: [[SUB:%.*]] = fsub <4 x half> , %b -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] -float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) { - return vfms_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmsq_f16 -// CHECK: [[SUB:%.*]] = fsub <8 x half> , %b -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a) -// CHECK: ret <8 x half> [[ADD]] -float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmsq_f16(a, b, c); -} - -// CHECK-LABEL: test_vfma_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) -// CHECK: ret <4 x half> [[FMLA]] -float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { - return vfma_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfmaq_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) -// CHECK: ret <8 x half> [[FMLA]] -float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { - return vfmaq_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfma_laneq_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// CHECK: [[FMLA:%.*]] = call <4 x half> 
@llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CHECK: ret <4 x half> [[FMLA]] -float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { - return vfma_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vfmaq_laneq_f16 -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CHECK: ret <8 x half> [[FMLA]] -float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmaq_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vfma_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 -// CHECK: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a) -// CHECK: ret <4 x half> [[FMA]] -float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { - return vfma_n_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmaq_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// CHECK: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a) -// CHECK: ret <8 x half> [[FMA]] -float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { - return vfmaq_n_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmah_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3 -// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CHECK: ret half [[FMA]] -float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) { - return vfmah_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfmah_laneq_f16 -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7 -// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CHECK: ret half [[FMA]] -float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { - return vfmah_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vfms_lane_f16 -// CHECK: [[SUB:%.*]] = fsub <4 x half> , %b -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> 
%a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) -// CHECK: ret <4 x half> [[FMA]] -float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { - return vfms_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfmsq_lane_f16 -// CHECK: [[SUB:%.*]] = fsub <8 x half> , %b -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) -// CHECK: ret <8 x half> [[FMLA]] -float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { - return vfmsq_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfms_laneq_f16 -// CHECK: [[SUB:%.*]] = fsub <4 x half> , %b -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// CHECK: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CHECK: ret <4 x half> [[FMLA]] -float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { - return vfms_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vfmsq_laneq_f16 -// CHECK: [[SUB:%.*]] = fsub <8 x half> , %b -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CHECK: ret <8 x half> [[FMLA]] -float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { - return vfmsq_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vfms_n_f16 -// CHECK: [[SUB:%.*]] = fsub <4 x half> , %b -// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// CHECK: [[TMP3:%.*]] 
= insertelement <4 x half> [[TMP2]], half %c, i32 3 -// CHECK: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a) -// CHECK: ret <4 x half> [[FMA]] -float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { - return vfms_n_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmsq_n_f16 -// CHECK: [[SUB:%.*]] = fsub <8 x half> , %b -// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// CHECK: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a) -// CHECK: ret <8 x half> [[FMA]] -float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { - return vfmsq_n_f16(a, b, c); -} - -// CHECK-LABEL: test_vfmsh_lane_f16 -// CHECK: [[TMP0:%.*]] = fpext half %b to float -// CHECK: [[TMP1:%.*]] = fsub float -0.000000e+00, [[TMP0]] -// CHECK: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP3]], i32 3 -// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CHECK: ret half [[FMA]] -float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { - return vfmsh_lane_f16(a, b, c, 3); -} - -// CHECK-LABEL: test_vfmsh_laneq_f16 -// CHECK: [[TMP0:%.*]] = fpext half %b to float -// CHECK: [[TMP1:%.*]] = fsub float -0.000000e+00, [[TMP0]] -// CHECK: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP3]], i32 7 -// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CHECK: ret half [[FMA]] -float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { - return vfmsh_laneq_f16(a, b, c, 7); -} - -// CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] -// CHECK: ret <4 x half> [[MUL]] -float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { - return vmul_lane_f16(a, b, 3); -} - -// CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] -// CHECK: ret <8 x half> [[MUL]] -float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); -} - -// CHECK-LABEL: test_vmul_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] -// CHECK: ret <4 x half> [[MUL]] -float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { - return vmul_laneq_f16(a, b, 7); -} - -// CHECK-LABEL: test_vmulq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> 
-// CHECK-LABEL: test_vmulq_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]]
-// CHECK: ret <8 x half> [[MUL]]
-float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) {
-  return vmulq_laneq_f16(a, b, 7);
-}
-
-// CHECK-LABEL: test_vmul_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP3]]
-// CHECK: ret <4 x half> [[MUL]]
-float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
-  return vmul_n_f16(a, b);
-}
-
-// CHECK-LABEL: test_vmulq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %b, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %b, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %b, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %b, i32 7
-// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP7]]
-// CHECK: ret <8 x half> [[MUL]]
-float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
-  return vmulq_n_f16(a, b);
-}
-
-// FIXME: Fix it when fp16 non-storage-only type becomes available.
-// CHECK-LABEL: test_vmulh_lane_f16
-// CHECK: [[CONV0:%.*]] = fpext half %a to float
-// CHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CHECK: ret half [[CONV3:%.*]]
-float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
-  return vmulh_lane_f16(a, b, 3);
-}
-
-// CHECK-LABEL: test_vmulh_laneq_f16
-// CHECK: [[CONV0:%.*]] = fpext half %a to float
-// CHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CHECK: ret half [[CONV3:%.*]]
-float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
-  return vmulh_laneq_f16(a, b, 7);
-}
-
-// CHECK-LABEL: test_vmulx_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]])
-// CHECK: ret <4 x half> [[MUL]]
-float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
-  return vmulx_lane_f16(a, b, 3);
-}
-
-// CHECK-LABEL: test_vmulxq_lane_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]])
-// CHECK: ret <8 x half> [[MUL]]
-float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
-  return vmulxq_lane_f16(a, b, 7);
-}
-
-// CHECK-LABEL: test_vmulx_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]])
-// CHECK: ret <4 x half> [[MUL]]
-float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
-  return vmulx_laneq_f16(a, b, 7);
-}
-
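Unlike the plain multiplies, vmulx maps to the AArch64 FMULX instruction via a target intrinsic. A scalar model of the semantics exercised here, assuming the usual architectural rule that zero times infinity returns 2.0 with the product's sign (helper name is hypothetical):

#include <math.h>

static float fmulx_model(float x, float y) {
  /* FMULX: as fmul, except (+/-0) * (+/-inf) yields +/-2.0 instead of NaN. */
  if ((x == 0.0f && isinf(y)) || (isinf(x) && y == 0.0f))
    return (signbit(x) != signbit(y)) ? -2.0f : 2.0f;
  return x * y;
}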
-// CHECK-LABEL: test_vmulxq_laneq_f16
-// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]])
-// CHECK: ret <8 x half> [[MUL]]
-float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
-  return vmulxq_laneq_f16(a, b, 7);
-}
-
-// CHECK-LABEL: test_vmulx_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP3]])
-// CHECK: ret <4 x half> [[MUL]]
-float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
-  return vmulx_n_f16(a, b);
-}
-
-// CHECK-LABEL: test_vmulxq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %b, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %b, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %b, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %b, i32 7
-// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP7]])
-// CHECK: ret <8 x half> [[MUL]]
-float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
-  return vmulxq_n_f16(a, b);
-}
-
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
-float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
-  return vmulxh_lane_f16(a, b, 3);
-}
-
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
-float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
-  return vmulxh_laneq_f16(a, b, 7);
-}
-*/
-
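The across-lane reductions that follow collapse a whole vector into one half element through the @llvm.aarch64.neon.f{max,min}{,nm}v intrinsics. A loop model of vmaxv_f16, assuming <arm_neon.h> and plain fmax ordering (helper name is hypothetical):

#include <arm_neon.h>

static float16_t ref_vmaxv_f16(float16x4_t v) {
  float16_t lanes[4];
  vst1_f16(lanes, v);              /* spill the lanes to memory */
  float16_t m = lanes[0];
  for (int i = 1; i < 4; ++i)      /* reduce with an ordinary compare */
    if (lanes[i] > m)
      m = lanes[i];
  return m;
}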
-// CHECK-LABEL: test_vmaxv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vmaxv_f16(float16x4_t a) {
-  return vmaxv_f16(a);
-}
-
-// CHECK-LABEL: test_vmaxvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vmaxvq_f16(float16x8_t a) {
-  return vmaxvq_f16(a);
-}
-
-// CHECK-LABEL: test_vminv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vminv_f16(float16x4_t a) {
-  return vminv_f16(a);
-}
-
-// CHECK-LABEL: test_vminvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vminvq_f16(float16x8_t a) {
-  return vminvq_f16(a);
-}
-
-// CHECK-LABEL: test_vmaxnmv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vmaxnmv_f16(float16x4_t a) {
-  return vmaxnmv_f16(a);
-}
-
-// CHECK-LABEL: test_vmaxnmvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vmaxnmvq_f16(float16x8_t a) {
-  return vmaxnmvq_f16(a);
-}
-
-// CHECK-LABEL: test_vminnmv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vminnmv_f16(float16x4_t a) {
-  return vminnmv_f16(a);
-}
-
-// CHECK-LABEL: test_vminnmvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
-float16_t test_vminnmvq_f16(float16x8_t a) {
-  return vminnmvq_f16(a);
-}
-
-// CHECK-LABEL: test_vbsl_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = and <4 x i16> %a, [[TMP2]]
-// CHECK: [[TMP5:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK: [[TMP6:%.*]] = and <4 x i16> [[TMP5]], [[TMP3]]
-// CHECK: [[TMP7:%.*]] = or <4 x i16> [[TMP4]], [[TMP6]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <4 x half>
-// CHECK: ret <4 x half> [[TMP8]]
-float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
-  return vbsl_f16(a, b, c);
-}
-
-// CHECK-LABEL: test_vbslq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = and <8 x i16> %a, [[TMP2]]
-// CHECK: [[TMP5:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK: [[TMP6:%.*]] = and <8 x i16> [[TMP5]], [[TMP3]]
-// CHECK: [[TMP7:%.*]] = or <8 x i16> [[TMP4]], [[TMP6]]
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <8 x half>
-// CHECK: ret <8 x half> [[TMP8]]
-float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
-  return vbslq_f16(a, b, c);
-}
-
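vbsl is a pure bitwise select, which is why the checks above operate on the i16 storage type: each result bit comes from b where the mask a has a 1 bit and from c where it has a 0 bit. The scalar identity on a plain integer (helper name is hypothetical):

static unsigned short bsl16(unsigned short a, unsigned short b, unsigned short c) {
  /* matches the and / xor-with-all-ones / and / or chain in the checks */
  return (unsigned short)((a & b) | (~a & c));
}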
-// CHECK-LABEL: test_vzip_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
-float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
-  return vzip_f16(a, b);
-}
-
-// CHECK-LABEL: test_vzipq_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
-float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
-  return vzipq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vuzp_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
-float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
-  return vuzp_f16(a, b);
-}
-
-// CHECK-LABEL: test_vuzpq_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
-float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
-  return vuzpq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vtrn_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
-float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
-  return vtrn_f16(a, b);
-}
-
-// CHECK-LABEL: test_vtrnq_f16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK: [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
-// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
-float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
-  return vtrnq_f16(a, b);
-}
-
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %a, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %a, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %a, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %a, i32 3
-// CHECK: ret <4 x half> [[TMP3]]
-float16x4_t test_vmov_n_f16(float16_t a) {
-  return vmov_n_f16(a);
-}
-
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %a, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %a, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %a, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %a, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %a, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %a, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %a, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %a, i32 7
-// CHECK: ret <8 x half> [[TMP7]]
-float16x8_t test_vmovq_n_f16(float16_t a) {
-  return vmovq_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %a, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %a, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %a, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %a, i32 3
-// CHECK: ret <4 x half> [[TMP3]]
-float16x4_t test_vdup_n_f16(float16_t a) {
-  return vdup_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %a, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %a, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %a, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %a, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %a, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %a, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %a, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %a, i32 7
-// CHECK: ret <8 x half> [[TMP7]]
-float16x8_t test_vdupq_n_f16(float16_t a) {
-  return vdupq_n_f16(a);
-}
-
-// CHECK-LABEL: test_vdup_lane_f16
-// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: ret <4 x half> [[SHFL]]
-float16x4_t test_vdup_lane_f16(float16x4_t a) {
-  return vdup_lane_f16(a, 3);
-}
-
-// CHECK-LABEL: test_vdupq_lane_f16
-// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: ret <8 x half> [[SHFL]]
-float16x8_t test_vdupq_lane_f16(float16x4_t a) {
-  return vdupq_lane_f16(a, 7);
-}
-
-// CHECK-LABEL: @test_vext_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-// CHECK: ret <4 x half> [[VEXT]]
-float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
-  return vext_f16(a, b, 2);
-}
-
-// CHECK-LABEL: @test_vextq_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
-// CHECK: ret <8 x half> [[VEXT]]
-float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
-  return vextq_f16(a, b, 5);
-}
-
-// CHECK-LABEL: @test_vrev64_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x half> [[SHFL]]
-float16x4_t test_vrev64_f16(float16x4_t a) {
-  return vrev64_f16(a);
-}
-
-// CHECK-LABEL: @test_vrev64q_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x half> [[SHFL]]
-float16x8_t test_vrev64q_f16(float16x8_t a) {
-  return vrev64q_f16(a);
-}
-
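The zip1/zip2, uzp1/uzp2 and trn1/trn2 tests that follow are each a single shufflevector; only the lane masks differ. For two N-lane inputs (a supplying lanes 0..N-1 and b supplying lanes N..2N-1) the masks can be generated mechanically, e.g. for zip1 (helper name is hypothetical):

static void zip1_mask(int n, int *mask) {
  /* zip1 interleaves the low halves of a and b: 0, n, 1, n+1, ... */
  for (int i = 0; i < n / 2; ++i) {
    mask[2 * i]     = i;
    mask[2 * i + 1] = n + i;
  }
}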
-// CHECK-LABEL: @test_vzip1_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x half> [[SHFL]]
-float16x4_t test_vzip1_f16(float16x4_t a, float16x4_t b) {
-  return vzip1_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vzip1q_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x half> [[SHFL]]
-float16x8_t test_vzip1q_f16(float16x8_t a, float16x8_t b) {
-  return vzip1q_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vzip2_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x half> [[SHFL]]
-float16x4_t test_vzip2_f16(float16x4_t a, float16x4_t b) {
-  return vzip2_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vzip2q_f16(
-// CHECK: [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x half> [[SHFL]]
-float16x8_t test_vzip2q_f16(float16x8_t a, float16x8_t b) {
-  return vzip2q_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vuzp1_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
-float16x4_t test_vuzp1_f16(float16x4_t a, float16x4_t b) {
-  return vuzp1_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vuzp1q_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
-float16x8_t test_vuzp1q_f16(float16x8_t a, float16x8_t b) {
-  return vuzp1q_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vuzp2_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
-float16x4_t test_vuzp2_f16(float16x4_t a, float16x4_t b) {
-  return vuzp2_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vuzp2q_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
-float16x8_t test_vuzp2q_f16(float16x8_t a, float16x8_t b) {
-  return vuzp2q_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vtrn1_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
-float16x4_t test_vtrn1_f16(float16x4_t a, float16x4_t b) {
-  return vtrn1_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vtrn1q_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
-float16x8_t test_vtrn1q_f16(float16x8_t a, float16x8_t b) {
-  return vtrn1q_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vtrn2_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
-float16x4_t test_vtrn2_f16(float16x4_t a, float16x4_t b) {
-  return vtrn2_f16(a, b);
-}
-
-// CHECK-LABEL: @test_vtrn2q_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
-float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
-  return vtrn2q_f16(a, b);
-}
-
diff --git a/test/CodeGen/address-space.c b/test/CodeGen/address-space.c
index 54e059385772..28b3954ab7d2 100644
--- a/test/CodeGen/address-space.c
+++ b/test/CodeGen/address-space.c
@@ -24,11 +24,13 @@ int test2(int i) { return ban[i]; }
 __attribute__((address_space(2))) int *A, *B;
 // CHECK-LABEL: define void @test3()
-// GIZ: load i32 addrspace(2)*, i32 addrspace(2)** @B
-// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* @B
+// X86: load i32 addrspace(2)*, i32 addrspace(2)** @B
+// AMDGIZ: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @B to i32 addrspace(2)**)
+// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* addrspacecast (i32 addrspace(2)* addrspace(1)* @B to i32 addrspace(2)* addrspace(4)*)
 // CHECK: load i32, i32 addrspace(2)*
-// GIZ: load i32 addrspace(2)*, i32 addrspace(2)** @A
-// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* @A
+// X86: load i32 addrspace(2)*, i32 addrspace(2)** @A
+// AMDGIZ: load i32 addrspace(2)*, i32 addrspace(2)** addrspacecast (i32 addrspace(2)* addrspace(1)* @A to i32 addrspace(2)**)
+// PIZ: load i32 addrspace(2)*, i32 addrspace(2)* addrspace(4)* addrspacecast (i32 addrspace(2)* addrspace(1)* @A to i32 addrspace(2)* addrspace(4)*)
 // CHECK: store i32 {{.*}}, i32 addrspace(2)*
 void test3() {
   *A = *B;
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index b01c90c03a96..62888dd73339 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -3896,8 +3896,9 @@ int64x2_t test_vld1q_s64(int64_t const * a) {
 // CHECK-LABEL: @test_vld1q_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK: ret <8 x half> [[VLD1]]
+// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP1]]
 float16x8_t test_vld1q_f16(float16_t const * a) {
   return vld1q_f16(a);
 }
@@ -3989,8 +3990,9 @@ int64x1_t test_vld1_s64(int64_t const * a) {
 // CHECK-LABEL: @test_vld1_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK: ret <4 x half> [[VLD1]]
+// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP1]]
 float16x4_t test_vld1_f16(float16_t const * a) {
   return vld1_f16(a);
 }
@@ -4104,11 +4106,12 @@ int64x2_t test_vld1q_dup_s64(int64_t const * a) {
 // CHECK-LABEL: @test_vld1q_dup_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x half> [[LANE]]
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t const * a) {
   return vld1q_dup_f16(a);
 }
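The rewritten checks in this file reflect that on 32-bit ARM half is a storage-only type, so float16 vector loads and stores are now emitted through the matching integer type plus a bitcast. The same idea expressed in C, assuming a little-endian target and clang's __fp16 extension (helper name is hypothetical):

#include <stdint.h>
#include <string.h>

static __fp16 load_f16(const void *p) {
  uint16_t bits;
  memcpy(&bits, p, sizeof bits);   /* the i16 load the checks now expect */
  __fp16 h;
  memcpy(&h, &bits, sizeof h);     /* reinterpret the bits as half (the bitcast) */
  return h;
}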
@@ -4230,11 +4233,12 @@ int64x1_t test_vld1_dup_s64(int64_t const * a) {
 // CHECK-LABEL: @test_vld1_dup_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x half> [[LANE]]
+// CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t const * a) {
   return vld1_dup_f16(a);
 }
@@ -4361,11 +4365,12 @@ int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
 // CHECK-LABEL: @test_vld1q_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK: ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
@@ -4493,11 +4498,12 @@ int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
 // CHECK-LABEL: @test_vld1_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK: ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
@@ -4590,7 +4596,7 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
 float16x8x2_t test_vld2q_f16(float16_t const * a) {
   return vld2q_f16(a);
 }
@@ -4695,7 +4701,7 @@ int64x1x2_t test_vld2_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
+// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_f16(float16_t const * a) {
   return vld2_f16(a);
 }
@@ -4800,7 +4806,7 @@ int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x half>, <4 x half>
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
   return vld2_dup_f16(a);
 }
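The vld2/vld3/vld4 hunks below only change the IR element type; the deinterleaving behaviour is unchanged. A usage sketch for the two-vector form, assuming <arm_neon.h> (helper name is hypothetical):

#include <arm_neon.h>

static void deinterleave_f16(const float16_t *p, float16x4_t *even, float16x4_t *odd) {
  float16x4x2_t v = vld2_f16(p); /* val[0] gets elements 0,2,4,6; val[1] gets 1,3,5,7 */
  *even = v.val[0];
  *odd  = v.val[1];
}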
@@ -4959,9 +4965,9 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
 // CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
@@ -5192,9 +5198,9 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
 // CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
 float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
   return vld2_lane_f16(a, b, 3);
 }
@@ -5331,7 +5337,7 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
+// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x3_t test_vld3q_f16(float16_t const * a) {
   return vld3q_f16(a);
 }
@@ -5436,7 +5442,7 @@ int64x1x3_t test_vld3_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_f16(float16_t const * a) {
   return vld3_f16(a);
 }
@@ -5541,7 +5547,7 @@ int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
   return vld3_dup_f16(a);
 }
@@ -5724,10 +5730,10 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
 // CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
@@ -5998,10 +6004,10 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
 // CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
   return vld3_lane_f16(a, b, 3);
 }
@@ -6151,7 +6157,7 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x4_t test_vld4q_f16(float16_t const * a) {
   return vld4q_f16(a);
 }
@@ -6256,7 +6262,7 @@ int64x1x4_t test_vld4_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_f16(float16_t const * a) {
   return vld4_f16(a);
 }
@@ -6361,7 +6367,7 @@ int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
 // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
 // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK: [[VLD_DUP:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
   return vld4_dup_f16(a);
 }
@@ -6568,11 +6574,11 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
 // CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
@@ -6883,11 +6889,11 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
 // CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
 float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }
@@ -15778,8 +15784,8 @@ void test_vst1q_s64(int64_t * a, int64x2_t b) {
 // CHECK-LABEL: @test_vst1q_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* [[TMP0]], <8 x half> [[TMP2]], i32 2)
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
 // CHECK: ret void
 void test_vst1q_f16(float16_t * a, float16x8_t b) {
   vst1q_f16(a, b);
@@ -15889,8 +15895,8 @@ void test_vst1_s64(int64_t * a, int64x1_t b) {
 // CHECK-LABEL: @test_vst1_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* [[TMP0]], <4 x half> [[TMP2]], i32 2)
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
 // CHECK: ret void
 void test_vst1_f16(float16_t * a, float16x4_t b) {
   vst1_f16(a, b);
@@ -16012,10 +16018,10 @@ void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
 // CHECK-LABEL: @test_vst1q_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
-// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: store half [[TMP3]], half* [[TMP4]], align 2
+// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
 // CHECK: ret void
 void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
@@ -16144,10 +16150,10 @@ void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
 // CHECK-LABEL: @test_vst1_lane_f16(
 // CHECK: [[TMP0:%.*]] = bitcast half* %a to i8*
 // CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
-// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK: store half [[TMP3]], half* [[TMP4]], align 2
+// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK: [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK: store i16 [[TMP3]], i16* [[TMP4]], align 2
 // CHECK: ret void
 void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
@@ -16349,9 +16355,9 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2)
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
 // CHECK: ret void
 void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
   vst2q_f16(a, b);
@@ -16646,9 +16652,9 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
 // CHECK: ret void
 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
   vst2_f16(a, b);
@@ -16849,9 +16855,9 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
 // CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
+// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
 // CHECK: ret void
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
@@ -17073,9 +17079,9 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
 // CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
 // CHECK: [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
 // CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
+// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
 // CHECK: ret void
 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
   vst2_lane_f16(a, b, 3);
@@ -17348,10 +17354,10 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2)
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
 // CHECK: ret void
 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
   vst3q_f16(a, b);
@@ -17699,10 +17705,10 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2)
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
 // CHECK: ret void
 void test_vst3_f16(float16_t * a, float16x4x3_t b) {
   vst3_f16(a, b);
@@ -17940,10 +17946,10 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
 // CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
+// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
 // CHECK: ret void
 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
@@ -18205,10 +18211,10 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
 // CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
 // CHECK: [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
 // CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
+// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
 // CHECK: ret void
 void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
@@ -18524,11 +18530,11 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
 // CHECK: ret void
 void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
   vst4q_f16(a, b);
@@ -18929,11 +18935,11 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
 // CHECK: ret void
 void test_vst4_f16(float16_t * a, float16x4x4_t b) {
   vst4_f16(a, b);
@@ -19208,11 +19214,11 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
 // CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
+// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
 // CHECK: ret void
 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
   vst4q_lane_f16(a, b, 7);
@@ -19514,11 +19520,11 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
 // CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
 // CHECK: [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
 // CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
+// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK: call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
 // CHECK: ret void
 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
diff --git a/test/CodeGen/bitscan-builtins.c b/test/CodeGen/bitscan-builtins.c
index 71e49845f896..5fd3f13fbc73 100644
--- a/test/CodeGen/bitscan-builtins.c
+++ b/test/CodeGen/bitscan-builtins.c
@@ -1,5 +1,8 @@
 // RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+// PR33722
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -D_MSC_VER -emit-llvm -o - %s | FileCheck %s
+
 #include <x86intrin.h>
 
 int test_bit_scan_forward(int a) {
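The extra -D_MSC_VER run line checks that the bit-scan builtins stay usable in Microsoft mode (PR33722). For reference, _bit_scan_forward returns the index of the least significant set bit; a portable model for nonzero inputs (helper name is hypothetical):

static int bsf_model(unsigned x) {
  int i = 0;
  while (!(x & 1u)) { /* assumes x != 0, matching the hardware instruction's precondition */
    x >>= 1;
    ++i;
  }
  return i;
}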
addrspace(5) // CHECK: %[[a_addr:.*]] = addrspacecast{{.*}} %[[alloca]] to i32** // CHECK: store i32* %a, i32** %[[a_addr]] // CHECK: %[[r0:.*]] = load i32*, i32** %[[a_addr]] diff --git a/test/CodeGen/mcount.c b/test/CodeGen/mcount.c index 2839d8ef6af3..2284acac0f8e 100644 --- a/test/CodeGen/mcount.c +++ b/test/CodeGen/mcount.c @@ -8,6 +8,10 @@ // RUN: %clang_cc1 -pg -triple arm-netbsd-eabi -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s // RUN: %clang_cc1 -pg -triple aarch64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s // RUN: %clang_cc1 -pg -triple mips-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s +// RUN: %clang_cc1 -pg -triple mips-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s +// RUN: %clang_cc1 -pg -triple mipsel-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s +// RUN: %clang_cc1 -pg -triple mips64-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s +// RUN: %clang_cc1 -pg -triple mips64el-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s // RUN: %clang_cc1 -pg -triple powerpc-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s // RUN: %clang_cc1 -pg -triple powerpc64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s // RUN: %clang_cc1 -pg -triple powerpc64le-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s diff --git a/test/CodeGen/ms-barriers-intrinsics.c b/test/CodeGen/ms-barriers-intrinsics.c index b0dfc3042a6e..c7da50cd0d83 100644 --- a/test/CodeGen/ms-barriers-intrinsics.c +++ b/test/CodeGen/ms-barriers-intrinsics.c @@ -13,19 +13,19 @@ typedef __SIZE_TYPE__ size_t; void test_ReadWriteBarrier() { _ReadWriteBarrier(); } // CHECK-LABEL: define void @test_ReadWriteBarrier -// CHECK: fence singlethread seq_cst +// CHECK: fence syncscope("singlethread") seq_cst // CHECK: ret void // CHECK: } void test_ReadBarrier() { _ReadBarrier(); } // CHECK-LABEL: define void @test_ReadBarrier -// CHECK: fence singlethread seq_cst +// CHECK: fence syncscope("singlethread") seq_cst // CHECK: ret void // CHECK: } void test_WriteBarrier() { _WriteBarrier(); } // CHECK-LABEL: define void @test_WriteBarrier -// CHECK: fence singlethread seq_cst +// CHECK: fence syncscope("singlethread") seq_cst // CHECK: ret void // CHECK: } diff --git a/test/CodeGen/no-devirt.cpp b/test/CodeGen/no-devirt.cpp index 4333b7cde7c6..544b1394f421 100644 --- a/test/CodeGen/no-devirt.cpp +++ b/test/CodeGen/no-devirt.cpp @@ -21,7 +21,7 @@ template < typename T, int N = 0 > class TmplWithArray { struct Wrapper { TmplWithArray<bool, 10> data; bool indexIt(int a) { - if (a > 6) return data[a] ; // Should not devirtualize + if (a > 6) return data[a] ; // Should devirtualize if (a > 4) return data.func1(a); // Should devirtualize return data.func2(a); // Should devirtualize } @@ -53,7 +53,7 @@ bool stuff(int p) } #endif -// CHECK-NOT: call {{.*}} @_ZN13TmplWithArrayIbLi10EEixEi +// CHECK-DAG: call {{.*}} @_ZN13TmplWithArrayIbLi10EEixEi // CHECK-DAG: call {{.*}} @_ZN13TmplWithArrayIbLi10EE5func1Ei // CHECK-DAG: call {{.*}} @_ZN13TmplWithArrayIbLi10EE5func2Ei diff --git a/test/CodeGen/pgo-sample-thinlto-summary.c b/test/CodeGen/pgo-sample-thinlto-summary.c index 51c8faa6be6d..7045db08f22e 100644 --- a/test/CodeGen/pgo-sample-thinlto-summary.c +++ 
b/test/CodeGen/pgo-sample-thinlto-summary.c @@ -1,9 +1,7 @@ // RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=SAMPLEPGO // RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=THINLTO // RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=SAMPLEPGO -// FIXME: Run the following command once LTOPreLinkDefaultPipeline is -// customized. -// %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=THINLTO +// RUN: %clang_cc1 -O2 -fexperimental-new-pass-manager -fprofile-sample-use=%S/Inputs/pgo-sample-thinlto-summary.prof %s -emit-llvm -flto=thin -o - 2>&1 | FileCheck %s -check-prefix=THINLTO // Checks if hot call is inlined by normal compile, but not inlined by // thinlto compile. diff --git a/test/CodeGenCXX/amdgcn-automatic-variable.cpp b/test/CodeGenCXX/amdgcn-automatic-variable.cpp index 7df27c28e6d2..a82275206c6d 100644 --- a/test/CodeGenCXX/amdgcn-automatic-variable.cpp +++ b/test/CodeGenCXX/amdgcn-automatic-variable.cpp @@ -15,8 +15,8 @@ void func2(void) { // CHECK: %lv1 = alloca i32, align 4, addrspace(5) // CHECK: %lv2 = alloca i32, align 4, addrspace(5) // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) - // CHECK: %lp1 = alloca i32*, align 4, addrspace(5) - // CHECK: %lp2 = alloca i32*, align 4, addrspace(5) + // CHECK: %lp1 = alloca i32*, align 8, addrspace(5) + // CHECK: %lp2 = alloca i32*, align 8, addrspace(5) // CHECK: %lvc = alloca i32, align 4, addrspace(5) // CHECK: %[[r0:.*]] = addrspacecast i32 addrspace(5)* %lv1 to i32* @@ -34,12 +34,12 @@ void func2(void) { la[0] = 3; // CHECK: %[[r3:.*]] = addrspacecast i32* addrspace(5)* %lp1 to i32** - // CHECK: store i32* %[[r0]], i32** %[[r3]], align 4 + // CHECK: store i32* %[[r0]], i32** %[[r3]], align 8 int *lp1 = &lv1; // CHECK: %[[r4:.*]] = addrspacecast i32* addrspace(5)* %lp2 to i32** // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], [100 x i32]* %[[r2]], i32 0, i32 0 - // CHECK: store i32* %[[arraydecay]], i32** %[[r4]], align 4 + // CHECK: store i32* %[[arraydecay]], i32** %[[r4]], align 8 int *lp2 = la; // CHECK: call void @_Z5func1Pi(i32* %[[r0]]) @@ -80,3 +80,5 @@ void func4(int x) { // CHECK: call void @_Z5func1Pi(i32* %[[r0]]) func1(&x); } + +// CHECK-NOT: !opencl.ocl.version diff --git a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp index 7bab11488ad4..67215ef48fb3 100644 --- a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp +++ b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -std=c++11 -triple x86_64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -std=c++11 -triple x86_64-none-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefixes=X86,CHECK %s +// RUN: %clang_cc1 -std=c++11 -triple amdgcn-amd-amdhsa-amdgiz -DNO_TLS -emit-llvm -o - %s | FileCheck -check-prefixes=AMD,CHECK %s namespace std { typedef decltype(sizeof(int)) size_t; @@ -46,54 +47,72 @@ struct wantslist1 { wantslist1(std::initializer_list); ~wantslist1(); }; - -// CHECK: @_ZGR15globalInitList1_ = internal constant [3 x i32] [i32 1, i32 2, i32 3] -// CHECK: @globalInitList1 = 
global %{{[^ ]+}} { i32* getelementptr inbounds ([3 x i32], [3 x i32]* @_ZGR15globalInitList1_, i32 0, i32 0), i{{32|64}} 3 } +// X86: @_ZGR15globalInitList1_ = internal constant [3 x i32] [i32 1, i32 2, i32 3] +// X86: @globalInitList1 = global %{{[^ ]+}} { i32* getelementptr inbounds ([3 x i32], [3 x i32]* @_ZGR15globalInitList1_, i32 0, i32 0), i{{32|64}} 3 } +// AMD: @_ZGR15globalInitList1_ = internal addrspace(1) constant [3 x i32] [i32 1, i32 2, i32 3] +// AMD: @globalInitList1 = addrspace(1) global %{{[^ ]+}} { i32* addrspacecast (i32 addrspace(1)* getelementptr inbounds ([3 x i32], [3 x i32] addrspace(1)* @_ZGR15globalInitList1_, i32 0, i32 0) to i32*), i{{32|64}} 3 } std::initializer_list<int> globalInitList1 = {1, 2, 3}; +#ifndef NO_TLS namespace thread_local_global_array { - // FIXME: We should be able to constant-evaluate this even though the - // initializer is not a constant expression (pointers to thread_local - // objects aren't really a problem). - // - // CHECK: @_ZN25thread_local_global_array1xE = thread_local global - // CHECK: @_ZGRN25thread_local_global_array1xE_ = internal thread_local constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] - std::initializer_list<int> thread_local x = { 1, 2, 3, 4 }; +// FIXME: We should be able to constant-evaluate this even though the +// initializer is not a constant expression (pointers to thread_local +// objects aren't really a problem). +// +// X86: @_ZN25thread_local_global_array1xE = thread_local global +// X86: @_ZGRN25thread_local_global_array1xE_ = internal thread_local constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] +std::initializer_list<int> thread_local x = {1, 2, 3, 4}; } +#endif -// CHECK: @globalInitList2 = global %{{[^ ]+}} zeroinitializer -// CHECK: @_ZGR15globalInitList2_ = internal global [2 x %[[WITHARG:[^ ]*]]] zeroinitializer +// X86: @globalInitList2 = global %{{[^ ]+}} zeroinitializer +// X86: @_ZGR15globalInitList2_ = internal global [2 x %[[WITHARG:[^ ]*]]] zeroinitializer +// AMD: @globalInitList2 = addrspace(1) global %{{[^ ]+}} zeroinitializer +// AMD: @_ZGR15globalInitList2_ = internal addrspace(1) global [2 x %[[WITHARG:[^ ]*]]] zeroinitializer -// CHECK: @_ZN15partly_constant1kE = global i32 0, align 4 -// CHECK: @_ZN15partly_constant2ilE = global {{.*}} null, align 8 -// CHECK: @[[PARTLY_CONSTANT_OUTER:_ZGRN15partly_constant2ilE.*]] = internal global {{.*}} zeroinitializer, align 8 -// CHECK: @[[PARTLY_CONSTANT_INNER:_ZGRN15partly_constant2ilE.*]] = internal global [3 x {{.*}}] zeroinitializer, align 8 -// CHECK: @[[PARTLY_CONSTANT_FIRST:_ZGRN15partly_constant2ilE.*]] = internal constant [3 x i32] [i32 1, i32 2, i32 3], align 4 -// CHECK: @[[PARTLY_CONSTANT_SECOND:_ZGRN15partly_constant2ilE.*]] = internal global [2 x i32] zeroinitializer, align 4 -// CHECK: @[[PARTLY_CONSTANT_THIRD:_ZGRN15partly_constant2ilE.*]] = internal constant [4 x i32] [i32 5, i32 6, i32 7, i32 8], align 4 +// X86: @_ZN15partly_constant1kE = global i32 0, align 4 +// X86: @_ZN15partly_constant2ilE = global {{.*}} null, align 8 +// X86: @[[PARTLY_CONSTANT_OUTER:_ZGRN15partly_constant2ilE.*]] = internal global {{.*}} zeroinitializer, align 8 +// X86: @[[PARTLY_CONSTANT_INNER:_ZGRN15partly_constant2ilE.*]] = internal global [3 x {{.*}}] zeroinitializer, align 8 +// X86: @[[PARTLY_CONSTANT_FIRST:_ZGRN15partly_constant2ilE.*]] = internal constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +// X86: @[[PARTLY_CONSTANT_SECOND:_ZGRN15partly_constant2ilE.*]] = internal global [2 x i32] zeroinitializer, align 4 +// X86: 
@[[PARTLY_CONSTANT_THIRD:_ZGRN15partly_constant2ilE.*]] = internal constant [4 x i32] [i32 5, i32 6, i32 7, i32 8], align 4 +// AMD: @_ZN15partly_constant1kE = addrspace(1) global i32 0, align 4 +// AMD: @_ZN15partly_constant2ilE = addrspace(2) global {{.*}} null, align 8 +// AMD: @[[PARTLY_CONSTANT_OUTER:_ZGRN15partly_constant2ilE.*]] = internal addrspace(2) global {{.*}} zeroinitializer, align 8 +// AMD: @[[PARTLY_CONSTANT_INNER:_ZGRN15partly_constant2ilE.*]] = internal addrspace(2) global [3 x {{.*}}] zeroinitializer, align 8 +// AMD: @[[PARTLY_CONSTANT_FIRST:_ZGRN15partly_constant2ilE.*]] = internal addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +// AMD: @[[PARTLY_CONSTANT_SECOND:_ZGRN15partly_constant2ilE.*]] = internal addrspace(2) global [2 x i32] zeroinitializer, align 4 +// AMD: @[[PARTLY_CONSTANT_THIRD:_ZGRN15partly_constant2ilE.*]] = internal addrspace(2) constant [4 x i32] [i32 5, i32 6, i32 7, i32 8], align 4 -// CHECK: @[[REFTMP1:.*]] = private constant [2 x i32] [i32 42, i32 43], align 4 -// CHECK: @[[REFTMP2:.*]] = private constant [3 x %{{.*}}] [%{{.*}} { i32 1 }, %{{.*}} { i32 2 }, %{{.*}} { i32 3 }], align 4 +// X86: @[[REFTMP1:.*]] = private constant [2 x i32] [i32 42, i32 43], align 4 +// X86: @[[REFTMP2:.*]] = private constant [3 x %{{.*}}] [%{{.*}} { i32 1 }, %{{.*}} { i32 2 }, %{{.*}} { i32 3 }], align 4 +// AMD: @[[REFTMP1:.*]] = private addrspace(2) constant [2 x i32] [i32 42, i32 43], align 4 +// AMD: @[[REFTMP2:.*]] = private addrspace(2) constant [3 x %{{.*}}] [%{{.*}} { i32 1 }, %{{.*}} { i32 2 }, %{{.*}} { i32 3 }], align 4 // CHECK: appending global - // thread_local initializer: -// CHECK-LABEL: define internal void -// CHECK: store i32* getelementptr inbounds ([4 x i32], [4 x i32]* @_ZGRN25thread_local_global_array1xE_, i64 0, i64 0), -// CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* @_ZN25thread_local_global_array1xE, i32 0, i32 0), align 8 -// CHECK: store i64 4, i64* getelementptr inbounds ({{.*}}, {{.*}}* @_ZN25thread_local_global_array1xE, i32 0, i32 1), align 8 +// X86-LABEL: define internal void @__cxx_global_var_init +// X86: store i32* getelementptr inbounds ([4 x i32], [4 x i32]* @_ZGRN25thread_local_global_array1xE_, i64 0, i64 0), +// X86: i32** getelementptr inbounds ({{.*}}, {{.*}}* @_ZN25thread_local_global_array1xE, i32 0, i32 0), align 8 +// X86: store i64 4, i64* getelementptr inbounds ({{.*}}, {{.*}}* @_ZN25thread_local_global_array1xE, i32 0, i32 1), align 8 - -// CHECK-LABEL: define internal void -// CHECK: call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i{{32|64}} 0, i{{32|64}} 0 -// CHECK: call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i{{32|64}} 0, i{{32|64}} 1 -// CHECK: __cxa_atexit -// CHECK: store %[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i64 0, i64 0), -// CHECK: %[[WITHARG]]** getelementptr inbounds (%{{.*}}, %{{.*}}* @globalInitList2, i32 0, i32 0), align 8 -// CHECK: store i64 2, i64* getelementptr inbounds (%{{.*}}, %{{.*}}* @globalInitList2, i32 0, i32 1), align 8 -// CHECK: call void @_ZN10destroyme1D1Ev +// CHECK-LABEL: define internal void @__cxx_global_var_init +// X86: call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i{{32|64}} 0, i{{32|64}} 0 +// X86: 
call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i{{32|64}} 0, i{{32|64}} 1 +// AMD: call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* addrspacecast ({{[^@]+}} @_ZGR15globalInitList2_ {{[^)]+}}), i{{32|64}} 0, i{{32|64}} 0 +// AMD: call void @_ZN8witharg1C1ERK10destroyme1(%[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* addrspacecast ({{[^@]+}} @_ZGR15globalInitList2_ {{[^)]+}}), i{{32|64}} 0, i{{32|64}} 1 +// CHECK: call i32 @__cxa_atexit +// X86: store %[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* @_ZGR15globalInitList2_, i64 0, i64 0), +// X86: %[[WITHARG]]** getelementptr inbounds (%{{.*}}, %{{.*}}* @globalInitList2, i32 0, i32 0), align 8 +// X86: store i64 2, i64* getelementptr inbounds (%{{.*}}, %{{.*}}* @globalInitList2, i32 0, i32 1), align 8 +// AMD: store %[[WITHARG]]* getelementptr inbounds ([2 x %[[WITHARG]]], [2 x %[[WITHARG]]]* addrspacecast ({{[^@]+}} @_ZGR15globalInitList2_ {{[^)]+}}), i64 0, i64 0), +// AMD: %[[WITHARG]]** getelementptr inbounds (%{{.*}}, %{{.*}}* addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), i32 0, i32 0), align 8 +// AMD: store i64 2, i64* getelementptr inbounds (%{{.*}}, %{{.*}}* addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), i32 0, i32 1), align 8 // CHECK: call void @_ZN10destroyme1D1Ev +// CHECK-NEXT: call void @_ZN10destroyme1D1Ev +// CHECK-NEXT: ret void std::initializer_list<witharg1> globalInitList2 = { witharg1(destroyme1()), witharg1(destroyme1()) }; @@ -101,7 +120,9 @@ std::initializer_list<witharg1> globalInitList2 = { void fn1(int i) { // CHECK-LABEL: define void @_Z3fn1i // temporary array - // CHECK: [[array:%[^ ]+]] = alloca [3 x i32] + // X86: [[array:%[^ ]+]] = alloca [3 x i32] + // AMD: [[alloca:%[^ ]+]] = alloca [3 x i32], align 4, addrspace(5) + // AMD: [[array:%[^ ]+]] = addrspacecast [3 x i32] addrspace(5)* [[alloca]] to [3 x i32]* // CHECK: getelementptr inbounds [3 x i32], [3 x i32]* [[array]], i{{32|64}} 0 // CHECK-NEXT: store i32 1, i32* // CHECK-NEXT: getelementptr @@ -175,7 +196,6 @@ void fn6() { destroyme2 dm2; // CHECK: call void @_ZN10destroyme2D1Ev } - void fn7() { // CHECK-LABEL: define void @_Z3fn7v // temps should be destroyed before dm2 @@ -366,37 +386,36 @@ namespace partly_constant { std::initializer_list<std::initializer_list<int>> &&il = { { 1, 2, 3 }, { 4, k }, { 5, 6, 7, 8 } }; // First init list. // CHECK-NOT: @[[PARTLY_CONSTANT_FIRST]], - // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_FIRST]], i64 0, i64 0), - // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 0, i32 0) - // CHECK: store i64 3, i64* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 0, i32 1) + // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_FIRST]]{{.*}}, i64 0, i64 0), + // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 0, i32 0) + // CHECK: store i64 3, i64* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 0, i32 1) // CHECK-NOT: @[[PARTLY_CONSTANT_FIRST]], // // Second init list array (non-constant). 
- // CHECK: store i32 4, i32* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_SECOND]], i64 0, i64 0) - // CHECK: load i32, i32* @_ZN15partly_constant1kE - // CHECK: store i32 {{.*}}, i32* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_SECOND]], i64 0, i64 1) + // CHECK: store i32 4, i32* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_SECOND]]{{.*}}, i64 0, i64 0) + // CHECK: load i32, i32* {{.*}}@_ZN15partly_constant1kE + // CHECK: store i32 {{.*}}, i32* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_SECOND]]{{.*}}, i64 0, i64 1) // // Second init list. - // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_SECOND]], i64 0, i64 0), - // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 1, i32 0) - // CHECK: store i64 2, i64* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 1, i32 1) + // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_SECOND]]{{.*}}, i64 0, i64 0), + // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 1, i32 0) + // CHECK: store i64 2, i64* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 1, i32 1) // // Third init list. // CHECK-NOT: @[[PARTLY_CONSTANT_THIRD]], - // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_THIRD]], i64 0, i64 0), - // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 2, i32 0) - // CHECK: store i64 4, i64* getelementptr inbounds ({{.*}}, {{.*}}* @_ZGRN15partly_constant2ilE4_, i64 0, i64 2, i32 1) + // CHECK: store i32* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_THIRD]]{{.*}}, i64 0, i64 0), + // CHECK: i32** getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 2, i32 0) + // CHECK: store i64 4, i64* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@_ZGRN15partly_constant2ilE4_{{.*}}, i64 0, i64 2, i32 1) // CHECK-NOT: @[[PARTLY_CONSTANT_THIRD]], // // Outer init list. - // CHECK: store {{.*}}* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_INNER]], i64 0, i64 0), - // CHECK: {{.*}}** getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_OUTER]], i32 0, i32 0) - // CHECK: store i64 3, i64* getelementptr inbounds ({{.*}}, {{.*}}* @[[PARTLY_CONSTANT_OUTER]], i32 0, i32 1) + // CHECK: store {{.*}}* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_INNER]]{{.*}}, i64 0, i64 0), + // CHECK: {{.*}}** getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_OUTER]]{{.*}}, i32 0, i32 0) + // CHECK: store i64 3, i64* getelementptr inbounds ({{.*}}, {{.*}}* {{.*}}@[[PARTLY_CONSTANT_OUTER]]{{.*}}, i32 0, i32 1) // // 'il' reference. 
- // CHECK: store {{.*}}* @[[PARTLY_CONSTANT_OUTER]], {{.*}}** @_ZN15partly_constant2ilE, align 8 + // CHECK: store {{.*}}* {{.*}}@[[PARTLY_CONSTANT_OUTER]]{{.*}}, {{.*}}** {{.*}}@_ZN15partly_constant2ilE{{.*}}, align 8 } - namespace nested { struct A { A(); ~A(); }; struct B { const A &a; ~B(); }; @@ -463,7 +482,7 @@ namespace PR20445 { template<int> void f() { new MyClass({42, 43}); } template void f<0>(); // CHECK-LABEL: define {{.*}} @_ZN7PR204451fILi0EEEvv( - // CHECK: store i32* getelementptr inbounds ([2 x i32], [2 x i32]* @[[REFTMP1]], i64 0, i64 0) + // CHECK: store i32* getelementptr inbounds ([2 x i32], [2 x i32]* {{.*}}@[[REFTMP1]]{{.*}}, i64 0, i64 0) // CHECK: call void @_ZN7PR204456vectorC1ESt16initializer_listIiE( // CHECK: call void @_ZN7PR204457MyClassC1ERKNS_6vectorE( } @@ -476,9 +495,9 @@ namespace ConstExpr { }; void f(std::initializer_list<C>); void g() { -// CHECK-LABEL: _ZN9ConstExpr1gEv -// CHECK: store %"class.ConstExpr::C"* getelementptr inbounds ([3 x %"class.ConstExpr::C"], [3 x %"class.ConstExpr::C"]* @[[REFTMP2]], i64 0, i64 0) -// CHECK: call void @_ZN9ConstExpr1fESt16initializer_listINS_1CEE + // CHECK-LABEL: _ZN9ConstExpr1gEv + // CHECK: store %"class.ConstExpr::C"* getelementptr inbounds ([3 x %"class.ConstExpr::C"], [3 x %"class.ConstExpr::C"]* {{.*}}@[[REFTMP2]]{{.*}}, i64 0, i64 0) + // CHECK: call void @_ZN9ConstExpr1fESt16initializer_listINS_1CEE f({C(1), C(2), C(3)}); } } @@ -498,11 +517,13 @@ namespace B19773010 { void f1() { // CHECK-LABEL: @_ZN9B197730102f1Ev testcase a{{"", ENUM_CONSTANT}}; - // CHECK: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* bitcast ([1 x { i8*, i32 }]* @.ref.tmp{{.*}} to [1 x %"struct.B19773010::pair"]*), i64 0, i64 0), %"struct.B19773010::pair"** %{{.*}}, align 8 + // X86: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* bitcast ([1 x { i8*, i32 }]* @.ref.tmp{{.*}} to [1 x %"struct.B19773010::pair"]*), i64 0, i64 0), %"struct.B19773010::pair"** %{{.*}}, align 8 + // AMD: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* addrspacecast{{.*}} bitcast ([1 x { i8*, i32 }] addrspace(2)* @.ref.tmp{{.*}} to [1 x %"struct.B19773010::pair"] addrspace(2)*){{.*}}, i64 0, i64 0), %"struct.B19773010::pair"** %{{.*}}, align 8 } void f2() { // CHECK-LABEL: @_ZN9B197730102f2Ev - // CHECK: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* bitcast ([1 x { i8*, i32 }]* @_ZGRZN9B197730102f2EvE1p_ to [1 x %"struct.B19773010::pair"]*), i64 0, i64 0), %"struct.B19773010::pair"** getelementptr inbounds ([2 x %"class.std::initializer_list.10"], [2 x %"class.std::initializer_list.10"]* @_ZZN9B197730102f2EvE1p, i64 0, i64 1, i32 0), align 16 + // X86: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* bitcast ([1 x { i8*, i32 }]* @_ZGRZN9B197730102f2EvE1p_ to [1 x %"struct.B19773010::pair"]*), i64 0, i64 0), %"struct.B19773010::pair"** getelementptr inbounds ([2 x %"class.std::initializer_list.10"], [2 x %"class.std::initializer_list.10"]* @_ZZN9B197730102f2EvE1p, i64 0, i64 1, i32 0), align 16 + // AMD: store %"struct.B19773010::pair"* getelementptr inbounds ([1 x %"struct.B19773010::pair"], [1 x %"struct.B19773010::pair"]* addrspacecast{{.*}} bitcast ([1 x { i8*, i32 }] addrspace(1)* 
@_ZGRZN9B197730102f2EvE1p_ to [1 x %"struct.B19773010::pair"] addrspace(1)*){{.*}}, i64 0, i64 0), %"struct.B19773010::pair"** getelementptr inbounds ([2 x %"class.std::initializer_list.10"], [2 x %"class.std::initializer_list.10"]* addrspacecast{{.*}}@_ZZN9B197730102f2EvE1p{{.*}}, i64 0, i64 1, i32 0), align 8 static std::initializer_list> a, p[2] = {a, {{"", ENUM_CONSTANT}}}; } diff --git a/test/CodeGenCXX/devirtualize-virtual-function-calls-final.cpp b/test/CodeGenCXX/devirtualize-virtual-function-calls-final.cpp index b90620ab600f..2ab2f759cfe3 100644 --- a/test/CodeGenCXX/devirtualize-virtual-function-calls-final.cpp +++ b/test/CodeGenCXX/devirtualize-virtual-function-calls-final.cpp @@ -241,3 +241,53 @@ namespace Test10 { return static_cast(b)->f(); } } + +namespace Test11 { + // Check that the definitions of Derived's operators are emitted. + + // CHECK-LABEL: define linkonce_odr void @_ZN6Test111SIiE4foo1Ev( + // CHECK: call void @_ZN6Test111SIiE7DerivedclEv( + // CHECK: call zeroext i1 @_ZN6Test111SIiE7DerivedeqERKNS_4BaseE( + // CHECK: call zeroext i1 @_ZN6Test111SIiE7DerivedntEv( + // CHECK: call dereferenceable(4) %"class.Test11::Base"* @_ZN6Test111SIiE7DerivedixEi( + // CHECK: define linkonce_odr void @_ZN6Test111SIiE7DerivedclEv( + // CHECK: define linkonce_odr zeroext i1 @_ZN6Test111SIiE7DerivedeqERKNS_4BaseE( + // CHECK: define linkonce_odr zeroext i1 @_ZN6Test111SIiE7DerivedntEv( + // CHECK: define linkonce_odr dereferenceable(4) %"class.Test11::Base"* @_ZN6Test111SIiE7DerivedixEi( + class Base { + public: + virtual void operator()() {} + virtual bool operator==(const Base &other) { return false; } + virtual bool operator!() { return false; } + virtual Base &operator[](int i) { return *this; } + }; + + template <typename T> + struct S { + class Derived final : public Base { + public: + void operator()() override {} + bool operator==(const Base &other) override { return true; } + bool operator!() override { return true; } + Base &operator[](int i) override { return *this; } + }; + + Derived *ptr = nullptr, *ptr2 = nullptr; + + void foo1() { + if (ptr && ptr2) { + // These calls get devirtualized. Linkage fails if the definitions of + // the called functions are not emitted. + (*ptr)(); + (void)(*ptr == *ptr2); + (void)(!(*ptr)); + (void)((*ptr)[1]); + } + } + }; + + void foo2() { + S<int> *s = new S<int>; + s->foo1(); + } +} diff --git a/test/CodeGenCXX/dllimport-memptr-global.cpp b/test/CodeGenCXX/dllimport-memptr-global.cpp new file mode 100644 index 000000000000..e64537b8b9f9 --- /dev/null +++ b/test/CodeGenCXX/dllimport-memptr-global.cpp @@ -0,0 +1,58 @@ +// Also check that -Wglobal-constructors does the right thing. Strictly +// speaking, this is a Sema test, but this avoids test case duplication. 
+// RUN: %clang_cc1 -Wglobal-constructors %s -verify -triple i686-windows-msvc -fms-extensions -std=c++11 +// +// RUN: %clang_cc1 %s -emit-llvm -o - -triple i686-windows-msvc -fms-extensions -std=c++11 | FileCheck %s + +struct __declspec(dllimport) Single { + void nonvirt(); + virtual void virt(); +}; + +struct A { int a; }; +struct B { int b; }; +struct __declspec(dllimport) Multi : A, B { + void nonvirt(); + virtual void virt(); +}; + +struct __declspec(dllimport) Virtual : virtual A { + void nonvirt(); + virtual void virt(); +}; + +struct General; +static_assert(sizeof(void (General::*)()) == 16, "force general memptr model"); +struct __declspec(dllimport) General { + void nonvirt(); + virtual void virt(); +}; + +auto mp_single_nv = &Single::nonvirt; // expected-warning {{global constructor}} +auto mp_multi_nv = &Multi::nonvirt; // expected-warning {{global constructor}} +auto mp_virtual_nv = &Virtual::nonvirt; // expected-warning {{global constructor}} +auto mp_general_nv = &General::nonvirt; // expected-warning {{global constructor}} + +auto mp_single_v = &Single::virt; +auto mp_multi_v = &Multi::virt; +auto mp_virtual_v = &Virtual::virt; +auto mp_general_v = &General::virt; + +// All of the non-virtual globals need dynamic initializers. + +// CHECK: @"\01?mp_single_nv@@3P8Single@@AEXXZQ1@" = global i8* null, align 4 +// CHECK: @"\01?mp_multi_nv@@3P8Multi@@AEXXZQ1@" = global { i8*, i32 } zeroinitializer, align 4 +// CHECK: @"\01?mp_virtual_nv@@3P8Virtual@@AEXXZQ1@" = global { i8*, i32, i32 } zeroinitializer, align 4 +// CHECK: @"\01?mp_general_nv@@3P8General@@AEXXZQ1@" = global { i8*, i32, i32, i32 } zeroinitializer, align 4 + +// CHECK: @"\01?mp_single_v@@3P8Single@@AEXXZQ1@" = global i8* bitcast (void (%struct.Single*, ...)* @"\01??_9Single@@$BA@AE" to i8*), align 4 +// CHECK: @"\01?mp_multi_v@@3P8Multi@@AEXXZQ1@" = global { i8*, i32 } { i8* bitcast (void (%struct.Multi*, ...)* @"\01??_9Multi@@$BA@AE" to i8*), i32 0 }, align 4 +// CHECK: @"\01?mp_virtual_v@@3P8Virtual@@AEXXZQ1@" = global { i8*, i32, i32 } { i8* bitcast (void (%struct.Virtual*, ...)* @"\01??_9Virtual@@$BA@AE" to i8*), i32 0, i32 0 }, align 4 +// CHECK: @"\01?mp_general_v@@3P8General@@AEXXZQ1@" = global { i8*, i32, i32, i32 } { i8* bitcast (void (%struct.General*, ...)* @"\01??_9General@@$BA@AE" to i8*), i32 0, i32 0, i32 0 }, align 4 + +// CHECK: define internal void @_GLOBAL__sub_I{{.*}}() {{.*}} { +// CHECK: call void @"\01??__Emp_single_nv@@YAXXZ"() +// CHECK: call void @"\01??__Emp_multi_nv@@YAXXZ"() +// CHECK: call void @"\01??__Emp_virtual_nv@@YAXXZ"() +// CHECK: call void @"\01??__Emp_general_nv@@YAXXZ"() +// CHECK: } diff --git a/test/CodeGenCXX/vtable-available-externally.cpp b/test/CodeGenCXX/vtable-available-externally.cpp index db99f73d9e72..2e2cdbbfeff5 100644 --- a/test/CodeGenCXX/vtable-available-externally.cpp +++ b/test/CodeGenCXX/vtable-available-externally.cpp @@ -275,9 +275,8 @@ struct C { virtual D& operator=(const D&); }; -// Cannot emit D's vtable available_externally, because we cannot create -// a reference to the inline virtual D::operator= function. -// CHECK-TEST11: @_ZTVN6Test111DE = external unnamed_addr constant +// Can emit D's vtable available_externally. 
+// CHECK-TEST11: @_ZTVN6Test111DE = available_externally unnamed_addr constant struct D : C { virtual void key(); }; diff --git a/test/CodeGenCXX/windows-itanium-type-info.cpp b/test/CodeGenCXX/windows-itanium-type-info.cpp index ad89318f599e..285b59815da2 100644 --- a/test/CodeGenCXX/windows-itanium-type-info.cpp +++ b/test/CodeGenCXX/windows-itanium-type-info.cpp @@ -32,9 +32,15 @@ void f() { // CHECK-DAG: @_ZTV7derived = dllexport unnamed_addr constant // CHECK-DAG: @_ZTI4base = external dllimport constant -// CHECK-DAG: @_ZTS4base = external dllimport constant -// CHECK-NOT: @_ZTV4base = external dllimport constant // CHECK-EH-IMPORT: @_ZTS4base = linkonce_odr constant // CHECK-EH-IMPORT: @_ZTI4base = linkonce_odr constant +struct __declspec(dllimport) gatekeeper {}; +struct zuul : gatekeeper { + virtual ~zuul(); +}; +zuul::~zuul() {} + +// CHECK-DAG: @_ZTI10gatekeeper = linkonce_odr constant +// CHECK-DAG: @_ZTS10gatekeeper = linkonce_odr constant diff --git a/test/CodeGenOpenCL/address-spaces.cl b/test/CodeGenOpenCL/address-spaces.cl index 7c665286547a..488b8f9d480e 100644 --- a/test/CodeGenOpenCL/address-spaces.cl +++ b/test/CodeGenOpenCL/address-spaces.cl @@ -4,6 +4,8 @@ // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa-opencl -DCL20 -cl-std=CL2.0 -emit-llvm -o - | FileCheck %s --check-prefixes=CL20,CL20SPIR // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa-amdgizcl -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,GIZ // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa-amdgizcl -DCL20 -cl-std=CL2.0 -emit-llvm -o - | FileCheck %s --check-prefixes=CL20,CL20GIZ +// RUN: %clang_cc1 %s -O0 -triple amdgcn-mesa-mesa3d -emit-llvm -o - | FileCheck --check-prefixes=CHECK,SPIR %s +// RUN: %clang_cc1 %s -O0 -triple r600-- -emit-llvm -o - | FileCheck --check-prefixes=CHECK,SPIR %s // SPIR: i32* %arg // GIZ: i32 addrspace(5)* %arg diff --git a/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/test/CodeGenOpenCL/amdgcn-automatic-variable.cl index 29309567ec71..19287c7d8998 100644 --- a/test/CodeGenOpenCL/amdgcn-automatic-variable.cl +++ b/test/CodeGenOpenCL/amdgcn-automatic-variable.cl @@ -22,8 +22,8 @@ void func2(void) { // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) // CL12: %lp1 = alloca i32 addrspace(5)*, align 4, addrspace(5) // CL12: %lp2 = alloca i32 addrspace(5)*, align 4, addrspace(5) - // CL20: %lp1 = alloca i32*, align 4, addrspace(5) - // CL20: %lp2 = alloca i32*, align 4, addrspace(5) + // CL20: %lp1 = alloca i32*, align 8, addrspace(5) + // CL20: %lp2 = alloca i32*, align 8, addrspace(5) // CHECK: %lvc = alloca i32, align 4, addrspace(5) // CHECK: store i32 1, i32 addrspace(5)* %lv1 @@ -39,13 +39,13 @@ void func2(void) { // CL12: store i32 addrspace(5)* %lv1, i32 addrspace(5)* addrspace(5)* %lp1, align 4 // CL20: %[[r0:.*]] = addrspacecast i32 addrspace(5)* %lv1 to i32* - // CL20: store i32* %[[r0]], i32* addrspace(5)* %lp1, align 4 + // CL20: store i32* %[[r0]], i32* addrspace(5)* %lp1, align 8 int *lp1 = &lv1; // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %la, i32 0, i32 0 // CL12: store i32 addrspace(5)* %[[arraydecay]], i32 addrspace(5)* addrspace(5)* %lp2, align 4 // CL20: %[[r1:.*]] = addrspacecast i32 addrspace(5)* %[[arraydecay]] to i32* - // CL20: store i32* %[[r1]], i32* addrspace(5)* %lp2, align 4 + // CL20: store i32* %[[r1]], i32* addrspace(5)* %lp2, align 8 int *lp2 = la; // CL12: call void @func1(i32 addrspace(5)* %lv1) diff --git a/test/CodeGenOpenCL/amdgpu-nullptr.cl 
b/test/CodeGenOpenCL/amdgpu-nullptr.cl index 402be5760cf7..69f54fcaa483 100644 --- a/test/CodeGenOpenCL/amdgpu-nullptr.cl +++ b/test/CodeGenOpenCL/amdgpu-nullptr.cl @@ -27,13 +27,13 @@ private char *private_p = 0; // CHECK: @local_p = local_unnamed_addr addrspace(1) global i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), align 4 local char *local_p = 0; -// CHECK: @global_p = local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 4 +// CHECK: @global_p = local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 8 global char *global_p = 0; -// CHECK: @constant_p = local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 4 +// CHECK: @constant_p = local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 8 constant char *constant_p = 0; -// CHECK: @generic_p = local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 4 +// CHECK: @generic_p = local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 8 generic char *generic_p = 0; // Test NULL as initializer. @@ -44,19 +44,19 @@ private char *private_p_NULL = NULL; // CHECK: @local_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), align 4 local char *local_p_NULL = NULL; -// CHECK: @global_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 4 +// CHECK: @global_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 8 global char *global_p_NULL = NULL; -// CHECK: @constant_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 4 +// CHECK: @constant_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 8 constant char *constant_p_NULL = NULL; -// CHECK: @generic_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 4 +// CHECK: @generic_p_NULL = local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 8 generic char *generic_p_NULL = NULL; // Test constant folding of null pointer. // A null pointer should be folded to a null pointer in the target address space. 
-// CHECK: @fold_generic = local_unnamed_addr addrspace(1) global i32 addrspace(4)* null, align 4 +// CHECK: @fold_generic = local_unnamed_addr addrspace(1) global i32 addrspace(4)* null, align 8 generic int *fold_generic = (global int*)(generic float*)(private char*)0; // CHECK: @fold_priv = local_unnamed_addr addrspace(1) global i16* null, align 4 @@ -104,8 +104,8 @@ int fold_int5_local = (int) &((local StructTy1*)0)->p2; // NOOPT: @test_static_var_private.sp3 = internal addrspace(1) global i8* null, align 4 // NOOPT: @test_static_var_private.sp4 = internal addrspace(1) global i8* null, align 4 // NOOPT: @test_static_var_private.sp5 = internal addrspace(1) global i8* null, align 4 -// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 4 -// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 4 +// NOOPT: @test_static_var_private.SS1 = internal addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 8 +// NOOPT: @test_static_var_private.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 void test_static_var_private(void) { static private char *sp1 = 0; @@ -123,8 +123,8 @@ void test_static_var_private(void) { // NOOPT: @test_static_var_local.sp3 = internal addrspace(1) global i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), align 4 // NOOPT: @test_static_var_local.sp4 = internal addrspace(1) global i8 addrspace(3)* null, align 4 // NOOPT: @test_static_var_local.sp5 = internal addrspace(1) global i8 addrspace(3)* null, align 4 -// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 4 -// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 4 +// NOOPT: @test_static_var_local.SS1 = internal addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 8 +// NOOPT: @test_static_var_local.SS2 = internal addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 void test_static_var_local(void) { static local char *sp1 = 0; static local char *sp2 = NULL; @@ -143,9 +143,9 @@ void test_static_var_local(void) { // NOOPT: store i8* null, i8** %sp3, align 4 // NOOPT: store i8* null, i8** %sp4, align 4 // NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8* -// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i32 4, i1 false) +// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false) // NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8* -// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 4, i1 false) +// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, 
i64 24, i32 8, i1 false) void test_func_scope_var_private(void) { private char *sp1 = 0; private char *sp2 = NULL; @@ -163,9 +163,9 @@ void test_func_scope_var_private(void) { // NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp3, align 4 // NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp4, align 4 // NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8* -// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i32 4, i1 false) +// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false) // NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8* -// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 4, i1 false) +// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 8, i1 false) void test_func_scope_var_local(void) { local char *sp1 = 0; local char *sp2 = NULL; @@ -189,28 +189,28 @@ private char *p1; // CHECK: @p2 = weak local_unnamed_addr addrspace(1) global i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), align 4 local char *p2; -// CHECK: @p3 = common local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 4 +// CHECK: @p3 = common local_unnamed_addr addrspace(1) global i8 addrspace(2)* null, align 8 constant char *p3; -// CHECK: @p4 = common local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 4 +// CHECK: @p4 = common local_unnamed_addr addrspace(1) global i8 addrspace(1)* null, align 8 global char *p4; -// CHECK: @p5 = common local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 4 +// CHECK: @p5 = common local_unnamed_addr addrspace(1) global i8 addrspace(4)* null, align 8 generic char *p5; // Test default initialization of structure. -// CHECK: @S1 = weak local_unnamed_addr addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 4 +// CHECK: @S1 = weak local_unnamed_addr addrspace(1) global %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, align 8 StructTy1 S1; -// CHECK: @S2 = common local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 4 +// CHECK: @S2 = common local_unnamed_addr addrspace(1) global %struct.StructTy2 zeroinitializer, align 8 StructTy2 S2; // Test default initialization of array. 
-// CHECK: @A1 = weak local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }], align 4 +// CHECK: @A1 = weak local_unnamed_addr addrspace(1) global [2 x %struct.StructTy1] [%struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }, %struct.StructTy1 { i8* null, i8 addrspace(3)* addrspacecast (i8 addrspace(4)* null to i8 addrspace(3)*), i8 addrspace(2)* null, i8 addrspace(1)* null, i8 addrspace(4)* null }], align 8 StructTy1 A1[2]; -// CHECK: @A2 = common local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 4 +// CHECK: @A2 = common local_unnamed_addr addrspace(1) global [2 x %struct.StructTy2] zeroinitializer, align 8 StructTy2 A2[2]; // Test comparison with 0. @@ -597,7 +597,7 @@ int test_and_ptr(private char* p1, local char* p2) { // Test folding of null pointer in function scope. // NOOPT-LABEL: test_fold_private // NOOPT: call void @test_fold_callee -// NOOPT: store i32 addrspace(1)* null, i32 addrspace(1)** %glob, align 4 +// NOOPT: store i32 addrspace(1)* null, i32 addrspace(1)** %glob, align 8 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 // NOOPT: call void @test_fold_callee // NOOPT: %{{.*}} = add nsw i64 %{{.*}}, 0 @@ -612,7 +612,7 @@ void test_fold_private(void) { // NOOPT-LABEL: test_fold_local // NOOPT: call void @test_fold_callee -// NOOPT: store i32 addrspace(1)* null, i32 addrspace(1)** %glob, align 4 +// NOOPT: store i32 addrspace(1)* null, i32 addrspace(1)** %glob, align 8 // NOOPT: %{{.*}} = sub i64 %{{.*}}, 0 // NOOPT: call void @test_fold_callee // NOOPT: %{{.*}} = add nsw i64 %{{.*}}, sext (i32 ptrtoint (i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) to i32) to i64) diff --git a/test/CodeGenOpenCL/amdgpu-sizeof-alignof.cl b/test/CodeGenOpenCL/amdgpu-sizeof-alignof.cl new file mode 100644 index 000000000000..a5d438933fa4 --- /dev/null +++ b/test/CodeGenOpenCL/amdgpu-sizeof-alignof.cl @@ -0,0 +1,70 @@ +// RUN: %clang_cc1 -triple r600 -cl-std=CL1.2 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-mesa-mesa3d -cl-std=CL1.2 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn---opencl -cl-std=CL1.2 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn---opencl -cl-std=CL2.0 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn---amdgizcl -cl-std=CL1.2 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn---amdgizcl -cl-std=CL2.0 %s -emit-llvm -o - | FileCheck %s + +#ifdef __AMDGCN__ +#define PTSIZE 8 +#else +#define PTSIZE 4 +#endif + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +typedef __SIZE_TYPE__ size_t; +typedef __PTRDIFF_TYPE__ ptrdiff_t; +typedef __INTPTR_TYPE__ intptr_t; +typedef __UINTPTR_TYPE__ uintptr_t; +typedef global void *global_ptr_t; +typedef constant void *constant_ptr_t; +typedef local void *local_ptr_t; +typedef private void *private_ptr_t; + +void check(bool); + +void test() { + // CHECK-NOT: call void @check(i1 zeroext false) + 
check(sizeof(size_t) == PTSIZE); + check(__alignof__(size_t) == PTSIZE); + check(sizeof(intptr_t) == PTSIZE); + check(__alignof__(intptr_t) == PTSIZE); + check(sizeof(uintptr_t) == PTSIZE); + check(__alignof__(uintptr_t) == PTSIZE); + check(sizeof(ptrdiff_t) == PTSIZE); + check(__alignof__(ptrdiff_t) == PTSIZE); + + check(sizeof(char) == 1); + check(__alignof__(char) == 1); + check(sizeof(short) == 2); + check(__alignof__(short) == 2); + check(sizeof(int) == 4); + check(__alignof__(int) == 4); + check(sizeof(long) == 8); + check(__alignof__(long) == 8); +#ifdef cl_khr_fp16 + check(sizeof(half) == 2); + check(__alignof__(half) == 2); +#endif + check(sizeof(float) == 4); + check(__alignof__(float) == 4); +#ifdef cl_khr_fp64 + check(sizeof(double) == 8); + check(__alignof__(double) == 8); +#endif + + check(sizeof(void*) == (__OPENCL_C_VERSION__ >= 200 ? 8 : 4)); + check(__alignof__(void*) == (__OPENCL_C_VERSION__ >= 200 ? 8 : 4)); + check(sizeof(global_ptr_t) == PTSIZE); + check(__alignof__(global_ptr_t) == PTSIZE); + check(sizeof(constant_ptr_t) == PTSIZE); + check(__alignof__(constant_ptr_t) == PTSIZE); + check(sizeof(local_ptr_t) == 4); + check(__alignof__(private_ptr_t) == 4); +} diff --git a/test/Driver/autocomplete.c b/test/Driver/autocomplete.c index f0bdcce5707c..e9e2e992bae9 100644 --- a/test/Driver/autocomplete.c +++ b/test/Driver/autocomplete.c @@ -36,3 +36,7 @@ // MTHREADMODELALL: posix single // RUN: %clang --autocomplete=-mrelocation-model, | FileCheck %s -check-prefix=MRELOCMODELALL // MRELOCMODELALL: dynamic-no-pic pic ropi ropi-rwpi rwpi static +// RUN: %clang --autocomplete=-mrelocation-mode | FileCheck %s -check-prefix=MRELOCMODEL_CLANG +// MRELOCMODEL_CLANG-NOT: -mrelocation-model +// RUN: %clang --autocomplete=#-mrelocation-mode | FileCheck %s -check-prefix=MRELOCMODEL_CC1 +// MRELOCMODEL_CC1: -mrelocation-model diff --git a/test/Driver/clang_f_opts.c b/test/Driver/clang_f_opts.c index e4b72d69ca3f..c17cec6eba9b 100644 --- a/test/Driver/clang_f_opts.c +++ b/test/Driver/clang_f_opts.c @@ -356,6 +356,8 @@ // RUN: -ftree-vrp \ // RUN: -fno-devirtualize \ // RUN: -fno-devirtualize-speculatively \ +// RUN: -fslp-vectorize-aggressive \ +// RUN: -fno-slp-vectorize-aggressive \ // RUN: %s 2>&1 | FileCheck --check-prefix=CHECK-WARNING %s // CHECK-WARNING-DAG: optimization flag '-finline-limit=1000' is not supported // CHECK-WARNING-DAG: optimization flag '-finline-limit' is not supported @@ -422,6 +424,8 @@ // CHECK-WARNING-DAG: optimization flag '-ftree-vrp' is not supported // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize' is not supported // CHECK-WARNING-DAG: optimization flag '-fno-devirtualize-speculatively' is not supported +// CHECK-WARNING-DAG: the flag '-fslp-vectorize-aggressive' has been deprecated and will be ignored +// CHECK-WARNING-DAG: the flag '-fno-slp-vectorize-aggressive' has been deprecated and will be ignored // Test that we mute the warning on these // RUN: %clang -### -finline-limit=1000 -Wno-invalid-command-line-argument \ diff --git a/test/Driver/crash report spaces.c b/test/Driver/crash-report-spaces.c similarity index 84% rename from test/Driver/crash report spaces.c rename to test/Driver/crash-report-spaces.c index 9bc4626b48b8..3e95a0de2516 100644 --- a/test/Driver/crash report spaces.c +++ b/test/Driver/crash-report-spaces.c @@ -1,6 +1,7 @@ // RUN: rm -rf "%t" // RUN: mkdir "%t" -// RUN: not env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 %clang -fsyntax-only "%s" 2>&1 | FileCheck "%s" +// RUN: cp "%s" "%t/crash report spaces.c" +// 
RUN: not env TMPDIR="%t" TEMP="%t" TMP="%t" RC_DEBUG_OPTIONS=1 %clang -fsyntax-only "%t/crash report spaces.c" 2>&1 | FileCheck "%s" // RUN: cat "%t/crash report spaces"-*.c | FileCheck --check-prefix=CHECKSRC "%s" // RUN: cat "%t/crash report spaces"-*.sh | FileCheck --check-prefix=CHECKSH "%s" // REQUIRES: crash-recovery diff --git a/test/Driver/darwin-sdk-vs-os-version.c b/test/Driver/darwin-sdk-vs-os-version.c new file mode 100644 index 000000000000..391f4d5a7305 --- /dev/null +++ b/test/Driver/darwin-sdk-vs-os-version.c @@ -0,0 +1,10 @@ +// REQUIRES: system-darwin + +// Ensure that we never pick a version that's based on the SDK that's newer than +// the system version: +// RUN: rm -rf %t/SDKs/MacOSX10.99.99.sdk +// RUN: mkdir -p %t/SDKs/MacOSX10.99.99.sdk +// RUN: %clang -target x86_64-apple-darwin -isysroot %t/SDKs/MacOSX10.99.99.sdk %s -### 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MACOSX-SYSTEM-VERSION %s + +// CHECK-MACOSX-SYSTEM-VERSION-NOT: 10.99.99" diff --git a/test/Driver/fuchsia.c b/test/Driver/fuchsia.c index 58c2bbe533c9..64d31cc0d38c 100644 --- a/test/Driver/fuchsia.c +++ b/test/Driver/fuchsia.c @@ -12,7 +12,7 @@ // CHECK: Scrt1.o // CHECK-NOT: crti.o // CHECK-NOT: crtbegin.o -// CHECK: "-L[[SYSROOT]]/lib" +// CHECK: "-L[[SYSROOT]]{{/|\\\\}}lib" // CHECK: "{{.*[/\\]}}libclang_rt.builtins-x86_64.a" // CHECK: "-lc" // CHECK-NOT: crtend.o diff --git a/test/Driver/fuchsia.cpp b/test/Driver/fuchsia.cpp index 4490f94d0715..4c3f7d29acd8 100644 --- a/test/Driver/fuchsia.cpp +++ b/test/Driver/fuchsia.cpp @@ -1,9 +1,10 @@ // RUN: %clangxx %s -### -no-canonical-prefixes --target=x86_64-unknown-fuchsia \ // RUN: --sysroot=%S/platform 2>&1 -fuse-ld=ld | FileCheck %s // CHECK: {{.*}}clang{{.*}}" "-cc1" +// CHECK: "-triple" "x86_64-fuchsia" // CHECK: "-fuse-init-array" // CHECK: "-isysroot" "[[SYSROOT:[^"]+]]" -// CHECK: "-internal-isystem" "[[SYSROOT]]{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" +// CHECK: "-internal-isystem" "{{.*[/\\]}}x86_64-fuchsia{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1" // CHECK: "-internal-externc-isystem" "[[SYSROOT]]{{/|\\\\}}include" // CHECK: {{.*}}lld{{.*}}" "-flavor" "gnu" // CHECK: "--sysroot=[[SYSROOT]]" @@ -13,7 +14,7 @@ // CHECK: Scrt1.o // CHECK-NOT: crti.o // CHECK-NOT: crtbegin.o -// CHECK: "-L[[SYSROOT]]/lib" +// CHECK: "-L[[SYSROOT]]{{/|\\\\}}lib" // CHECK: "-lc++" "-lc++abi" "-lunwind" "-lm" // CHECK: "{{.*[/\\]}}libclang_rt.builtins-x86_64.a" // CHECK: "-lc" diff --git a/test/Driver/mips-features.c b/test/Driver/mips-features.c index 69fc20e1f245..b228a2d5781d 100644 --- a/test/Driver/mips-features.c +++ b/test/Driver/mips-features.c @@ -70,6 +70,18 @@ // RUN: | FileCheck --check-prefix=CHECK-NOMMSA %s // CHECK-NOMMSA: "-target-feature" "-msa" // +// -mmt +// RUN: %clang -target mips-linux-gnu -### -c %s \ +// RUN: -mno-mt -mmt 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MMT %s +// CHECK-MMT: "-target-feature" "+mt" +// +// -mno-mt +// RUN: %clang -target mips-linux-gnu -### -c %s \ +// RUN: -mmt -mno-mt 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOMMT %s +// CHECK-NOMMT: "-target-feature" "-mt" +// // -modd-spreg // RUN: %clang -target mips-linux-gnu -### -c %s -mno-odd-spreg -modd-spreg 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-MODDSPREG %s @@ -247,3 +259,14 @@ // RUN: | FileCheck --check-prefix=CHECK-IMG-SINGLEFLOAT-FPXX %s // CHECK-IMG-SINGLEFLOAT-FPXX: "-target-feature" "+single-float" // CHECK-IMG-SINGLEFLOAT-FPXX: "-target-feature" "+fpxx" + +// -mlong-calls +// RUN: %clang -target mips-img-linux-gnu -### -c %s 
-mlong-calls 2>&1 \ +// RUN: | FileCheck --check-prefix=LONG-CALLS-ON %s +// RUN: %clang -target mips-img-linux-gnu -### -c %s -mno-long-calls 2>&1 \ +// RUN: | FileCheck --check-prefix=LONG-CALLS-OFF %s +// RUN: %clang -target mips-img-linux-gnu -### -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=LONG-CALLS-DEF %s +// LONG-CALLS-ON: "-target-feature" "+long-calls" +// LONG-CALLS-OFF: "-target-feature" "-long-calls" +// LONG-CALLS-DEF-NOT: "long-calls" diff --git a/test/FixIt/fixit-add-synthesize-to-property.m b/test/FixIt/fixit-add-synthesize-to-property.m new file mode 100644 index 000000000000..19b2f4b73f01 --- /dev/null +++ b/test/FixIt/fixit-add-synthesize-to-property.m @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s + +@protocol P1 + +@property int prop; + +@end + +@interface I <P1> + +@end + +@implementation I +@end // CHECK: fix-it:{{.*}}:{[[@LINE]]:1-[[@LINE]]:1}:"@synthesize prop;\n\n" diff --git a/test/Import/direct/Inputs/S.c b/test/Import/direct/Inputs/S.c new file mode 100644 index 000000000000..b0876d27df44 --- /dev/null +++ b/test/Import/direct/Inputs/S.c @@ -0,0 +1,3 @@ +struct S { + int a; +}; diff --git a/test/Import/direct/test.c b/test/Import/direct/test.c new file mode 100644 index 000000000000..5fac24c015dc --- /dev/null +++ b/test/Import/direct/test.c @@ -0,0 +1,5 @@ +// RUN: clang-import-test -direct -import %S/Inputs/S.c -expression %s +void expr() { + struct S MyS; + MyS.a = 3; +} diff --git a/test/Import/enum/Inputs/S.cpp b/test/Import/enum/Inputs/S.cpp new file mode 100644 index 000000000000..8506c68512db --- /dev/null +++ b/test/Import/enum/Inputs/S.cpp @@ -0,0 +1,4 @@ +enum E { + a = 1, + b = 2 +}; diff --git a/test/Import/enum/test.cpp b/test/Import/enum/test.cpp new file mode 100644 index 000000000000..aa68063cb207 --- /dev/null +++ b/test/Import/enum/test.cpp @@ -0,0 +1,4 @@ +// RUN: clang-import-test -import %S/Inputs/S.cpp -expression %s +void expr() { + static_assert(E::a + E::b == 3); +} diff --git a/test/Import/import-overrides/Inputs/Hierarchy.cpp b/test/Import/import-overrides/Inputs/Hierarchy.cpp new file mode 100644 index 000000000000..3f91be765d18 --- /dev/null +++ b/test/Import/import-overrides/Inputs/Hierarchy.cpp @@ -0,0 +1,9 @@ +class Base { +public: + virtual void foo() {} +}; + +class Derived : public Base { +public: + void foo() override {} +}; diff --git a/test/Import/import-overrides/test.cpp b/test/Import/import-overrides/test.cpp new file mode 100644 index 000000000000..ded29cded2ee --- /dev/null +++ b/test/Import/import-overrides/test.cpp @@ -0,0 +1,7 @@ +// RUN: clang-import-test -dump-ast -import %S/Inputs/Hierarchy.cpp -expression %s | FileCheck %s + +// CHECK: Overrides:{{.*}}Base::foo + +void foo() { + Derived d; +} diff --git a/test/Index/Core/index-source-invalid-name.cpp b/test/Index/Core/index-source-invalid-name.cpp new file mode 100644 index 000000000000..1b4b059cd1b3 --- /dev/null +++ b/test/Index/Core/index-source-invalid-name.cpp @@ -0,0 +1,13 @@ +// RUN: c-index-test core -print-source-symbols -- %s -std=c++1z -target x86_64-apple-macosx10.7 | FileCheck %s + +namespace rdar32474406 { +// CHECK: [[@LINE+1]]:6 | function/C | foo | c:@N@rdar32474406@F@foo# | __ZN12rdar324744063fooEv | Decl,RelChild | rel: 1 +void foo(); +// CHECK: [[@LINE+1]]:16 | type-alias/C | Func_t | c:index-source-invalid-name.cpp@N@rdar32474406@T@Func_t | <no-cgname> | Def,RelChild | rel: 1 +typedef void (*Func_t)(); +// CHECK: [[@LINE+4]]:1 | type-alias/C | Func_t | c:index-source-invalid-name.cpp@N@rdar32474406@T@Func_t 
| <no-cgname> | Ref,RelCont | rel: 1 +// CHECK-NEXT: RelCont | rdar32474406 | c:@N@rdar32474406 +// CHECK: [[@LINE+2]]:14 | function/C | foo | c:@N@rdar32474406@F@foo# | __ZN12rdar324744063fooEv | Ref,RelCont | rel: 1 +// CHECK-NEXT: RelCont | rdar32474406 | c:@N@rdar32474406 +Func_t[] = { foo }; // invalid decomposition +} diff --git a/test/Index/Core/index-source.cpp b/test/Index/Core/index-source.cpp index 6d20fdd48e50..4864d6cf0150 100644 --- a/test/Index/Core/index-source.cpp +++ b/test/Index/Core/index-source.cpp @@ -496,3 +496,19 @@ void localStructuredBindingAndRef() { } } + +namespace rd33122110 { + +struct Outer { + template <typename T> + struct Nested { }; +}; + +} + +template<> +struct rd33122110::Outer::Nested<int>; +// CHECK: [[@LINE-1]]:8 | namespace/C++ | rd33122110 | c:@N@rd33122110 | <no-cgname> | Ref,RelCont | rel: 1 +// CHECK-NEXT: RelCont | Nested | c:@N@rd33122110@S@Outer@S@Nested>#I +// CHECK: [[@LINE-3]]:20 | struct/C++ | Outer | c:@N@rd33122110@S@Outer | <no-cgname> | Ref,RelCont | rel: 1 +// CHECK-NEXT: RelCont | Nested | c:@N@rd33122110@S@Outer@S@Nested>#I diff --git a/test/Index/Core/index-source.m b/test/Index/Core/index-source.m index a64c34ad2ac1..c911973a70d6 100644 --- a/test/Index/Core/index-source.m +++ b/test/Index/Core/index-source.m @@ -413,3 +413,28 @@ void classReceivers() { (void)ClassReceivers.implicit; // CHECK: [[@LINE-1]]:9 | class/ObjC | ClassReceivers | c:objc(cs)ClassReceivers | _OBJC_CLASS_$_ClassReceivers | Ref,RelCont | rel: 1 } + +@interface ImplicitProperties + +- (int)implicit; +- (void)setImplicit:(int)x; + ++ (int)classImplicit; ++ (void)setClassImplicit:(int)y; + +@end + +void testImplicitProperties(ImplicitProperties *c) { + c.implicit = 0; +// CHECK: [[@LINE-1]]:5 | instance-method/ObjC | setImplicit: | c:objc(cs)ImplicitProperties(im)setImplicit: | -[ImplicitProperties setImplicit:] | Ref,Call,Dyn,RelRec,RelCall,RelCont | rel: 2 +// CHECK-NEXT: RelCall,RelCont | testImplicitProperties | c:@F@testImplicitProperties + c.implicit; +// CHECK: [[@LINE-1]]:5 | instance-method/ObjC | implicit | c:objc(cs)ImplicitProperties(im)implicit | -[ImplicitProperties implicit] | Ref,Call,Dyn,RelRec,RelCall,RelCont | rel: 2 +// CHECK-NEXT: RelCall,RelCont | testImplicitProperties | c:@F@testImplicitProperties + ImplicitProperties.classImplicit = 1; +// CHECK: [[@LINE-1]]:22 | class-method/ObjC | setClassImplicit: | c:objc(cs)ImplicitProperties(cm)setClassImplicit: | +[ImplicitProperties setClassImplicit:] | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK-NEXT: RelCall,RelCont | testImplicitProperties | c:@F@testImplicitProperties + ImplicitProperties.classImplicit; +// CHECK: [[@LINE-1]]:22 | class-method/ObjC | classImplicit | c:objc(cs)ImplicitProperties(cm)classImplicit | +[ImplicitProperties classImplicit] | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK-NEXT: RelCall,RelCont | testImplicitProperties | c:@F@testImplicitProperties +} diff --git a/test/Index/Inputs/empty.dia b/test/Index/Inputs/empty.dia new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Index/pipe-size.cl b/test/Index/pipe-size.cl index d07d7067da8c..cfe97935ee63 100644 --- a/test/Index/pipe-size.cl +++ b/test/Index/pipe-size.cl @@ -11,6 +11,6 @@ __kernel void testPipe( pipe int test ) // SPIR: store i32 4, i32* %s, align 4 // SPIR64: store %opencl.pipe_t addrspace(1)* %test, %opencl.pipe_t addrspace(1)** %test.addr, align 8 // SPIR64: store i32 8, i32* %s, align 4 - // AMD: store %opencl.pipe_t addrspace(1)* %test, %opencl.pipe_t addrspace(1)* addrspace(5)* %test.addr, align 4 + // AMD: store %opencl.pipe_t 
   // AMD: store i32 8, i32 addrspace(5)* %s, align 4
 }
diff --git a/test/Index/print-type-declaration.cpp b/test/Index/print-type-declaration.cpp
index 31c0a73fcd06..a0953d1e5638 100644
--- a/test/Index/print-type-declaration.cpp
+++ b/test/Index/print-type-declaration.cpp
@@ -7,6 +7,13 @@ int main()
   auto b = a;
 }
+enum RegularEnum {};
+
+enum class ScopedEnum {};
+
 // RUN: c-index-test -test-print-type-declaration -std=c++11 %s | FileCheck %s
 // CHECK: VarDecl=a:6:8 (Definition) [typedeclaration=Test] [typekind=Record]
 // CHECK: VarDecl=b:7:8 (Definition) [typedeclaration=Test] [typekind=Record]
+// CHECK: EnumDecl=RegularEnum:10:6 (Definition) [typedeclaration=RegularEnum] [typekind=Enum]
+// CHECK: EnumDecl=ScopedEnum:12:12 (Definition) (scoped) [typedeclaration=ScopedEnum] [typekind=Enum]
+
diff --git a/test/Index/read-empty-diags.test b/test/Index/read-empty-diags.test
new file mode 100644
index 000000000000..cef751c920ab
--- /dev/null
+++ b/test/Index/read-empty-diags.test
@@ -0,0 +1,2 @@
+// RUN: not c-index-test -read-diagnostics %S/Inputs/empty.dia 2>&1 | FileCheck %s
+// CHECK: Trouble deserializing file (Invalid File): Invalid diagnostics signature
diff --git a/test/Index/usrs.m b/test/Index/usrs.m
index 92c3a3fafee2..d0a860e1afad 100644
--- a/test/Index/usrs.m
+++ b/test/Index/usrs.m
@@ -119,7 +119,7 @@ int test_multi_declaration(void) {
 // CHECK: usrs.m c:@SA@MyStruct Extent=[15:9 - 18:2]
 // CHECK: usrs.m c:@SA@MyStruct@FI@wa Extent=[16:3 - 16:9]
 // CHECK: usrs.m c:@SA@MyStruct@FI@moo Extent=[17:3 - 17:10]
-// CHECK: usrs.m c:usrs.m@T@MyStruct Extent=[15:1 - 18:11]
+// CHECK: usrs.m c:@T@MyStruct Extent=[15:1 - 18:11]
 // CHECK: usrs.m c:@E@Pizza Extent=[20:1 - 23:2]
 // CHECK: usrs.m c:@E@Pizza@CHEESE Extent=[21:3 - 21:9]
 // CHECK: usrs.m c:@E@Pizza@MUSHROOMS Extent=[22:3 - 22:12]
diff --git a/test/Misc/find-diagnostic-id.c b/test/Misc/find-diagnostic-id.c
new file mode 100644
index 000000000000..bef66178f96f
--- /dev/null
+++ b/test/Misc/find-diagnostic-id.c
@@ -0,0 +1,5 @@
+// RUN: diagtool find-diagnostic-id warn_unused_variable | FileCheck %s
+// RUN: not diagtool find-diagnostic-id warn_unused_vars 2>&1 | FileCheck --check-prefix=ERROR %s
+
+// CHECK: {{^[0-9]+$}}
+// ERROR: error: invalid diagnostic 'warn_unused_vars'
diff --git a/test/Modules/missing-flag.cpp b/test/Modules/missing-flag.cpp
new file mode 100644
index 000000000000..560183be59e8
--- /dev/null
+++ b/test/Modules/missing-flag.cpp
@@ -0,0 +1,4 @@
+// RUN: not %clang_cc1 -x c++-module-map %s -emit-module -fmodule-name=Foo -o %t 2>&1 | FileCheck %s
+// CHECK: module compilation requires '-fmodules'
+module Foo {}
+#pragma clang module contents
diff --git a/test/Modules/odr_hash.cpp b/test/Modules/odr_hash.cpp
index 6e01e989a374..ee45ae529961 100644
--- a/test/Modules/odr_hash.cpp
+++ b/test/Modules/odr_hash.cpp
@@ -1339,6 +1339,84 @@ Bravo golf;
 #endif
 }
+namespace Friend {
+#if defined(FIRST)
+struct T1 {};
+struct S1 {
+  friend class T1;
+};
+#elif defined(SECOND)
+struct T1 {};
+struct S1 {
+  friend T1;
+};
+#else
+S1 s1;
+// expected-error@second.h:* {{'Friend::S1' has different definitions in different modules; first difference is definition in module 'SecondModule' found friend 'Friend::T1'}}
+// expected-note@first.h:* {{but in 'FirstModule' found friend 'class T1'}}
+#endif
+
+#if defined(FIRST)
+struct T2 {};
+struct S2 {
+  friend class T2;
+};
+#elif defined(SECOND)
+struct T2 {};
+struct S2 {
+  friend struct T2;
+};
+#else
+S2 s2;
+// expected-error@second.h:* {{'Friend::S2' has different definitions in different modules; first difference is definition in module 'SecondModule' found friend 'struct T2'}}
+// expected-note@first.h:* {{but in 'FirstModule' found friend 'class T2'}}
+#endif
+
+#if defined(FIRST)
+struct T3 {};
+struct S3 {
+  friend const T3;
+};
+#elif defined(SECOND)
+struct T3 {};
+struct S3 {
+  friend T3;
+};
+#else
+S3 s3;
+// expected-error@second.h:* {{'Friend::S3' has different definitions in different modules; first difference is definition in module 'SecondModule' found friend 'Friend::T3'}}
+// expected-note@first.h:* {{but in 'FirstModule' found friend 'const Friend::T3'}}
+#endif
+
+#if defined(FIRST)
+struct T4 {};
+struct S4 {
+  friend T4;
+};
+#elif defined(SECOND)
+struct S4 {
+  friend void T4();
+};
+#else
+S4 s4;
+// expected-error@second.h:* {{'Friend::S4' has different definitions in different modules; first difference is definition in module 'SecondModule' found friend function}}
+// expected-note@first.h:* {{but in 'FirstModule' found friend class}}
+#endif
+
+#if defined(FIRST)
+struct S5 {
+  friend void T5a();
+};
+#elif defined(SECOND)
+struct S5 {
+  friend void T5b();
+};
+#else
+S5 s5;
+// expected-error@second.h:* {{'Friend::S5' has different definitions in different modules; first difference is definition in module 'SecondModule' found friend function 'T5b'}}
+// expected-note@first.h:* {{but in 'FirstModule' found friend function 'T5a'}}
+#endif
+}
 
 // Interesting cases that should not cause errors.  struct S should not error
 // while struct T should error at the access specifier mismatch at the end.
 namespace AllDecls {
diff --git a/test/Modules/preprocess-build.cpp b/test/Modules/preprocess-build.cpp
index bf9f16162be2..36a0740a0a76 100644
--- a/test/Modules/preprocess-build.cpp
+++ b/test/Modules/preprocess-build.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z %s -verify
+// RUN: %clang_cc1 -std=c++1z -fmodules %s -verify
 
 #pragma clang module build baz
 module baz {}
diff --git a/test/Modules/relative-dep-gen.cpp b/test/Modules/relative-dep-gen.cpp
index e58dd6eb3596..bfa9471859b8 100644
--- a/test/Modules/relative-dep-gen.cpp
+++ b/test/Modules/relative-dep-gen.cpp
@@ -2,17 +2,17 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t
 //
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-name=relative-dep-gen -emit-module -x c++ Inputs/relative-dep-gen.modulemap -dependency-file %t/build.d -MT mod.pcm -o %t/mod.pcm
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen.modulemap -fmodule-file=%t/mod.pcm -dependency-file %t/use-explicit.d -MT use.o relative-dep-gen.cpp -fsyntax-only
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen.modulemap -dependency-file %t/use-implicit.d relative-dep-gen.cpp -MT use.o -fsyntax-only
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-name=relative-dep-gen -emit-module -x c++ Inputs/relative-dep-gen.modulemap -dependency-file %t/build.d -MT mod.pcm -o %t/mod.pcm
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen.modulemap -fmodule-file=%t/mod.pcm -dependency-file %t/use-explicit.d -MT use.o relative-dep-gen.cpp -fsyntax-only
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fmodule-map-file=Inputs/relative-dep-gen.modulemap -dependency-file %t/use-implicit.d relative-dep-gen.cpp -MT use.o -fsyntax-only
 //
 // RUN: FileCheck --check-prefix=CHECK-BUILD %s < %t/build.d
-// RUN: FileCheck --check-prefix=CHECK-USE %s < %t/use-explicit.d
-// RUN: FileCheck --check-prefix=CHECK-USE %s < %t/use-implicit.d
+// RUN: FileCheck --check-prefix=CHECK-USE --check-prefix=CHECK-EXPLICIT %s < %t/use-explicit.d
+// RUN: FileCheck --check-prefix=CHECK-USE --check-prefix=CHECK-IMPLICIT %s < %t/use-implicit.d
 //
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-name=relative-dep-gen -emit-module -x c++ Inputs/relative-dep-gen-cwd.modulemap -dependency-file %t/build-cwd.d -MT mod.pcm -o %t/mod-cwd.pcm -fmodule-map-file-home-is-cwd
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen-cwd.modulemap -fmodule-file=%t/mod-cwd.pcm -dependency-file %t/use-explicit-cwd.d -MT use.o relative-dep-gen.cpp -fsyntax-only -fmodule-map-file-home-is-cwd
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen-cwd.modulemap -dependency-file %t/use-implicit-cwd.d relative-dep-gen.cpp -MT use.o -fsyntax-only -fmodule-map-file-home-is-cwd
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-name=relative-dep-gen -emit-module -x c++ Inputs/relative-dep-gen-cwd.modulemap -dependency-file %t/build-cwd.d -MT mod.pcm -o %t/mod-cwd.pcm -fmodule-map-file-home-is-cwd
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-map-file=Inputs/relative-dep-gen-cwd.modulemap -fmodule-file=%t/mod-cwd.pcm -dependency-file %t/use-explicit-cwd.d -MT use.o relative-dep-gen.cpp -fsyntax-only -fmodule-map-file-home-is-cwd
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fmodule-map-file=Inputs/relative-dep-gen-cwd.modulemap -dependency-file %t/use-implicit-cwd.d relative-dep-gen.cpp -MT use.o -fsyntax-only -fmodule-map-file-home-is-cwd
 //
 // RUN: FileCheck --check-prefix=CHECK-BUILD %s < %t/build-cwd.d
 // RUN: FileCheck --check-prefix=CHECK-USE %s < %t/use-explicit-cwd.d
@@ -23,7 +23,7 @@
 // RUN: cp %S/Inputs/relative-dep-gen-1.h %t/Inputs
 // RUN: cp %s %t
 // RUN: cd %t
-// RUN: %clang_cc1 -cc1 -fno-implicit-modules -fmodule-file=%t/mod-cwd.pcm -dependency-file %t/use-explicit-no-map-cwd.d -MT use.o relative-dep-gen.cpp -fsyntax-only -fmodule-map-file-home-is-cwd
+// RUN: %clang_cc1 -fmodules -fno-implicit-modules -fmodule-file=%t/mod-cwd.pcm -dependency-file %t/use-explicit-no-map-cwd.d -MT use.o relative-dep-gen.cpp -fsyntax-only -fmodule-map-file-home-is-cwd
 // RUN: cat %t/use-explicit-no-map-cwd.d
 // RUN: FileCheck --check-prefix=CHECK-USE %s < %t/use-explicit-no-map-cwd.d
@@ -35,4 +35,5 @@
 // CHECK-BUILD: {{[ \t]}}Inputs/relative-dep-gen-2.h
 // CHECK-USE: use.o:
 // CHECK-USE-DAG: {{[ \t]}}relative-dep-gen.cpp
-// CHECK-USE-DAG: {{[ \t]}}Inputs{{[/\\]}}relative-dep-gen-1.h
+// CHECK-EXPLICIT-DAG: mod.pcm
+// CHECK-IMPLICIT-DAG: {{[ \t]}}Inputs{{[/\\]}}relative-dep-gen-1.h
diff --git a/test/OpenMP/taskloop_ast_print.cpp b/test/OpenMP/taskloop_ast_print.cpp
index 36f90c95dd33..dd0af85916cf 100644
--- a/test/OpenMP/taskloop_ast_print.cpp
+++ b/test/OpenMP/taskloop_ast_print.cpp
@@ -20,7 +20,7 @@ T tmain(T argc) {
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
 // CHECK-NEXT: a = 2;
 #pragma omp parallel
-#pragma omp taskloop private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) shared(g) if (c) final(d) mergeable priority(f) nogroup num_tasks(N) reduction(min:a)
+#pragma omp taskloop private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) shared(g) if (c) final(d) mergeable priority(f) nogroup num_tasks(N)
   for (int i = 0; i < 2; ++i)
     for (int j = 0; j < 2; ++j)
       for (int j = 0; j < 2; ++j)
@@ -33,7 +33,7 @@ T tmain(T argc) {
     for (int j = 0; j < 2; ++j)
       foo();
 // CHECK-NEXT: #pragma omp parallel
-// CHECK-NEXT: #pragma omp taskloop private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) shared(g) if(c) final(d) mergeable priority(f) nogroup num_tasks(N) reduction(min: a)
+// CHECK-NEXT: #pragma omp taskloop private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) shared(g) if(c) final(d) mergeable priority(f) nogroup num_tasks(N)
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
 // CHECK-NEXT: for (int j = 0; j < 2; ++j)
 // CHECK-NEXT: for (int j = 0; j < 2; ++j)
@@ -53,8 +53,8 @@ int main(int argc, char **argv) {
   int b = argc, c, d, e, f, g;
   static int a;
 // CHECK: static int a;
-#pragma omp taskloop if(taskloop: a) default(none) shared(a) final(b) priority(5) nogroup num_tasks(argc) reduction(*: g)
-  // CHECK-NEXT: #pragma omp taskloop if(taskloop: a) default(none) shared(a) final(b) priority(5) nogroup num_tasks(argc) reduction(*: g)
+#pragma omp taskloop if(taskloop: a) default(none) shared(a) final(b) priority(5) num_tasks(argc) reduction(*: g)
+  // CHECK-NEXT: #pragma omp taskloop if(taskloop: a) default(none) shared(a) final(b) priority(5) num_tasks(argc) reduction(*: g)
   for (int i = 0; i < 2; ++i)
     a = 2;
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
diff --git a/test/OpenMP/taskloop_codegen.cpp b/test/OpenMP/taskloop_codegen.cpp
index bc7367ce0216..b1b45455802a 100644
--- a/test/OpenMP/taskloop_codegen.cpp
+++ b/test/OpenMP/taskloop_codegen.cpp
@@ -8,6 +8,7 @@
 // CHECK-LABEL: @main
 int main(int argc, char **argv) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 // CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
@@ -19,6 +20,7 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+// CHECK: call void @__kmpc_end_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 #pragma omp taskloop priority(argc)
   for (int i = 0; i < 10; ++i)
     ;
@@ -33,10 +35,11 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
-// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 1, i64 [[GRAINSIZE]], i8* null)
 #pragma omp taskloop nogroup grainsize(argc)
   for (int i = 0; i < 10; ++i)
     ;
+// CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 // CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
@@ -50,6 +53,7 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+// CHECK: call void @__kmpc_end_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
   int i;
 #pragma omp taskloop if(argc) shared(argc, argv) collapse(2) num_tasks(4)
   for (i = 0; i < argc; ++i)
diff --git a/test/OpenMP/taskloop_reduction_messages.cpp b/test/OpenMP/taskloop_reduction_messages.cpp
new file mode 100644
index 000000000000..7295075c3cb8
--- /dev/null
+++ b/test/OpenMP/taskloop_reduction_messages.cpp
@@ -0,0 +1,331 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+void foobar(int &ref) {
+#pragma omp taskloop reduction(+:ref)
+  for (int i = 0; i < 10; ++i)
+    foo();
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b; // expected-note 3 {{'b' defined here}}
+const S2 ba[5]; // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c; // expected-note 3 {{'c' defined here}}
+const S3 ca[5]; // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a:32;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+struct S7 {
+  int a: 32;
+  S7() {
+#pragma omp taskloop reduction(+:a) // expected-error {{expected addressable reduction item for the task-based directives}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+};
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T(); // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i; // expected-note 4 {{'j' defined here}}
+  S3 &p = k; // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i]; // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i]; // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp taskloop reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp taskloop' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp taskloop reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp taskloop reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp taskloop reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp taskloop reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5; // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // expected-note 2 {{'j' defined here}}
+  S3 &p = k; // expected-note 2 {{'p' defined here}}
+  const int &r = da[i]; // expected-note {{'r' defined here}}
+  int &q = qa[i]; // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp taskloop reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp taskloop' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp taskloop reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp taskloop reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp taskloop reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp taskloop reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp taskloop reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+#pragma omp taskloop nogroup reduction(+ : m) // expected-error {{'reduction' clause cannot be used with 'nogroup' clause}}
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/taskloop_simd_ast_print.cpp b/test/OpenMP/taskloop_simd_ast_print.cpp
index 9bee8f7f93fd..8678b9122e29 100644
--- a/test/OpenMP/taskloop_simd_ast_print.cpp
+++ b/test/OpenMP/taskloop_simd_ast_print.cpp
@@ -21,7 +21,7 @@ T tmain(T argc) {
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
 // CHECK-NEXT: a = 2;
 #pragma omp parallel
-#pragma omp taskloop simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) shared(g) if (c) final(d) mergeable priority(f) simdlen(N) nogroup num_tasks(N) reduction(min:a)
+#pragma omp taskloop simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) shared(g) if (c) final(d) mergeable priority(f) simdlen(N) nogroup num_tasks(N)
   for (int i = 0; i < 2; ++i)
     for (int j = 0; j < 2; ++j)
       for (int j = 0; j < 2; ++j)
@@ -34,7 +34,7 @@ T tmain(T argc) {
     for (int j = 0; j < 2; ++j)
       foo();
 // CHECK-NEXT: #pragma omp parallel
-// CHECK-NEXT: #pragma omp taskloop simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) shared(g) if(c) final(d) mergeable priority(f) simdlen(N) nogroup num_tasks(N) reduction(min: a)
+// CHECK-NEXT: #pragma omp taskloop simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) shared(g) if(c) final(d) mergeable priority(f) simdlen(N) nogroup num_tasks(N)
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
 // CHECK-NEXT: for (int j = 0; j < 2; ++j)
 // CHECK-NEXT: for (int j = 0; j < 2; ++j)
@@ -54,8 +54,8 @@ int main(int argc, char **argv) {
   int b = argc, c, d, e, f, g;
   static int a;
 // CHECK: static int a;
-#pragma omp taskloop simd if(taskloop: a) default(none) shared(a) final(b) priority(5) safelen(8) linear(b), aligned(argv) nogroup num_tasks(argc) reduction(*: g)
-  // CHECK-NEXT: #pragma omp taskloop simd if(taskloop: a) default(none) shared(a) final(b) priority(5) safelen(8) linear(b) aligned(argv) nogroup num_tasks(argc) reduction(*: g)
+#pragma omp taskloop simd if(taskloop: a) default(none) shared(a) final(b) priority(5) safelen(8) linear(b), aligned(argv) num_tasks(argc) reduction(*: g)
+  // CHECK-NEXT: #pragma omp taskloop simd if(taskloop: a) default(none) shared(a) final(b) priority(5) safelen(8) linear(b) aligned(argv) num_tasks(argc) reduction(*: g)
   for (int i = 0; i < 2; ++i)
     a = 2;
 // CHECK-NEXT: for (int i = 0; i < 2; ++i)
diff --git a/test/OpenMP/taskloop_simd_codegen.cpp b/test/OpenMP/taskloop_simd_codegen.cpp
index 60ba5f22129b..48c6f479e9fa 100644
--- a/test/OpenMP/taskloop_simd_codegen.cpp
+++ b/test/OpenMP/taskloop_simd_codegen.cpp
@@ -8,6 +8,7 @@
 // CHECK-LABEL: @main
 int main(int argc, char **argv) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 // CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
@@ -19,6 +20,7 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+// CHECK: call void @__kmpc_end_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 #pragma omp taskloop simd priority(argc)
   for (int i = 0; i < 10; ++i)
     ;
@@ -33,10 +35,11 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
-// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 1, i64 [[GRAINSIZE]], i8* null)
 #pragma omp taskloop simd nogroup grainsize(argc) simdlen(4)
   for (int i = 0; i < 10; ++i)
     ;
+// CHECK: call void @__kmpc_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
 // CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
 // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
@@ -50,6 +53,7 @@ int main(int argc, char **argv) {
 // CHECK: store i64 1, i64* [[ST]],
 // CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
 // CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+// CHECK: call void @__kmpc_end_taskgroup(%ident_t* [[DEFLOC]], i32 [[GTID]])
   int i;
 #pragma omp taskloop simd if(argc) shared(argc, argv) collapse(2) num_tasks(4) safelen(32)
   for (i = 0; i < argc; ++i)
diff --git a/test/OpenMP/taskloop_simd_reduction_messages.cpp b/test/OpenMP/taskloop_simd_reduction_messages.cpp
new file mode 100644
index 000000000000..f9739f708a28
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_reduction_messages.cpp
@@ -0,0 +1,331 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+void foobar(int &ref) {
+#pragma omp taskloop simd reduction(+:ref)
+  for (int i = 0; i < 10; ++i)
+    foo();
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b; // expected-note 3 {{'b' defined here}}
+const S2 ba[5]; // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c; // expected-note 3 {{'c' defined here}}
+const S3 ca[5]; // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+struct S7 {
+  int a: 32;
+  S7() {
+#pragma omp taskloop reduction(+:a) // expected-error {{expected addressable reduction item for the task-based directives}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
+};
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T(); // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i; // expected-note 4 {{'j' defined here}}
+  S3 &p = k; // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i]; // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i]; // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp taskloop simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp taskloop simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp taskloop simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp taskloop simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp taskloop simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5; // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i; // expected-note 2 {{'j' defined here}}
+  S3 &p = k; // expected-note 2 {{'p' defined here}}
+  const int &r = da[i]; // expected-note {{'r' defined here}}
+  int &q = qa[i]; // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp taskloop simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp taskloop simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp taskloop simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp taskloop simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp taskloop simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp taskloop simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp taskloop simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+#pragma omp taskloop simd reduction(+ : m) nogroup // expected-error {{'reduction' clause cannot be used with 'nogroup' clause}}
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/Preprocessor/init.c b/test/Preprocessor/init.c
index 0bd9e9784599..0ee7edecb3c4 100644
--- a/test/Preprocessor/init.c
+++ b/test/Preprocessor/init.c
@@ -2185,13 +2185,13 @@
 // ARMV6-CLOUDABI:#define __CloudABI__ 1
 // ARMV6-CLOUDABI:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-netbsd-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NETBSD %s
-//
+// RUN: %clang -E -dM -ffreestanding -target arm-netbsd-eabi %s -o - | FileCheck -match-full-lines -check-prefix ARM-NETBSD %s
+
 // ARM-NETBSD-NOT:#define _LP64
 // ARM-NETBSD:#define __APCS_32__ 1
 // ARM-NETBSD-NOT:#define __ARMEB__ 1
 // ARM-NETBSD:#define __ARMEL__ 1
-// ARM-NETBSD:#define __ARM_ARCH_4T__ 1
+// ARM-NETBSD:#define __ARM_ARCH_5TE__ 1
 // ARM-NETBSD:#define __ARM_DWARF_EH__ 1
 // ARM-NETBSD:#define __ARM_EABI__ 1
 // ARM-NETBSD-NOT:#define __ARM_BIG_ENDIAN 1
@@ -2333,6 +2333,7 @@
 // ARM-NETBSD:#define __SIZE_MAX__ 4294967295UL
 // ARM-NETBSD:#define __SIZE_TYPE__ long unsigned int
 // ARM-NETBSD:#define __SIZE_WIDTH__ 32
+// ARM-NETBSD:#define __SOFTFP__ 1
 // ARM-NETBSD:#define __UINT16_C_SUFFIX__
 // ARM-NETBSD:#define __UINT16_MAX__ 65535
 // ARM-NETBSD:#define __UINT16_TYPE__ unsigned short
@@ -2377,6 +2378,11 @@
 // ARM-NETBSD:#define __arm 1
 // ARM-NETBSD:#define __arm__ 1
 
+// RUN: %clang -E -dM -ffreestanding -target arm-netbsd-eabihf %s -o - | FileCheck -match-full-lines -check-prefix ARMHF-NETBSD %s
+// ARMHF-NETBSD:#define __SIZE_WIDTH__ 32
+// ARMHF-NETBSD-NOT:#define __SOFTFP__ 1
+// ARMHF-NETBSD:#define __UINT16_C_SUFFIX__
+
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabihf < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s
diff --git a/test/Rewriter/objc-modern-metadata-visibility2.mm b/test/Rewriter/objc-modern-metadata-visibility2.mm
new file mode 100644
index 000000000000..4e64ac4beb46
--- /dev/null
+++ b/test/Rewriter/objc-modern-metadata-visibility2.mm
@@ -0,0 +1,45 @@
+// REQUIRES: abi-breaking-checks
+// NOTE: This test has been split from objc-modern-metadata-visibility.mm in
+// order to test with -reverse-iterate as this flag is only present with
+// ABI_BREAKING_CHECKS.
+
+// RUN: %clang_cc1 -E %s -o %t.mm -mllvm -reverse-iterate
+// RUN: %clang_cc1 -x objective-c++ -fblocks -fms-extensions -rewrite-objc %t.mm -mllvm -reverse-iterate -o - | FileCheck %s
+// rdar://11144048
+
+@class NSString;
+
+@interface NSObject {
+  Class isa;
+}
+@end
+
+@interface Sub : NSObject {
+  int subIvar;
+  NSString *nsstring;
+@private
+  id PrivateIvar;
+}
+@end
+
+@implementation Sub
+- (id) MyNSString { return subIvar ? PrivateIvar : nsstring; }
+@end
+
+@interface NSString @end
+@implementation NSString @end
+
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" __declspec(dllimport) unsigned long OBJC_IVAR_$_Sub$subIvar;
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" unsigned long OBJC_IVAR_$_Sub$PrivateIvar;
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" __declspec(dllimport) unsigned long OBJC_IVAR_$_Sub$nsstring;
+// CHECK: #pragma warning(disable:4273)
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" __declspec(dllexport) unsigned long int OBJC_IVAR_$_Sub$subIvar
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" __declspec(dllexport) unsigned long int OBJC_IVAR_$_Sub$nsstring
+// CHECK: __declspec(allocate(".objc_ivar$B")) extern "C" unsigned long int OBJC_IVAR_$_Sub$PrivateIvar
+// CHECK: extern "C" __declspec(dllimport) struct _class_t OBJC_METACLASS_$_NSObject;
+// CHECK: extern "C" __declspec(dllexport) struct _class_t OBJC_METACLASS_$_Sub
+// CHECK: extern "C" __declspec(dllimport) struct _class_t OBJC_CLASS_$_NSObject;
+// CHECK: extern "C" __declspec(dllexport) struct _class_t OBJC_CLASS_$_Sub
+// CHECK: extern "C" __declspec(dllexport) struct _class_t OBJC_CLASS_$_NSString;
+// CHECK: extern "C" __declspec(dllexport) struct _class_t OBJC_METACLASS_$_NSString
+// CHECK: extern "C" __declspec(dllexport) struct _class_t OBJC_CLASS_$_NSString
diff --git a/test/Sema/address-packed.c b/test/Sema/address-packed.c
index 2799e19c48f1..b0519bacd758 100644
--- a/test/Sema/address-packed.c
+++ b/test/Sema/address-packed.c
@@ -329,3 +329,12 @@ void g13(void) {
   uint32_t *p32;
   p32 = &a[0].x; // no-warning
 }
+
+struct Invalid0 {
+  void *x;
+  struct fwd f; // expected-error {{incomplete type}} expected-note {{forward declaration}}
+} __attribute__((packed));
+
+void *g14(struct Invalid0 *ivl) {
+  return &(ivl->x);
+}
diff --git a/test/Sema/attr-availability.c b/test/Sema/attr-availability.c
index e105037c8ed0..1b8cbd256ce9 100644
--- a/test/Sema/attr-availability.c
+++ b/test/Sema/attr-availability.c
@@ -21,6 +21,9 @@ ATSFontGetPostScriptName(int flags) __attribute__((availability(macosx,introduce
 extern void PartiallyAvailable() __attribute__((availability(macosx,introduced=10.8)));
 
+#ifdef WARN_PARTIAL
+// expected-note@+2 2 {{marked partial here}}
+#endif
 enum __attribute__((availability(macosx,introduced=10.8))) PartialEnum {
   kPartialEnumConstant,
 };
@@ -35,11 +38,19 @@ void test_10095131() {
   PartiallyAvailable();
 }
+#ifdef WARN_PARTIAL
+// FIXME: This note should point to the declaration with the availability
+// attribute.
+// expected-note@+2 {{marked partial here}}
+#endif
 extern void PartiallyAvailable() ;
 void with_redeclaration() {
-  PartiallyAvailable(); // Don't warn.
-
-  // enums should never warn.
+#ifdef WARN_PARTIAL
+  // expected-warning@+4 {{'PartiallyAvailable' is only available on macOS 10.8 or newer}} expected-note@+4 {{__builtin_available}}
+  // expected-warning@+4 {{'PartialEnum' is only available on macOS 10.8 or newer}} expected-note@+4 {{__builtin_available}}
+  // expected-warning@+3 {{'kPartialEnumConstant' is only available on macOS 10.8 or newer}} expected-note@+3 {{__builtin_available}}
+#endif
+  PartiallyAvailable();
   enum PartialEnum p = kPartialEnumConstant;
 }
@@ -86,13 +97,13 @@ enum Original {
   OriginalUnavailable __attribute__((availability(macosx, unavailable))) // expected-note + {{'OriginalUnavailable' has been explicitly marked unavailable here}}
 };
-enum AllDeprecated {
-  AllDeprecatedCase, // expected-note + {{'AllDeprecatedCase' has been explicitly marked deprecated here}}
+enum AllDeprecated { // expected-note + {{'AllDeprecated' has been explicitly marked deprecated here}}
+  AllDeprecatedCase,
   AllDeprecatedUnavailable __attribute__((availability(macosx, unavailable))) // expected-note + {{'AllDeprecatedUnavailable' has been explicitly marked unavailable here}}
 } __attribute__((availability(macosx, deprecated=10.2)));
-enum AllUnavailable {
-  AllUnavailableCase, // expected-note + {{'AllUnavailableCase' has been explicitly marked unavailable here}}
+enum AllUnavailable { // expected-note + {{'AllUnavailable' has been explicitly marked unavailable here}}
+  AllUnavailableCase,
 } __attribute__((availability(macosx, unavailable)));
 enum User {
diff --git a/test/Sema/attr-deprecated.c b/test/Sema/attr-deprecated.c
index 89c06860fb4e..9eeef5aed57d 100644
--- a/test/Sema/attr-deprecated.c
+++ b/test/Sema/attr-deprecated.c
@@ -104,9 +104,9 @@ foo_dep test17,
         test19;
 // rdar://problem/8518751
-enum __attribute__((deprecated)) Test20 { // expected-note {{'Test20' has been explicitly marked deprecated here}}
+enum __attribute__((deprecated)) Test20 { // expected-note 2 {{'Test20' has been explicitly marked deprecated here}}
   test20_a __attribute__((deprecated)), // expected-note {{'test20_a' has been explicitly marked deprecated here}}
-  test20_b // expected-note {{'test20_b' has been explicitly marked deprecated here}}
+  test20_b
 };
 void test20() {
   enum Test20 f; // expected-warning {{'Test20' is deprecated}}
diff --git a/test/Sema/attr-unavailable-message.c b/test/Sema/attr-unavailable-message.c
index 415cb2f079a2..70d5947086f9 100644
--- a/test/Sema/attr-unavailable-message.c
+++ b/test/Sema/attr-unavailable-message.c
@@ -36,13 +36,13 @@ void unavail(void) {
 // rdar://10201690
 enum foo {
-  a = 1, // expected-note {{'a' has been explicitly marked deprecated here}}
+  a = 1,
   b __attribute__((deprecated())) = 2, // expected-note {{'b' has been explicitly marked deprecated here}}
   c = 3
-}__attribute__((deprecated()));
+}__attribute__((deprecated())); // expected-note {{'foo' has been explicitly marked deprecated here}}
 
-enum fee { // expected-note {{'fee' has been explicitly marked unavailable here}}
-  r = 1, // expected-note {{'r' has been explicitly marked unavailable here}}
+enum fee { // expected-note 2 {{'fee' has been explicitly marked unavailable here}}
+  r = 1,
   s = 2,
   t = 3
 }__attribute__((unavailable()));
diff --git a/test/Sema/loop-control.c b/test/Sema/loop-control.c
index 6c33e8437bd0..1fc35d10218e 100644
--- a/test/Sema/loop-control.c
+++ b/test/Sema/loop-control.c
@@ -119,3 +119,51 @@ void pr8880_23(int x, int y) {
     for ( ; ({ ++y; break; y;}); ++y) {} // expected-warning{{'break' is bound to loop, GCC binds it to switch}}
   }
 }
+
+void pr32648_1(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; switch (y) { case 0: break; } y;}); ++y) {} // no warning
+  }
+}
+
+void pr32648_2(int x, int y) {
+  while(x) {
+    for ( ; ({ ++y; switch (y) { case 0: continue; } y;}); ++y) {} // expected-warning {{'continue' is bound to current loop, GCC binds it to the enclosing loop}}
+  }
+}
+
+void pr32648_3(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; for (; y; y++) { break; } y;}); ++y) {} // no warning
+  }
+}
+
+void pr32648_4(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; for (({ break; }); y; y++) { } y;}); ++y) {} // expected-warning{{'break' is bound to loop, GCC binds it to switch}}
+  }
+}
+
+void pr32648_5(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; while (({ break; y; })) {} y;}); ++y) {} // expected-warning{{'break' is bound to current loop, GCC binds it to the enclosing loop}}
+  }
+}
+
+void pr32648_6(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; do {} while (({ break; y; })); y;}); ++y) {} // expected-warning{{'break' is bound to current loop, GCC binds it to the enclosing loop}}
+  }
+}
+
+void pr32648_7(int x, int y) {
+  switch(x) {
+  case 1:
+    for ( ; ({ ++y; do { break; } while (y); y;}); ++y) {} // no warning
+  }
+}
diff --git a/test/Sema/warn-cast-qual.c b/test/Sema/warn-cast-qual.c
index dc11f5717ebe..a682cad75ed6 100644
--- a/test/Sema/warn-cast-qual.c
+++ b/test/Sema/warn-cast-qual.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -Wcast-qual -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -x c++ -fsyntax-only -Wcast-qual -verify %s
 
 #include <stdint.h>
 
@@ -26,4 +27,34 @@ void foo() {
   const char **charptrptrc;
   char **charptrptr = (char **)charptrptrc; // expected-warning {{cast from 'const char *' to 'char *' drops const qualifier}}
+
+  const char *constcharptr;
+  char *charptr = (char *)constcharptr; // expected-warning {{cast from 'const char *' to 'char *' drops const qualifier}}
+  const char *constcharptr2 = (char *)constcharptr; // expected-warning {{cast from 'const char *' to 'char *' drops const qualifier}}
+  const char *charptr2 = (char *)charptr; // no warning
+}
+
+void bar_0() {
+  struct C {
+    const int a;
+    int b;
+  };
+
+  const struct C S = {0, 0};
+
+  *(int *)(&S.a) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}}
+  *(int *)(&S.b) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}}
+}
+
+void bar_1() {
+  struct C {
+    const int a;
+    int b;
+  };
+
+  struct C S = {0, 0};
+  S.b = 0; // no warning
+
+  *(int *)(&S.a) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}}
+  *(int *)(&S.b) = 0; // no warning
 }
diff --git a/test/Sema/warn-documentation.cpp b/test/Sema/warn-documentation.cpp
index 0c92b2aa029f..ccf374ccd060 100644
--- a/test/Sema/warn-documentation.cpp
+++ b/test/Sema/warn-documentation.cpp
@@ -1186,7 +1186,7 @@ class Predicate
 /// @brief A C++ wrapper class for providing threaded access to a value
 /// of type T.
 ///
-/// A template specilization class.
+/// A template specialization class. //---------------------------------------------------------------------- template<> class Predicate { diff --git a/test/SemaCXX/amdgpu-sizeof-alignof.cpp b/test/SemaCXX/amdgpu-sizeof-alignof.cpp new file mode 100644 index 000000000000..97da71a1a3e2 --- /dev/null +++ b/test/SemaCXX/amdgpu-sizeof-alignof.cpp @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -triple amdgcn---amdgiz -std=c++11 -fsyntax-only -verify %s +// expected-no-diagnostics +typedef __SIZE_TYPE__ size_t; +typedef __PTRDIFF_TYPE__ ptrdiff_t; +typedef __INTPTR_TYPE__ intptr_t; +typedef __UINTPTR_TYPE__ uintptr_t; +typedef __attribute__((address_space(1))) void *global_ptr_t; +typedef __attribute__((address_space(2))) void *constant_ptr_t; +typedef __attribute__((address_space(3))) void *local_ptr_t; +typedef __attribute__((address_space(5))) void *private_ptr_t; + +void test() { + static_assert(sizeof(size_t) == 8, "bad size"); + static_assert(alignof(size_t) == 8, "bad alignment"); + static_assert(sizeof(intptr_t) == 8, "bad size"); + static_assert(alignof(intptr_t) == 8, "bad alignment"); + static_assert(sizeof(uintptr_t) == 8, "bad size"); + static_assert(alignof(uintptr_t) == 8, "bad alignment"); + static_assert(sizeof(ptrdiff_t) == 8, "bad size"); + static_assert(alignof(ptrdiff_t) == 8, "bad alignment"); + + static_assert(sizeof(char) == 1, "bad size"); + static_assert(alignof(char) == 1, "bad alignment"); + static_assert(sizeof(short) == 2, "bad size"); + static_assert(alignof(short) == 2, "bad alignment"); + static_assert(sizeof(int) == 4, "bad size"); + static_assert(alignof(int) == 4, "bad alignment"); + static_assert(sizeof(long) == 8, "bad size"); + static_assert(alignof(long) == 8, "bad alignment"); + static_assert(sizeof(long long) == 8, "bad size"); + static_assert(alignof(long long) == 8, "bad alignment"); + static_assert(sizeof(float) == 4, "bad size"); + static_assert(alignof(float) == 4, "bad alignment"); + static_assert(sizeof(double) == 8, "bad size"); + static_assert(alignof(double) == 8, "bad alignment"); + + static_assert(sizeof(void*) == 8, "bad size"); + static_assert(alignof(void*) == 8, "bad alignment"); + static_assert(sizeof(global_ptr_t) == 8, "bad size"); + static_assert(alignof(global_ptr_t) == 8, "bad alignment"); + static_assert(sizeof(constant_ptr_t) == 8, "bad size"); + static_assert(alignof(constant_ptr_t) == 8, "bad alignment"); + static_assert(sizeof(local_ptr_t) == 4, "bad size"); + static_assert(alignof(local_ptr_t) == 4, "bad alignment"); + static_assert(sizeof(private_ptr_t) == 4, "bad size"); + static_assert(alignof(private_ptr_t) == 4, "bad alignment"); +} diff --git a/test/SemaCXX/attr-deprecated.cpp b/test/SemaCXX/attr-deprecated.cpp index 1680c5c6760d..5ba55f0c23b5 100644 --- a/test/SemaCXX/attr-deprecated.cpp +++ b/test/SemaCXX/attr-deprecated.cpp @@ -199,8 +199,8 @@ namespace test5 { // rdar://problem/8518751 namespace test6 { - enum __attribute__((deprecated)) A { // expected-note {{'A' has been explicitly marked deprecated here}} - a0 // expected-note {{'a0' has been explicitly marked deprecated here}} + enum __attribute__((deprecated)) A { // expected-note 2 {{'A' has been explicitly marked deprecated here}} + a0 }; void testA() { A x; // expected-warning {{'A' is deprecated}} @@ -218,8 +218,8 @@ namespace test6 { } template struct C { - enum __attribute__((deprecated)) Enum { // expected-note {{'Enum' has been explicitly marked deprecated here}} - c0 // expected-note {{'c0' has been explicitly marked deprecated here}} + enum 
__attribute__((deprecated)) Enum { // expected-note 2 {{'Enum' has been explicitly marked deprecated here}} + c0 }; }; void testC() { diff --git a/test/SemaCXX/coroutines.cpp b/test/SemaCXX/coroutines.cpp index 4eedc2162f07..6394829f356a 100644 --- a/test/SemaCXX/coroutines.cpp +++ b/test/SemaCXX/coroutines.cpp @@ -22,8 +22,24 @@ void no_coroutine_traits() { namespace std { namespace experimental { -template -struct coroutine_traits; // expected-note {{declared here}} + +template +struct void_t_imp { + using type = void; +}; +template +using void_t = typename void_t_imp::type; + +template +struct traits_sfinae_base {}; + +template +struct traits_sfinae_base> { + using promise_type = typename T::promise_type; +}; + +template +struct coroutine_traits : public traits_sfinae_base {}; }} // namespace std::experimental template struct coro {}; @@ -50,8 +66,9 @@ struct suspend_never { void await_resume() {} }; -void no_specialization() { - co_await a; // expected-error {{implicit instantiation of undefined template 'std::experimental::coroutine_traits'}} +struct DummyVoidTag {}; +DummyVoidTag no_specialization() { // expected-error {{this function cannot be a coroutine: 'std::experimental::coroutine_traits' has no member named 'promise_type'}} + co_await a; } template @@ -905,3 +922,243 @@ void test_dependent_param(T t, U) { co_return 42; } template void test_dependent_param(NoCopy<0>, NoCopy<1>); // expected-note {{requested here}} + +namespace CoroHandleMemberFunctionTest { +struct CoroMemberTag {}; +struct BadCoroMemberTag {}; + +template +constexpr bool IsSameV = false; +template +constexpr bool IsSameV = true; + +template +struct TypeTest { + template + static constexpr bool IsSame = IsSameV; + + template + static constexpr bool MatchesArgs = IsSameV>; +}; + +template +struct AwaitReturnsType { + bool await_ready() const; + void await_suspend(...) 
const; + T await_resume() const; +}; + +template +struct CoroMemberPromise { + using TraitsT = std::experimental::coroutine_traits; + using TypeTestT = TypeTest; + using AwaitTestT = AwaitReturnsType; + + CoroMemberTag get_return_object(); + suspend_always initial_suspend(); + suspend_always final_suspend(); + + AwaitTestT yield_value(int); + + void return_void(); + void unhandled_exception(); +}; + +} // namespace CoroHandleMemberFunctionTest + +template +struct ::std::experimental::coroutine_traits { + using promise_type = CoroHandleMemberFunctionTest::CoroMemberPromise; +}; + +namespace CoroHandleMemberFunctionTest { +struct TestType { + + CoroMemberTag test_qual() { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + static_assert(!TC.MatchesArgs, ""); + static_assert(!TC.MatchesArgs, ""); + } + + CoroMemberTag test_sanity(int *) const { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); // expected-error {{static_assert failed}} + static_assert(TC.MatchesArgs, ""); // expected-error {{static_assert failed}} + static_assert(TC.MatchesArgs, ""); + } + + CoroMemberTag test_qual(int *, const float &&, volatile void *volatile) const { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + + CoroMemberTag test_qual() const volatile { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + + CoroMemberTag test_ref_qual() & { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + CoroMemberTag test_ref_qual() const & { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + CoroMemberTag test_ref_qual() && { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + CoroMemberTag test_ref_qual(const char *&) const volatile && { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + + CoroMemberTag test_args(int) { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + CoroMemberTag test_args(int, long &, void *) const { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + + template + CoroMemberTag test_member_template(Args...) 
const && { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + static CoroMemberTag test_static() { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs<>, ""); + static_assert(!TC.MatchesArgs, ""); + static_assert(!TC.MatchesArgs, ""); + static_assert(!TC.MatchesArgs, ""); + } + + static CoroMemberTag test_static(volatile void *const, char &&) { + auto TC = co_yield 0; + static_assert(TC.MatchesArgs, ""); + } + + template + static CoroMemberTag test_static_template(const char *volatile &, unsigned) { + auto TC = co_yield 0; + using TCT = decltype(TC); + static_assert(TCT::MatchesArgs, ""); + static_assert(!TCT::MatchesArgs, ""); + } + + BadCoroMemberTag test_diagnostics() { + // expected-error@-1 {{this function cannot be a coroutine: 'std::experimental::coroutine_traits' has no member named 'promise_type'}} + co_return; + } + BadCoroMemberTag test_diagnostics(int) const && { + // expected-error@-1 {{this function cannot be a coroutine: 'std::experimental::coroutine_traits' has no member named 'promise_type'}} + co_return; + } + + static BadCoroMemberTag test_static_diagnostics(long *) { + // expected-error@-1 {{this function cannot be a coroutine: 'std::experimental::coroutine_traits' has no member named 'promise_type'}} + co_return; + } +}; + +template CoroMemberTag TestType::test_member_template(long, const char *) const &&; +template CoroMemberTag TestType::test_static_template(const char *volatile &, unsigned); + +template +struct DepTestType { + + CoroMemberTag test_sanity(int *) const { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); // expected-error {{static_assert failed}} + static_assert(TC.template MatchesArgs<>, ""); // expected-error {{static_assert failed}} + static_assert(TC.template MatchesArgs, ""); + } + + CoroMemberTag test_qual() { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + static_assert(!TC.template MatchesArgs, ""); + static_assert(!TC.template MatchesArgs, ""); + } + + CoroMemberTag test_qual(int *, const float &&, volatile void *volatile) const { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + CoroMemberTag test_qual() const volatile { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + CoroMemberTag test_ref_qual() & { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + CoroMemberTag test_ref_qual() const & { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + CoroMemberTag test_ref_qual() && { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + CoroMemberTag test_ref_qual(const char *&) const volatile && { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + CoroMemberTag test_args(int) { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + CoroMemberTag test_args(int, long &, void *) const { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + template + CoroMemberTag test_member_template(UArgs...) 
const && { + auto TC = co_yield 0; + static_assert(TC.template MatchesArgs, ""); + } + + static CoroMemberTag test_static() { + auto TC = co_yield 0; + using TCT = decltype(TC); + static_assert(TCT::MatchesArgs<>, ""); + static_assert(!TCT::MatchesArgs, ""); + static_assert(!TCT::MatchesArgs, ""); + static_assert(!TCT::MatchesArgs, ""); + + // Ensure diagnostics are actually being generated here + static_assert(TCT::MatchesArgs, ""); // expected-error {{static_assert failed}} + } + + static CoroMemberTag test_static(volatile void *const, char &&) { + auto TC = co_yield 0; + using TCT = decltype(TC); + static_assert(TCT::MatchesArgs, ""); + } + + template + static CoroMemberTag test_static_template(const char *volatile &, unsigned) { + auto TC = co_yield 0; + using TCT = decltype(TC); + static_assert(TCT::MatchesArgs, ""); + static_assert(!TCT::MatchesArgs, ""); + } +}; + +template struct DepTestType; // expected-note {{requested here}} +template CoroMemberTag DepTestType::test_member_template(long, const char *) const &&; + +template CoroMemberTag DepTestType::test_static_template(const char *volatile &, unsigned); + +} // namespace CoroHandleMemberFunctionTest diff --git a/test/SemaCXX/dllimport-memptr.cpp b/test/SemaCXX/dllimport-memptr.cpp new file mode 100644 index 000000000000..35bf5b8610b5 --- /dev/null +++ b/test/SemaCXX/dllimport-memptr.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -verify -std=c++11 %s + +// expected-no-diagnostics + +struct __declspec(dllimport) Foo { int get_a(); }; +template struct HasValue { }; +HasValue<&Foo::get_a> hv; diff --git a/test/SemaCXX/modules-ts.cppm b/test/SemaCXX/modules-ts.cppm index 29122ec7dab5..2ea4958bffda 100644 --- a/test/SemaCXX/modules-ts.cppm +++ b/test/SemaCXX/modules-ts.cppm @@ -13,7 +13,13 @@ export module foo; // expected-note@modules-ts.cppm:* {{loaded from}} #endif -static int m; // ok, internal linkage, so no redefinition error +static int m; +#if TEST == 2 // FIXME: 'm' has internal linkage, so there should be no error here +// expected-error@-2 {{redefinition of '}} +// expected-note@-3 {{unguarded header; consider using #ifdef guards or #pragma once}} +// FIXME: We should drop the "header from" in this diagnostic. +// expected-note-re@modules-ts.cppm:1 {{'{{.*}}modules-ts.cppm' included multiple times, additional include site in header from module 'foo'}} +#endif int n; #if TEST >= 2 // expected-error@-2 {{redefinition of '}} diff --git a/test/SemaCXX/warn-cast-qual.cpp b/test/SemaCXX/warn-cast-qual.cpp new file mode 100644 index 000000000000..d1f0cc73a63d --- /dev/null +++ b/test/SemaCXX/warn-cast-qual.cpp @@ -0,0 +1,140 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -Wcast-qual -verify %s + +#include + +// do *NOT* warn on const_cast<>() +// use clang-tidy's cppcoreguidelines-pro-type-const-cast for that. 
+void foo_ptr() {
+  const char *const ptr = 0;
+  char *t0 = const_cast<char *>(ptr); // no warning
+
+  volatile char *ptr2 = 0;
+  char *t1 = const_cast<char *>(ptr2); // no warning
+
+  const volatile char *ptr3 = 0;
+  char *t2 = const_cast<char *>(ptr3); // no warning
+}
+
+void cstr() {
+  void* p0 = (void*)(const void*)"txt"; // expected-warning {{cast from 'const void *' to 'void *' drops const qualifier}}
+  void* p1 = (void*)"txt"; // FIXME
+  char* p2 = (char*)"txt"; // expected-warning {{cast from 'const char *' to 'char *' drops const qualifier}}
+}
+
+void foo_0() {
+  const int a = 0;
+
+  const int &a0 = a; // no warning
+  const int &a1 = (const int &)a; // no warning
+
+  int &a2 = (int &)a; // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  const int &a3 = (int &)a; // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  int &a4 = (int &)((const int &)a); // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  int &a5 = (int &)((int &)a); // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  const int &a6 = (int &)((int &)a); // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  const int &a7 = (int &)((const int &)a); // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+  const int &a8 = (const int &)((int &)a); // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}}
+}
+
+void foo_1() {
+  volatile int a = 0;
+
+  volatile int &a0 = a; // no warning
+  volatile int &a1 = (volatile int &)a; // no warning
+
+  int &a2 = (int &)a; // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  volatile int &a3 = (int &)a; // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  int &a4 = (int &)((volatile int &)a); // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  int &a5 = (int &)((int &)a); // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  volatile int &a6 = (int &)((int &)a); // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  volatile int &a7 = (int &)((volatile int &)a); // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+  volatile int &a8 = (volatile int &)((int &)a); // expected-warning {{cast from 'volatile int' to 'int &' drops volatile qualifier}}
+}
+
+void foo_2() {
+  const volatile int a = 0;
+
+  const volatile int &a0 = a; // no warning
+  const volatile int &a1 = (const volatile int &)a; // no warning
+
+  int &a2 = (int &)a; // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  const volatile int &a3 = (int &)a; // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  int &a4 = (int &)((const volatile int &)a); // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  int &a5 = (int &)((int &)a); // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  const volatile int &a6 = (int &)((int &)a); // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  const volatile int &a7 = (int &)((const volatile int &)a); // expected-warning {{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}}
+  const volatile int &a8 = (const volatile int &)((int &)a); // expected-warning
{{cast from 'const volatile int' to 'int &' drops const and volatile qualifiers}} +} + +void bar_0() { + const int *_a = 0; + const int **a = &_a; + + int **a0 = (int **)((const int **)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + int **a1 = (int **)((int **)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + + // const int **a2 = (int **)((int **)a); + // const int **a3 = (int **)((const int **)a); + + const int **a4 = (const int **)((int **)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} expected-warning {{cast from 'int **' to 'const int **' must have all intermediate pointers const qualified to be safe}} + const int **a5 = (const int **)((const int **)a); // no warning +} + +void bar_1() { + const int *_a = 0; + const int *&a = _a; + + int *&a0 = (int *&)((const int *&)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + int *&a1 = (int *&)((int *&)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + + // const int *&a2 = (int *&)((int *&)a); + // const int *&a3 = (int *&)((const int *&)a); + + const int *&a4 = (const int *&)((int *&)a); // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} expected-warning {{cast from 'int *' to 'const int *&' must have all intermediate pointers const qualified to be safe}} + const int *&a5 = (const int *&)((const int *&)a); // no warning +} + +void baz_0() { + struct C { + void A() {} + void B() const {} + }; + + const C S; + S.B(); + + ((C &)S).B(); // expected-warning {{cast from 'const C' to 'C &' drops const qualifier}} + ((C &)S).A(); // expected-warning {{cast from 'const C' to 'C &' drops const qualifier}} + + ((C *)&S)->B(); // expected-warning {{cast from 'const C *' to 'C *' drops const qualifier}} + ((C *)&S)->A(); // expected-warning {{cast from 'const C *' to 'C *' drops const qualifier}} +} + +void baz_1() { + struct C { + const int a; + int b; + + C() : a(0) {} + }; + + { + C S; + S.b = 0; + + (int &)(S.a) = 0; // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}} + (int &)(S.b) = 0; // no warning + + *(int *)(&S.a) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + *(int *)(&S.b) = 0; // no warning + } + { + const C S; + + (int &)(S.a) = 0; // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}} + (int &)(S.b) = 0; // expected-warning {{cast from 'const int' to 'int &' drops const qualifier}} + + *(int *)(&S.a) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + *(int *)(&S.b) = 0; // expected-warning {{cast from 'const int *' to 'int *' drops const qualifier}} + } +} diff --git a/test/SemaCXX/warn-loop-analysis.cpp b/test/SemaCXX/warn-loop-analysis.cpp index 25ec7a7862ed..2934003848a2 100644 --- a/test/SemaCXX/warn-loop-analysis.cpp +++ b/test/SemaCXX/warn-loop-analysis.cpp @@ -202,6 +202,12 @@ void test7() { if (true) continue; i--; } + + // But do warn if the continue is in a nested loop. + for (;;i--) { // expected-note{{decremented here}} + for (int j = 0; j < 10; ++j) continue; + i--; // expected-warning{{decremented both}} + } } struct iterator { @@ -259,6 +265,12 @@ void test8() { if (true) continue; i--; } + + // But do warn if the continue is in a nested loop. 
+ for (;;i--) { // expected-note{{decremented here}} + for (int j = 0; j < 10; ++j) continue; + i--; // expected-warning{{decremented both}} + } } int f(int); diff --git a/test/SemaCXX/warn-throw-out-noexcept-func.cpp b/test/SemaCXX/warn-throw-out-noexcept-func.cpp index fc2919a1e327..67ffbd93940a 100644 --- a/test/SemaCXX/warn-throw-out-noexcept-func.cpp +++ b/test/SemaCXX/warn-throw-out-noexcept-func.cpp @@ -2,8 +2,8 @@ struct A_ShouldDiag { ~A_ShouldDiag(); // implicitly noexcept(true) }; -A_ShouldDiag::~A_ShouldDiag() { // expected-note {{destructor or deallocator has a (possibly implicit) non-throwing excepton specification}} - throw 1; // expected-warning {{has a non-throwing exception specification but can still throw, resulting in unexpected program termination}} +A_ShouldDiag::~A_ShouldDiag() { // expected-note {{destructor has a implicit non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but can still throw}} } struct B_ShouldDiag { int i; @@ -11,7 +11,7 @@ struct B_ShouldDiag { }; struct R_ShouldDiag : A_ShouldDiag { B_ShouldDiag b; - ~R_ShouldDiag() { // expected-note {{destructor or deallocator has a}} + ~R_ShouldDiag() { // expected-note {{destructor has a implicit non-throwing exception specification}} throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; @@ -30,18 +30,18 @@ struct N_ShouldDiag { ~N_ShouldDiag(); //implicitly noexcept(true) }; -N_ShouldDiag::~N_ShouldDiag() { // expected-note {{destructor or deallocator has a}} +N_ShouldDiag::~N_ShouldDiag() { // expected-note {{destructor has a implicit non-throwing exception specification}} throw 1; // expected-warning {{has a non-throwing exception specification but}} } struct X_ShouldDiag { B_ShouldDiag b; - ~X_ShouldDiag() noexcept { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} + ~X_ShouldDiag() noexcept { // expected-note {{destructor has a non-throwing exception}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; struct Y_ShouldDiag : A_ShouldDiag { - ~Y_ShouldDiag() noexcept(true) { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} + ~Y_ShouldDiag() noexcept(true) { // expected-note {{destructor has a non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; struct C_ShouldNotDiag { @@ -54,7 +54,7 @@ struct D_ShouldNotDiag { throw 1; } }; -struct E_ShouldNotDiag { +struct E_ShouldNotDiag { C_ShouldNotDiag c; ~E_ShouldNotDiag(); //implicitly noexcept(false) }; @@ -68,7 +68,7 @@ class A1_ShouldDiag { T b; public: - ~A1_ShouldDiag() { // expected-note {{destructor or deallocator has a}} + ~A1_ShouldDiag() { // expected-note {{destructor has a implicit non-throwing exception specification}} throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; @@ -81,19 +81,19 @@ template struct R1_ShouldDiag : A1_ShouldDiag //expected-note {{in instantiation of member function}} { B1_ShouldDiag b; - ~R1_ShouldDiag() { // expected-note {{destructor or deallocator has a}} + ~R1_ShouldDiag() { // expected-note {{destructor has a implicit non-throwing exception specification}} throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; template struct S1_ShouldDiag : A1_ShouldDiag { B1_ShouldDiag b; - ~S1_ShouldDiag() 
noexcept { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} + ~S1_ShouldDiag() noexcept { // expected-note {{destructor has a non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; -void operator delete(void *ptr) noexcept { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} +void operator delete(void *ptr) noexcept { // expected-note {{deallocator has a non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } struct except_fun { static const bool i = false; @@ -109,18 +109,18 @@ struct dependent_warn { }; template struct dependent_warn_noexcept { - ~dependent_warn_noexcept() noexcept(T::i) { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} + ~dependent_warn_noexcept() noexcept(T::i) { // expected-note {{destructor has a non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; template struct dependent_warn_both { - ~dependent_warn_both() noexcept(T::i) { // expected-note {{destructor or deallocator has a}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} + ~dependent_warn_both() noexcept(T::i) { // expected-note {{destructor has a non-throwing exception specification}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } }; -void foo() noexcept { //expected-note {{non-throwing function declare here}} - throw 1; // expected-warning {{has a non-throwing exception specification but}} +void foo() noexcept { //expected-note {{function declared non-throwing here}} + throw 1; // expected-warning {{has a non-throwing exception specification but}} } struct Throws { ~Throws() noexcept(false); @@ -128,14 +128,14 @@ struct Throws { struct ShouldDiagnose { Throws T; - ~ShouldDiagnose() noexcept { //expected-note {{destructor or deallocator has a}} + ~ShouldDiagnose() noexcept { //expected-note {{destructor has a non-throwing exception specification}} throw; // expected-warning {{has a non-throwing exception specification but}} } }; struct ShouldNotDiagnose { Throws T; - ~ShouldNotDiagnose() { - throw; + ~ShouldNotDiagnose() { + throw; } }; @@ -158,21 +158,21 @@ void g_ShouldNotDiag() noexcept { } } -void h_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void h_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} try { throw 12; // expected-warning {{has a non-throwing exception specification but}} } catch (const char *) { } } -void i_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void i_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} try { throw 12; } catch (int) { throw; // expected-warning {{has a non-throwing exception specification but}} } } -void j_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void j_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} try { throw 12; } catch (int) { @@ -180,7 +180,7 @@ void j_ShouldDiag() noexcept { //expected-note {{non-throwing function declare h } } -void k_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void k_ShouldDiag() 
noexcept { //expected-note {{function declared non-throwing here}} try { throw 12; } catch (...) { @@ -188,7 +188,7 @@ void k_ShouldDiag() noexcept { //expected-note {{non-throwing function declare h } } -void loo_ShouldDiag(int i) noexcept { //expected-note {{non-throwing function declare here}} +void loo_ShouldDiag(int i) noexcept { //expected-note {{function declared non-throwing here}} if (i) try { throw 12; @@ -203,13 +203,13 @@ void loo1_ShouldNotDiag() noexcept { throw 12; } -void loo2_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void loo2_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} if (1) throw 12; // expected-warning {{has a non-throwing exception specification but}} } struct S {}; -void l_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void l_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} try { throw S{}; //expected-warning {{has a non-throwing exception specification but}} } catch (S *s) { @@ -222,7 +222,6 @@ void m_ShouldNotDiag() noexcept { throw s; } catch (S s) { } - } void n_ShouldNotDiag() noexcept { try { @@ -231,7 +230,7 @@ void n_ShouldNotDiag() noexcept { } catch (const S &s) { } } -void o_ShouldDiag() noexcept { //expected-note {{non-throwing function declare here}} +void o_ShouldDiag() noexcept { //expected-note {{function declared non-throwing here}} try { throw; //expected-warning {{has a non-throwing exception specification but}} } catch (...) { @@ -239,7 +238,7 @@ void o_ShouldDiag() noexcept { //expected-note {{non-throwing function declare h } #define NOEXCEPT noexcept -void with_macro() NOEXCEPT { //expected-note {{non-throwing function declare here}} +void with_macro() NOEXCEPT { //expected-note {{function declared non-throwing here}} throw 1; // expected-warning {{has a non-throwing exception specification but}} } @@ -248,8 +247,8 @@ void with_try_block() try { } catch (...) 
{ } -void with_try_block1() noexcept try { //expected-note {{non-throwing function declare here}} - throw 2; // expected-warning {{has a non-throwing exception specification but}} +void with_try_block1() noexcept try { //expected-note {{function declared non-throwing here}} + throw 2; // expected-warning {{has a non-throwing exception specification but}} } catch (char *) { } @@ -272,20 +271,20 @@ void goodPointer() noexcept { throw &d; } catch (B *) {} } -void badPlain() noexcept { // expected-note {{non-throwing function declare here}} +void badPlain() noexcept { //expected-note {{function declared non-throwing here}} try { - throw B(); // expected-warning {{'badPlain' has a non-throwing exception specification but can still throw, resulting in unexpected program termination}} + throw B(); // expected-warning {{'badPlain' has a non-throwing exception specification but can still throw}} } catch (D) {} } -void badReference() noexcept { // expected-note {{non-throwing function declare here}} +void badReference() noexcept { //expected-note {{function declared non-throwing here}} try { - throw B(); // expected-warning {{'badReference' has a non-throwing exception specification but can still throw, resulting in unexpected program termination}} + throw B(); // expected-warning {{'badReference' has a non-throwing exception specification but can still throw}} } catch (D &) {} } -void badPointer() noexcept { // expected-note {{non-throwing function declare here}} +void badPointer() noexcept { //expected-note {{function declared non-throwing here}} B b; try { - throw &b; // expected-warning {{'badPointer' has a non-throwing exception specification but can still throw, resulting in unexpected program termination}} + throw &b; // expected-warning {{'badPointer' has a non-throwing exception specification but can still throw}} } catch (D *) {} } } diff --git a/test/SemaObjC/attr-availability.m b/test/SemaObjC/attr-availability.m index 02b7c5c81491..1245ac7409b8 100644 --- a/test/SemaObjC/attr-availability.m +++ b/test/SemaObjC/attr-availability.m @@ -13,7 +13,7 @@ @interface A
- (void)method __attribute__((availability(macosx,introduced=10.1,deprecated=10.2))); // expected-note {{'method' has been explicitly marked deprecated here}} #if defined(WARN_PARTIAL) - // expected-note@+2 {{'partialMethod' has been explicitly marked partial here}} + // expected-note@+2 2 {{'partialMethod' has been explicitly marked partial here}} #endif - (void)partialMethod __attribute__((availability(macosx,introduced=10.8))); @@ -66,7 +66,10 @@ void f(A *a, B *b) { @end void f_after_redecl(A *a, B *b) { - [a partialMethod]; // no warning +#ifdef WARN_PARTIAL + // expected-warning@+2{{'partialMethod' is only available on macOS 10.8 or newer}} expected-note@+2 {{@available}} +#endif + [a partialMethod]; [b partialMethod]; // no warning [a partial_proto_method]; // no warning [b partial_proto_method]; // no warning @@ -133,6 +136,10 @@ id NSNibOwner, topNibObjects; @end @interface PartialI +#ifdef WARN_PARTIAL +// expected-note@+3{{marked partial here}} +// expected-note@+3{{marked partial here}} +#endif - (void)partialMethod __attribute__((availability(macosx,introduced=10.8))); + (void)partialMethod __attribute__((availability(macosx,introduced=10.8))); @end @@ -160,14 +167,20 @@ id NSNibOwner, topNibObjects; @end void partialfun(PartialI* a) { - [a partialMethod]; // no warning +#ifdef WARN_PARTIAL + // expected-warning@+2 {{'partialMethod' is only available on macOS 10.8 or newer}} expected-note@+2{{@available}} +#endif + [a partialMethod]; [a ipartialMethod1]; // no warning #if defined(WARN_PARTIAL) // expected-warning@+2 {{'ipartialMethod2' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'ipartialMethod2' in an @available check to silence this warning}} #endif [a ipartialMethod2]; [a ppartialMethod]; // no warning - [PartialI partialMethod]; // no warning +#ifdef WARN_PARTIAL + // expected-warning@+2 {{'partialMethod' is only available on macOS 10.8 or newer}} expected-note@+2 {{@available}} +#endif + [PartialI partialMethod]; [PartialI ipartialMethod1]; // no warning #if defined(WARN_PARTIAL) // expected-warning@+2 {{'ipartialMethod2' is only available on macOS 10.8 or newer}} expected-note@+2 {{enclose 'ipartialMethod2' in an @available check to silence this warning}} @@ -177,20 +190,23 @@ void partialfun(PartialI* a) { } #if defined(WARN_PARTIAL) - // expected-note@+2 {{'PartialI2' has been explicitly marked partial here}} + // expected-note@+2 2 {{'PartialI2' has been explicitly marked partial here}} #endif __attribute__((availability(macosx, introduced = 10.8))) @interface PartialI2 @end #if defined(WARN_PARTIAL) - // expected-warning@+2 {{'PartialI2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'PartialI2' to silence this warning}} +// expected-warning@+2 {{'PartialI2' is partial: introduced in macOS 10.8}} expected-note@+2 {{annotate 'partialinter1' with an availability attribute to silence}} #endif void partialinter1(PartialI2* p) { } @class PartialI2; -void partialinter2(PartialI2* p) { // no warning +#ifdef WARN_PARTIAL +// expected-warning@+2 {{'PartialI2' is partial: introduced in macOS 10.8}} expected-note@+2 {{annotate 'partialinter2' with an availability attribute to silence}} +#endif +void partialinter2(PartialI2* p) { } diff --git a/test/SemaObjC/default-synthesize-3.m b/test/SemaObjC/default-synthesize-3.m index fe2b35f61523..9a05408aa060 100644 --- a/test/SemaObjC/default-synthesize-3.m +++ b/test/SemaObjC/default-synthesize-3.m @@ -173,13 +173,13 @@ typedef NSObject FooObject; @end @implementation Okay // 
expected-warning {{auto property synthesis will not synthesize property 'muahahaha' declared in protocol 'Fooing'}} expected-warning {{auto property synthesis will not synthesize property 'hoho' declared in protocol 'SubFooling'}}
-@end
+@end // expected-note 2 {{add a '@synthesize' directive}}
 @interface Fail : FooObject
 @end
 @implementation Fail // expected-warning {{auto property synthesis will not synthesize property 'muahahaha' declared in protocol 'Fooing'}} expected-warning {{auto property synthesis will not synthesize property 'hoho' declared in protocol 'SubFooling'}}
-@end
+@end // expected-note 2 {{add a '@synthesize' directive}}
 // rdar://16089191
 @class NSURL;
diff --git a/test/SemaObjC/default-synthesize.m b/test/SemaObjC/default-synthesize.m
index 3f0ae0261daf..61ce9317c519 100644
--- a/test/SemaObjC/default-synthesize.m
+++ b/test/SemaObjC/default-synthesize.m
@@ -137,7 +137,7 @@
 @end
 @implementation MyClass // expected-warning {{auto property synthesis will not synthesize property 'requiredString' declared in protocol 'MyProtocol'}}
-@end
+@end // expected-note {{add a '@synthesize' directive}}
 // rdar://18152478
 @protocol NSObject @end
diff --git a/test/SemaObjC/forward-protocol-incomplete-impl-warn.m b/test/SemaObjC/forward-protocol-incomplete-impl-warn.m
index c235e32316a9..583bb4dd891d 100644
--- a/test/SemaObjC/forward-protocol-incomplete-impl-warn.m
+++ b/test/SemaObjC/forward-protocol-incomplete-impl-warn.m
@@ -17,4 +17,4 @@
 @implementation IBImageCatalogDocument // expected-warning {{auto property synthesis will not synthesize property 'Prop' declared in protocol 'DVTInvalidation'}} \
 // expected-warning {{method 'invalidate' in protocol 'DVTInvalidation' not implemented}}
-@end
+@end // expected-note {{add a '@synthesize' directive}}
diff --git a/test/SemaObjC/objc-container-subscripting-1.m b/test/SemaObjC/objc-container-subscripting-1.m
index a58a7c3bda81..b5ed5a68dd84 100644
--- a/test/SemaObjC/objc-container-subscripting-1.m
+++ b/test/SemaObjC/objc-container-subscripting-1.m
@@ -16,8 +16,7 @@ id oldObject = array[10]; // expected-warning {{instance method '-objectAtIndexe
 array[10] = 0; // expected-warning {{instance method '-setObject:atIndexedSubscript:' not found (return type defaults to 'id')}}
 id<P> p_array;
-oldObject = p_array[10]; // expected-warning {{instance method '-objectAtIndexedSubscript:' not found (return type defaults to 'id')}}
+oldObject = p_array[10]; // expected-error {{expected method to read array element not found on object of type 'id<P>'}}
-p_array[10] = 0; // expected-warning {{instance method '-setObject:atIndexedSubscript:' not found (return type defaults to 'id')}}
+p_array[10] = 0; // expected-error {{expected method to write array element not found on object of type 'id<P>'}}
 }
-
diff --git a/test/SemaObjC/objc-container-subscripting-2.m b/test/SemaObjC/objc-container-subscripting-2.m
index 62320fcebb77..6e7ee7c40e58 100644
--- a/test/SemaObjC/objc-container-subscripting-2.m
+++ b/test/SemaObjC/objc-container-subscripting-2.m
@@ -28,3 +28,22 @@ void test_unused() {
 dict[array]; // expected-warning {{container access result unused - container access should not be used for side effects}}
 }
+void testQualifiedId(id<NSObject> qualifiedId) {
+  id object = qualifiedId[10]; // expected-error {{expected method to read array element not found on object of type 'id<NSObject>'}}
+  qualifiedId[10] = qualifiedId; // expected-error {{expected method to write array element not found on object of type 'id<NSObject>
'}} +} + +void testUnqualifiedId(id unqualId) { + id object = unqualId[10]; + unqualId[10] = object; +} + +@protocol Subscriptable +- (id)objectAtIndexedSubscript:(size_t)index; +- (void)setObject:(id)object atIndexedSubscript:(size_t)index; +@end + +void testValidQualifiedId(id qualifiedId) { + id object = qualifiedId[10]; + qualifiedId[10] = object; +} diff --git a/test/SemaObjC/unguarded-availability-new.m b/test/SemaObjC/unguarded-availability-new.m index 33baedebc279..9c44b164d541 100644 --- a/test/SemaObjC/unguarded-availability-new.m +++ b/test/SemaObjC/unguarded-availability-new.m @@ -96,16 +96,16 @@ typedef int AVAILABLE_NEXT new_int; FUNC_AVAILABLE new_int x; #ifndef NO_WARNING #ifdef MAC - // expected-warning@-3 {{'new_int' is partial: introduced in macOS 10.14}} expected-note@-3 {{explicitly redeclare 'new_int' to silence this warning}} + // expected-warning@-3 {{'new_int' is partial: introduced in macOS 10.14}} expected-note@-3 {{annotate 'x' with an availability attribute to silence}} #endif #ifdef IOS - // expected-warning@-6 {{'new_int' is partial: introduced in iOS 12}} expected-note@-6 {{explicitly redeclare 'new_int' to silence this warning}} + // expected-warning@-6 {{'new_int' is partial: introduced in iOS 12}} expected-note@-6 {{annotate 'x' with an availability attribute to silence}} #endif #ifdef TVOS - // expected-warning@-9 {{'new_int' is partial: introduced in tvOS 13}} expected-note@-9 {{explicitly redeclare 'new_int' to silence this warning}} + // expected-warning@-9 {{'new_int' is partial: introduced in tvOS 13}} expected-note@-9 {{annotate 'x' with an availability attribute to silence}} #endif #ifdef WATCHOS - // expected-warning@-12 {{'new_int' is partial: introduced in watchOS 5}} expected-note@-12 {{explicitly redeclare 'new_int' to silence this warning}} + // expected-warning@-12 {{'new_int' is partial: introduced in watchOS 5}} expected-note@-12 {{annotate 'x' with an availability attribute to silence}} #endif #endif diff --git a/test/SemaObjC/unguarded-availability.m b/test/SemaObjC/unguarded-availability.m index 5febee038e3c..139b79158afe 100644 --- a/test/SemaObjC/unguarded-availability.m +++ b/test/SemaObjC/unguarded-availability.m @@ -5,6 +5,8 @@ #define AVAILABLE_10_11 __attribute__((availability(macos, introduced = 10.11))) #define AVAILABLE_10_12 __attribute__((availability(macos, introduced = 10.12))) +typedef int AVAILABLE_10_12 new_int; // expected-note + {{marked partial here}} + int func_10_11() AVAILABLE_10_11; // expected-note 4 {{'func_10_11' has been explicitly marked partial here}} #ifdef OBJCPP @@ -70,9 +72,9 @@ void use_typedef() { } __attribute__((objc_root_class)) -AVAILABLE_10_11 @interface Class_10_11 { +AVAILABLE_10_11 @interface Class_10_11 { // expected-note{{annotate 'Class_10_11' with an availability attribute to silence}} int_10_11 foo; - int_10_12 bar; // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{redeclare}} + int_10_12 bar; // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} } - (void)method1; - (void)method2; @@ -125,7 +127,7 @@ void test_blocks() { }; } -void test_params(int_10_12 x); // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{redeclare}} +void test_params(int_10_12 x); // expected-warning {{'int_10_12' is partial: introduced in macOS 10.12}} expected-note{{annotate 'test_params' with an availability attribute to silence}} void test_params2(int_10_12 x) AVAILABLE_10_12; // no warn @@ -234,3 +236,54 @@ void 
functionInFunction() { } #endif + +struct InStruct { // expected-note{{annotate 'InStruct' with an availability attribute to silence}} + new_int mem; // expected-warning{{'new_int' is partial}} + + struct { new_int mem; } anon; // expected-warning{{'new_int' is partial}} expected-note{{annotate anonymous struct with an availability attribute}} +}; + +#ifdef OBJCPP +static constexpr int AVAILABLE_10_12 SomeConstexprValue = 2; // expected-note{{marked partial here}} +typedef enum { // expected-note{{annotate anonymous enum with an availability attribute}} + SomeValue = SomeConstexprValue // expected-warning{{'SomeConstexprValue' is partial}} +} SomeEnum; +#endif + +@interface InInterface +-(new_int)meth; // expected-warning{{'new_int' is partial}} expected-note{{annotate 'meth' with an availability attribute}} +@end + +@interface Proper // expected-note{{annotate 'Proper' with an availability attribute}} +@property (class) new_int x; // expected-warning{{'new_int' is partial}} +@end + +void with_local_struct() { + struct local { // expected-note{{annotate 'local' with an availability attribute}} + new_int x; // expected-warning{{'new_int' is partial}} + }; +} + +// rdar://33156429: +// Avoid the warning on protocol requirements. + +AVAILABLE_10_12 +@protocol NewProtocol // expected-note {{'NewProtocol' has been explicitly marked partial here}} +@end + +@protocol ProtocolWithNewProtocolRequirement // expected-note {{annotate 'ProtocolWithNewProtocolRequirement' with an availability attribute to silence}} + +@property(copy) id prop; // expected-warning {{'NewProtocol' is partial: introduced in macOS 10.12}} + +@end + +@interface BaseClass +@end + +@interface ClassWithNewProtocolRequirement : BaseClass + +@end + +@interface BaseClass (CategoryWithNewProtocolRequirement) + +@end diff --git a/test/SemaOpenCL/cl20-device-side-enqueue.cl b/test/SemaOpenCL/cl20-device-side-enqueue.cl index bafbe447ee28..3f6527afeadc 100644 --- a/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -19,19 +19,19 @@ kernel void enqueue_kernel_tests() { return 0; }); - enqueue_kernel(vptr, flags, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'queue_t' argument type}} + enqueue_kernel(vptr, flags, ndrange, ^(void) { // expected-error{{illegal call to 'enqueue_kernel', expected 'queue_t' argument type}} return 0; }); - enqueue_kernel(default_queue, vptr, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'kernel_enqueue_flags_t' (i.e. uint) argument type}} + enqueue_kernel(default_queue, vptr, ndrange, ^(void) { // expected-error{{illegal call to 'enqueue_kernel', expected 'kernel_enqueue_flags_t' (i.e. 
uint) argument type}} return 0; }); - enqueue_kernel(default_queue, flags, vptr, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'ndrange_t' argument type}} + enqueue_kernel(default_queue, flags, vptr, ^(void) { // expected-error{{illegal call to 'enqueue_kernel', expected 'ndrange_t' argument type}} return 0; }); - enqueue_kernel(default_queue, flags, ndrange, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}} + enqueue_kernel(default_queue, flags, ndrange, vptr); // expected-error{{illegal call to 'enqueue_kernel', expected block argument}} enqueue_kernel(default_queue, flags, ndrange, ^(int i) { // expected-error{{blocks with parameters are not accepted in this prototype of enqueue_kernel call}} return 0; @@ -46,21 +46,21 @@ kernel void enqueue_kernel_tests() { return 0; }); - enqueue_kernel(default_queue, flags, ndrange, vptr, &event_wait_list, &evt, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected integer argument type}} + enqueue_kernel(default_queue, flags, ndrange, vptr, &event_wait_list, &evt, ^(void) { // expected-error{{illegal call to 'enqueue_kernel', expected integer argument type}} return 0; }); - enqueue_kernel(default_queue, flags, ndrange, 1, vptr, &evt, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}} + enqueue_kernel(default_queue, flags, ndrange, 1, vptr, &evt, ^(void) // expected-error{{illegal call to 'enqueue_kernel', expected 'clk_event_t *' argument type}} { return 0; }); - enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, vptr, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}} + enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, vptr, ^(void) // expected-error{{illegal call to 'enqueue_kernel', expected 'clk_event_t *' argument type}} { return 0; }); - enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}} + enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, vptr); // expected-error{{illegal call to 'enqueue_kernel', expected block argument}} // Testing the third overload type enqueue_kernel(default_queue, flags, ndrange, diff --git a/test/SemaOpenCL/images.cl b/test/SemaOpenCL/images.cl index f963de4e1359..9d6092c8c916 100644 --- a/test/SemaOpenCL/images.cl +++ b/test/SemaOpenCL/images.cl @@ -1,9 +1,32 @@ -// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only +// RUN: %clang_cc1 %s -cl-std=CL2.0 -verify -pedantic -fsyntax-only -void img2d_ro(__read_only image2d_t img) {} // expected-note{{passing argument to parameter 'img' here}} expected-note{{passing argument to parameter 'img' here}} +void img2d_ro(read_only image2d_t); // expected-note 3{{passing argument to parameter here}} +void img2d_wo(write_only image2d_t); // expected-note 2{{passing argument to parameter here}} +void img2d_rw(read_write image2d_t); // expected-note 2{{passing argument to parameter here}} +void img2d_default(image2d_t); // expected-note 2{{passing argument to parameter here}} -void imgage_access_test(image2d_t img2dro, write_only image2d_t img2dwo, image3d_t img3dro) { - img2d_ro(img2dro); - img2d_ro(img2dwo); // expected-error{{passing '__write_only image2d_t' to parameter of incompatible type '__read_only image2d_t'}} +void imgage_access_test(image2d_t img2dro, image3d_t img3dro) { + img2d_ro(img2dro); // read_only = read_only img2d_ro(img3dro); // 
expected-error{{passing '__read_only image3d_t' to parameter of incompatible type '__read_only image2d_t'}} } + +kernel void read_only_access_test(read_only image2d_t img) { + img2d_ro(img); // read_only = read_only + img2d_wo(img); // expected-error {{passing '__read_only image2d_t' to parameter of incompatible type '__write_only image2d_t'}} + img2d_rw(img); // expected-error {{passing '__read_only image2d_t' to parameter of incompatible type '__read_write image2d_t'}} + img2d_default(img); // read_only = read_only +} + +kernel void write_only_access_test(write_only image2d_t img) { + img2d_ro(img); // expected-error {{passing '__write_only image2d_t' to parameter of incompatible type '__read_only image2d_t'}} + img2d_wo(img); // write_only = write_only + img2d_rw(img); // expected-error {{passing '__write_only image2d_t' to parameter of incompatible type '__read_write image2d_t'}} + img2d_default(img); // expected-error {{passing '__write_only image2d_t' to parameter of incompatible type '__read_only image2d_t'}} +} + +kernel void read_write_access_test(read_write image2d_t img) { + img2d_ro(img); // expected-error {{passing '__read_write image2d_t' to parameter of incompatible type '__read_only image2d_t'}} + img2d_wo(img); // expected-error {{passing '__read_write image2d_t' to parameter of incompatible type '__write_only image2d_t'}} + img2d_rw(img); //read_write = read_write + img2d_default(img); // expected-error {{passing '__read_write image2d_t' to parameter of incompatible type '__read_only image2d_t'}} +} diff --git a/test/SemaTemplate/constexpr-instantiate.cpp b/test/SemaTemplate/constexpr-instantiate.cpp index dfb8a07d3b7d..31dbdb617a6a 100644 --- a/test/SemaTemplate/constexpr-instantiate.cpp +++ b/test/SemaTemplate/constexpr-instantiate.cpp @@ -191,7 +191,7 @@ namespace Unevaluated { static constexpr bool f() { return sizeof(T) < U::size; } template - static typename enable_if(), void>::type g() {} // expected-note {{disabled by 'enable_if'}} + static typename enable_if(), void>::type g() {} // expected-note {{requirement 'f()' was not satisfied}} }; struct U { static constexpr int size = 2; }; diff --git a/test/SemaTemplate/overload-candidates.cpp b/test/SemaTemplate/overload-candidates.cpp index c0abb5b17488..de998d74f9af 100644 --- a/test/SemaTemplate/overload-candidates.cpp +++ b/test/SemaTemplate/overload-candidates.cpp @@ -47,7 +47,7 @@ namespace boost { template struct enable_if {}; template struct enable_if { typedef T type; }; } -template typename boost::enable_if::type if_size_4(); // expected-note{{candidate template ignored: disabled by 'enable_if' [with T = char]}} +template typename boost::enable_if::type if_size_4(); // expected-note{{candidate template ignored: requirement 'sizeof(char) == 4' was not satisfied [with T = char]}} int k = if_size_4(); // expected-error{{no matching function}} namespace llvm { @@ -61,7 +61,7 @@ void test_if_int() { } template struct NonTemplateFunction { - typename boost::enable_if::type f(); // expected-error{{no type named 'type' in 'boost::enable_if'; 'enable_if' cannot be used to disable this declaration}} + typename boost::enable_if::type f(); // expected-error{{failed requirement 'sizeof(char) == 4'; 'enable_if' cannot be used to disable this declaration}} }; NonTemplateFunction NTFC; // expected-note{{here}} @@ -100,7 +100,7 @@ namespace PR15673 { #if __cplusplus <= 199711L // expected-warning@-2 {{default template arguments for a function template are a C++11 extension}} #endif - // expected-note@-4 {{candidate template 
ignored: disabled by 'enable_if' [with T = int]}} + // expected-note@+1 {{candidate template ignored: requirement 'a_trait::value' was not satisfied [with T = int]}} void foo() {} void bar() { foo(); } // expected-error {{no matching function for call to 'foo'}} @@ -128,7 +128,7 @@ namespace PR15673 { #if __cplusplus <= 199711L // expected-warning@-2 {{alias declarations are a C++11 extension}} #endif - // expected-note@-4 {{candidate template ignored: disabled by 'enable_if' [with T = int]}} + // expected-note@+7 {{candidate template ignored: requirement 'some_trait::value' was not satisfied [with T = int]}} template > @@ -137,4 +137,30 @@ namespace PR15673 { #endif void wibble() {} void wobble() { wibble(); } // expected-error {{no matching function for call to 'wibble'}} + + template + struct some_passing_trait : std::true_type {}; + +#if __cplusplus <= 199711L + // expected-warning@+4 {{default template arguments for a function template are a C++11 extension}} + // expected-warning@+4 {{default template arguments for a function template are a C++11 extension}} +#endif + template::value && some_trait::value), int>::type = 0> + void almost_rangesv3(); // expected-note{{candidate template ignored: requirement '42 == 43 || (some_passing_trait::value && some_trait::value)' was not satisfied}} + void test_almost_rangesv3() { almost_rangesv3(); } // expected-error{{no matching function for call to 'almost_rangesv3'}} + + #define CONCEPT_REQUIRES_(...) \ + int x = 42, \ + typename std::enable_if<(x == 43) || (__VA_ARGS__)>::type = 0 + +#if __cplusplus <= 199711L + // expected-warning@+4 {{default template arguments for a function template are a C++11 extension}} + // expected-warning@+3 {{default template arguments for a function template are a C++11 extension}} +#endif + template::value && some_trait::value)> + void rangesv3(); // expected-note{{candidate template ignored: requirement 'some_trait::value' was not satisfied [with T = int, x = 42]}} + void test_rangesv3() { rangesv3(); } // expected-error{{no matching function for call to 'rangesv3'}} } diff --git a/test/Unit/lit.cfg b/test/Unit/lit.cfg index 39fad0099ceb..90eb2ac604a5 100644 --- a/test/Unit/lit.cfg +++ b/test/Unit/lit.cfg @@ -4,6 +4,7 @@ import os import platform +import subprocess import lit.formats import lit.util @@ -65,8 +66,8 @@ if config.test_exec_root is None: lit_config.fatal('No site specific configuration available!') # Get the source and object roots. - llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip() - llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip() + llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip() + llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip() clang_src_root = os.path.join(llvm_src_root, "tools", "clang") clang_obj_root = os.path.join(llvm_obj_root, "tools", "clang") diff --git a/test/lit.cfg b/test/lit.cfg index e72eca6bd397..28026578299b 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -148,8 +148,8 @@ if config.test_exec_root is None: lit_config.fatal('No site specific configuration available!') # Get the source and object roots. 
- llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip() - llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip() + llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip() + llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip() clang_src_root = os.path.join(llvm_src_root, "tools", "clang") clang_obj_root = os.path.join(llvm_obj_root, "tools", "clang") diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c index 1e925569dd95..cf3581e259f7 100644 --- a/tools/c-index-test/c-index-test.c +++ b/tools/c-index-test/c-index-test.c @@ -804,6 +804,8 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) { printf(" (const)"); if (clang_CXXMethod_isPureVirtual(Cursor)) printf(" (pure)"); + if (clang_EnumDecl_isScoped(Cursor)) + printf(" (scoped)"); if (clang_Cursor_isVariadic(Cursor)) printf(" (variadic)"); if (clang_Cursor_isObjCOptional(Cursor)) diff --git a/tools/clang-import-test/clang-import-test.cpp b/tools/clang-import-test/clang-import-test.cpp index db4dc76eded5..6b724e9cf5fa 100644 --- a/tools/clang-import-test/clang-import-test.cpp +++ b/tools/clang-import-test/clang-import-test.cpp @@ -182,14 +182,6 @@ BuildCompilerInstance(ArrayRef ClangArgv) { return Ins; } -std::unique_ptr -BuildCompilerInstance(ArrayRef ClangArgs) { - std::vector ClangArgv(ClangArgs.size()); - std::transform(ClangArgs.begin(), ClangArgs.end(), ClangArgv.begin(), - [](const std::string &s) -> const char * { return s.data(); }); - return init_convenience::BuildCompilerInstance(ClangArgv); -} - std::unique_ptr BuildASTContext(CompilerInstance &CI, SelectorTable &ST, Builtin::Context &BC) { auto AST = llvm::make_unique( @@ -313,14 +305,8 @@ int main(int argc, const char **argv) { std::vector> IndirectCIs; if (!Direct) { for (auto &ImportCI : ImportCIs) { - llvm::Expected> IndirectCI = - BuildIndirect(ImportCI); - if (auto E = IndirectCI.takeError()) { - llvm::errs() << llvm::toString(std::move(E)); - exit(-1); - } else { - IndirectCIs.push_back(std::move(*IndirectCI)); - } + std::unique_ptr IndirectCI = BuildIndirect(ImportCI); + IndirectCIs.push_back(std::move(IndirectCI)); } } llvm::Expected> ExpressionCI = diff --git a/tools/diagtool/CMakeLists.txt b/tools/diagtool/CMakeLists.txt index e88c2ab6e8c3..3f7d80385a82 100644 --- a/tools/diagtool/CMakeLists.txt +++ b/tools/diagtool/CMakeLists.txt @@ -6,6 +6,7 @@ add_clang_executable(diagtool diagtool_main.cpp DiagTool.cpp DiagnosticNames.cpp + FindDiagnosticID.cpp ListWarnings.cpp ShowEnabledWarnings.cpp TreeView.cpp diff --git a/tools/diagtool/FindDiagnosticID.cpp b/tools/diagtool/FindDiagnosticID.cpp new file mode 100644 index 000000000000..167b9925eedc --- /dev/null +++ b/tools/diagtool/FindDiagnosticID.cpp @@ -0,0 +1,58 @@ +//===- FindDiagnosticID.cpp - diagtool tool for finding diagnostic id -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "DiagTool.h" +#include "DiagnosticNames.h" +#include "clang/Basic/AllDiagnostics.h" +#include "llvm/Support/CommandLine.h" + +DEF_DIAGTOOL("find-diagnostic-id", "Print the id of the given diagnostic", + FindDiagnosticID) + +using namespace clang; +using namespace diagtool; + +static Optional +findDiagnostic(ArrayRef Diagnostics, StringRef Name) { + for (const auto &Diag : Diagnostics) { + StringRef DiagName = Diag.getName(); + if (DiagName == Name) + return Diag; + } + return None; +} + +int FindDiagnosticID::run(unsigned int argc, char **argv, + llvm::raw_ostream &OS) { + static llvm::cl::OptionCategory FindDiagnosticIDOptions( + "diagtool find-diagnostic-id options"); + + static llvm::cl::opt DiagnosticName( + llvm::cl::Positional, llvm::cl::desc(""), + llvm::cl::Required, llvm::cl::cat(FindDiagnosticIDOptions)); + + std::vector Args; + Args.push_back("find-diagnostic-id"); + for (const char *A : llvm::makeArrayRef(argv, argc)) + Args.push_back(A); + + llvm::cl::HideUnrelatedOptions(FindDiagnosticIDOptions); + llvm::cl::ParseCommandLineOptions((int)Args.size(), Args.data(), + "Diagnostic ID mapping utility"); + + ArrayRef AllDiagnostics = getBuiltinDiagnosticsByName(); + Optional Diag = + findDiagnostic(AllDiagnostics, DiagnosticName); + if (!Diag) { + llvm::errs() << "error: invalid diagnostic '" << DiagnosticName << "'\n"; + return 1; + } + OS << Diag->DiagID << "\n"; + return 0; +} diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp index deaab3608e62..236f264c1758 100644 --- a/tools/libclang/CIndex.cpp +++ b/tools/libclang/CIndex.cpp @@ -7086,8 +7086,10 @@ CXLinkageKind clang_getCursorLinkage(CXCursor cursor) { switch (ND->getLinkageInternal()) { case NoLinkage: case VisibleNoLinkage: return CXLinkage_NoLinkage; + case ModuleInternalLinkage: case InternalLinkage: return CXLinkage_Internal; case UniqueExternalLinkage: return CXLinkage_UniqueExternal; + case ModuleLinkage: case ExternalLinkage: return CXLinkage_External; }; @@ -7805,6 +7807,15 @@ unsigned clang_CXXMethod_isVirtual(CXCursor C) { return (Method && Method->isVirtual()) ? 1 : 0; } +unsigned clang_EnumDecl_isScoped(CXCursor C) { + if (!clang_isDeclaration(C.kind)) + return 0; + + const Decl *D = cxcursor::getCursorDecl(C); + auto *Enum = dyn_cast_or_null(D); + return (Enum && Enum->isScoped()) ? 1 : 0; +} + //===----------------------------------------------------------------------===// // Attribute introspection. 
//===----------------------------------------------------------------------===// diff --git a/tools/libclang/CXIndexDataConsumer.cpp b/tools/libclang/CXIndexDataConsumer.cpp index 9cd5ff4f505f..a2ef68be49de 100644 --- a/tools/libclang/CXIndexDataConsumer.cpp +++ b/tools/libclang/CXIndexDataConsumer.cpp @@ -423,11 +423,13 @@ bool CXIndexDataConsumer::isFunctionLocalDecl(const Decl *D) { if (const NamedDecl *ND = dyn_cast(D)) { switch (ND->getFormalLinkage()) { case NoLinkage: - case VisibleNoLinkage: case InternalLinkage: return true; + case VisibleNoLinkage: + case ModuleInternalLinkage: case UniqueExternalLinkage: llvm_unreachable("Not a sema linkage"); + case ModuleLinkage: case ExternalLinkage: return false; } diff --git a/tools/libclang/libclang.exports b/tools/libclang/libclang.exports index e78899e4c759..e0d178a5291a 100644 --- a/tools/libclang/libclang.exports +++ b/tools/libclang/libclang.exports @@ -12,6 +12,7 @@ clang_CXXMethod_isConst clang_CXXMethod_isPureVirtual clang_CXXMethod_isStatic clang_CXXMethod_isVirtual +clang_EnumDecl_isScoped clang_Cursor_getArgument clang_Cursor_getNumTemplateArguments clang_Cursor_getTemplateArgumentKind diff --git a/unittests/ASTMatchers/ASTMatchersNodeTest.cpp b/unittests/ASTMatchers/ASTMatchersNodeTest.cpp index dfaa441cd764..58c26eafd7e0 100644 --- a/unittests/ASTMatchers/ASTMatchersNodeTest.cpp +++ b/unittests/ASTMatchers/ASTMatchersNodeTest.cpp @@ -666,6 +666,12 @@ TEST(Matcher, IntegerLiterals) { EXPECT_TRUE(notMatches("int i = 'a';", HasIntLiteral)); EXPECT_TRUE(notMatches("int i = 1e10;", HasIntLiteral)); EXPECT_TRUE(notMatches("int i = 10.0;", HasIntLiteral)); + + // Negative integers. + EXPECT_TRUE( + matches("int i = -10;", + unaryOperator(hasOperatorName("-"), + hasUnaryOperand(integerLiteral(equals(10)))))); } TEST(Matcher, FloatLiterals) { diff --git a/unittests/Format/CMakeLists.txt b/unittests/Format/CMakeLists.txt index 5c04ba1143c6..fa7e32c33d9f 100644 --- a/unittests/Format/CMakeLists.txt +++ b/unittests/Format/CMakeLists.txt @@ -11,6 +11,7 @@ add_clang_unittest(FormatTests FormatTestObjC.cpp FormatTestProto.cpp FormatTestSelective.cpp + FormatTestTextProto.cpp NamespaceEndCommentsFixerTest.cpp SortImportsTestJS.cpp SortIncludesTest.cpp diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp index b5f959f9c1f7..937362f5c9d7 100644 --- a/unittests/Format/FormatTest.cpp +++ b/unittests/Format/FormatTest.cpp @@ -825,12 +825,35 @@ TEST_F(FormatTest, FormatsSwitchStatement) { " case A:\n" " f();\n" " break;\n" - " // On B:\n" + " // fallthrough\n" " case B:\n" " g();\n" " break;\n" " }\n" "});"); + EXPECT_EQ("DEBUG({\n" + " switch (x) {\n" + " case A:\n" + " f();\n" + " break;\n" + " // On B:\n" + " case B:\n" + " g();\n" + " break;\n" + " }\n" + "});", + format("DEBUG({\n" + " switch (x) {\n" + " case A:\n" + " f();\n" + " break;\n" + " // On B:\n" + " case B:\n" + " g();\n" + " break;\n" + " }\n" + "});", + getLLVMStyle())); verifyFormat("switch (a) {\n" "case (b):\n" " return;\n" diff --git a/unittests/Format/FormatTestComments.cpp b/unittests/Format/FormatTestComments.cpp index fdb5a08e7a21..7916e65e5114 100644 --- a/unittests/Format/FormatTestComments.cpp +++ b/unittests/Format/FormatTestComments.cpp @@ -805,6 +805,70 @@ TEST_F(FormatTestComments, ParsesCommentsAdjacentToPPDirectives) { format("namespace {}\n /* Test */ #define A")); } +TEST_F(FormatTestComments, KeepsLevelOfCommentBeforePPDirective) { + // Keep the current level if the comment was originally not aligned with + // the preprocessor 
directive. + EXPECT_EQ("void f() {\n" + " int i;\n" + " /* comment */\n" + "#ifdef A\n" + " int j;\n" + "}", + format("void f() {\n" + " int i;\n" + " /* comment */\n" + "#ifdef A\n" + " int j;\n" + "}")); + + EXPECT_EQ("void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}", + format("void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}")); + + // Keep the current level if there is an empty line between the comment and + // the preprocessor directive. + EXPECT_EQ("void f() {\n" + " int i;\n" + " /* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}", + format("void f() {\n" + " int i;\n" + "/* comment */\n" + "\n" + "#ifdef A\n" + " int j;\n" + "}")); + + // Align with the preprocessor directive if the comment was originally aligned + // with the preprocessor directive. + EXPECT_EQ("void f() {\n" + " int i;\n" + "/* comment */\n" + "#ifdef A\n" + " int j;\n" + "}", + format("void f() {\n" + " int i;\n" + "/* comment */\n" + "#ifdef A\n" + " int j;\n" + "}")); +} + TEST_F(FormatTestComments, SplitsLongLinesInComments) { EXPECT_EQ("/* This is a long\n" " * comment that\n" diff --git a/unittests/Format/FormatTestJS.cpp b/unittests/Format/FormatTestJS.cpp index e84f470687ec..11e386a1c7c7 100644 --- a/unittests/Format/FormatTestJS.cpp +++ b/unittests/Format/FormatTestJS.cpp @@ -930,6 +930,14 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) { " aaa\n" "];", getGoogleJSStyleWithColumns(12)); + verifyFormat("class X {\n" + " readonly ratherLongField =\n" + " 1;\n" + "}", + "class X {\n" + " readonly ratherLongField = 1;\n" + "}", + getGoogleJSStyleWithColumns(20)); } TEST_F(FormatTestJS, AutomaticSemicolonInsertionHeuristic) { @@ -1622,6 +1630,7 @@ TEST_F(FormatTestJS, NestedTemplateStrings) { TEST_F(FormatTestJS, TaggedTemplateStrings) { verifyFormat("var x = html`

    `;"); + verifyFormat("yield `hello`;"); } TEST_F(FormatTestJS, CastSyntax) { diff --git a/unittests/Format/FormatTestProto.cpp b/unittests/Format/FormatTestProto.cpp index 2e3b9311d12c..639da87c6ea9 100644 --- a/unittests/Format/FormatTestProto.cpp +++ b/unittests/Format/FormatTestProto.cpp @@ -199,11 +199,11 @@ TEST_F(FormatTestProto, FormatsOptions) { "};"); verifyFormat("option (MyProto.options) = {\n" " field_c: \"OK\"\n" - " msg_field{field_d: 123}\n" + " msg_field {field_d: 123}\n" "};"); verifyFormat("option (MyProto.options) = {\n" " field_a: OK\n" - " field_b{field_c: OK}\n" + " field_b {field_c: OK}\n" " field_d: OKOKOK\n" " field_e: OK\n" "}"); @@ -216,7 +216,7 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = {\n" " field_a: OK\n" - " field_b\n" + " field_b \n" " field_d: OKOKOK\n" " field_e: OK\n" "}"); @@ -255,7 +255,7 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = <\n" " field_a: \"OK\"\n" - " msg_field<\n" + " msg_field <\n" " field_b: OK\n" " field_c: OK\n" " field_d: OK\n" @@ -267,7 +267,7 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = <\n" " field_a: \"OK\"\n" - " msg_field<\n" + " msg_field <\n" " field_b: OK,\n" " field_c: OK,\n" " field_d: OK,\n" @@ -303,7 +303,7 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = <\n" " field_a: \"OK\"\n" - " msg_field{\n" + " msg_field {\n" " field_b: OK\n" " field_c: OK\n" " field_d: OK\n" @@ -315,7 +315,7 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = {\n" " field_a: \"OK\"\n" - " msg_field<\n" + " msg_field <\n" " field_b: OK\n" " field_c: OK\n" " field_d: OK\n" @@ -339,18 +339,18 @@ TEST_F(FormatTestProto, FormatsOptions) { verifyFormat("option (MyProto.options) = <\n" " field_a: \"OK\"\n" - " msg_field{\n" + " msg_field {\n" " field_b: OK\n" " field_c: OK\n" " field_d: OK\n" - " msg_field<\n" + " msg_field <\n" " field_A: 1\n" " field_B: 2\n" " field_C: 3\n" " field_D: 4\n" " field_E: 5\n" " >\n" - " msg_field\n" + " msg_field \n" " field_e: OK\n" " field_f: OK\n" " }\n" diff --git a/unittests/Format/FormatTestTextProto.cpp b/unittests/Format/FormatTestTextProto.cpp new file mode 100644 index 000000000000..2de7e181f2cb --- /dev/null +++ b/unittests/Format/FormatTestTextProto.cpp @@ -0,0 +1,250 @@ +//===- unittest/Format/FormatTestProto.cpp --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "FormatTestUtils.h" +#include "clang/Format/Format.h" +#include "llvm/Support/Debug.h" +#include "gtest/gtest.h" + +#define DEBUG_TYPE "format-test" + +namespace clang { +namespace format { + +class FormatTestTextProto : public ::testing::Test { +protected: + static std::string format(llvm::StringRef Code, unsigned Offset, + unsigned Length, const FormatStyle &Style) { + DEBUG(llvm::errs() << "---\n"); + DEBUG(llvm::errs() << Code << "\n\n"); + std::vector Ranges(1, tooling::Range(Offset, Length)); + tooling::Replacements Replaces = reformat(Style, Code, Ranges); + auto Result = applyAllReplacements(Code, Replaces); + EXPECT_TRUE(static_cast(Result)); + DEBUG(llvm::errs() << "\n" << *Result << "\n\n"); + return *Result; + } + + static std::string format(llvm::StringRef Code) { + FormatStyle Style = getGoogleStyle(FormatStyle::LK_TextProto); + Style.ColumnLimit = 60; // To make writing tests easier. + return format(Code, 0, Code.size(), Style); + } + + static void verifyFormat(llvm::StringRef Code) { + EXPECT_EQ(Code.str(), format(test::messUp(Code))); + } +}; + +TEST_F(FormatTestTextProto, KeepsTopLevelEntriesFittingALine) { + verifyFormat("field_a: OK field_b: OK field_c: OK field_d: OK field_e: OK"); +} + +TEST_F(FormatTestTextProto, SupportsMessageFields) { + verifyFormat("msg_field: {}"); + + verifyFormat("msg_field: {field_a: A}"); + + verifyFormat("msg_field: {field_a: \"OK\" field_b: 123}"); + + verifyFormat("msg_field: {\n" + " field_a: 1\n" + " field_b: OK\n" + " field_c: \"OK\"\n" + " field_d: 123\n" + " field_e: 23\n" + "}"); + + verifyFormat("msg_field {}"); + + verifyFormat("msg_field {field_a: A}"); + + verifyFormat("msg_field {field_a: \"OK\" field_b: 123}"); + + verifyFormat("msg_field {\n" + " field_a: 1\n" + " field_b: OK\n" + " field_c: \"OK\"\n" + " field_d: 123\n" + " field_e: 23.0\n" + " field_f: false\n" + " field_g: 'lala'\n" + " field_h: 1234.567e-89\n" + "}"); + + verifyFormat("msg_field: {msg_field {field_a: 1}}"); + + verifyFormat("id: \"ala.bala\"\n" + "item {type: ITEM_A rank: 1 score: 90.0}\n" + "item {type: ITEM_B rank: 2 score: 70.5}\n" + "item {\n" + " type: ITEM_A\n" + " rank: 3\n" + " score: 20.0\n" + " description: \"the third item has a description\"\n" + "}"); +} + +TEST_F(FormatTestTextProto, AvoidsTopLevelBinPacking) { + verifyFormat("field_a: OK\n" + "field_b: OK\n" + "field_c: OK\n" + "field_d: OK\n" + "field_e: OK\n" + "field_f: OK"); + + verifyFormat("field_a: OK\n" + "field_b: \"OK\"\n" + "field_c: \"OK\"\n" + "msg_field: {field_d: 123}\n" + "field_e: OK\n" + "field_f: OK"); + + verifyFormat("field_a: OK\n" + "field_b: \"OK\"\n" + "field_c: \"OK\"\n" + "msg_field: {field_d: 123 field_e: OK}"); + + verifyFormat("a: {\n" + " field_a: OK\n" + " field_b {field_c: OK}\n" + " field_d: OKOKOK\n" + " field_e: OK\n" + "}"); + + verifyFormat("field_a: OK,\n" + "field_b {field_c: OK},\n" + "field_d: OKOKOK,\n" + "field_e: OK"); +} + +TEST_F(FormatTestTextProto, AddsNewlinesAfterTrailingComments) { + verifyFormat("field_a: OK // Comment\n" + "field_b: 1"); + + verifyFormat("field_a: OK\n" + "msg_field: {\n" + " field_b: OK // Comment\n" + "}"); + + verifyFormat("field_a: OK\n" + "msg_field {\n" + " field_b: OK // Comment\n" + "}"); +} + +TEST_F(FormatTestTextProto, SupportsAngleBracketMessageFields) { + // Single-line tests + verifyFormat("msg_field <>"); + verifyFormat("msg_field: <>"); + verifyFormat("msg_field "); + verifyFormat("msg_field: "); + 
verifyFormat("msg_field >"); + verifyFormat("msg_field >>"); + verifyFormat("msg_field: >>"); + verifyFormat("msg_field "); + verifyFormat("msg_field , field_c: OK>"); + verifyFormat("msg_field >"); + verifyFormat("msg_field: "); + verifyFormat("msg_field: , field_c: OK>"); + verifyFormat("msg_field: >"); + verifyFormat("field_a: \"OK\", msg_field: , field_c: {}"); + verifyFormat("field_a , msg_field: , field_c <>"); + verifyFormat("field_a msg_field: field_c <>"); + verifyFormat("field >, field <>> field: "); + + // Multiple lines tests + verifyFormat("msg_field <\n" + " field_a: OK\n" + " field_b: \"OK\"\n" + " field_c: 1\n" + " field_d: 12.5\n" + " field_e: OK\n" + ">"); + + verifyFormat("msg_field: <>\n" + "field_c: \"OK\",\n" + "msg_field: \n" + "field_e: OK\n" + "msg_field: "); + + verifyFormat("field_a: OK,\n" + "field_b ,\n" + "field_d: <12.5>,\n" + "field_e: OK"); + + verifyFormat("field_a: OK\n" + "field_b \n" + "field_d: <12.5>\n" + "field_e: OKOKOK"); + + verifyFormat("msg_field <\n" + " field_a: OK,\n" + " field_b ,\n" + " field_d: <12.5>,\n" + " field_e: OK\n" + ">"); + + verifyFormat("msg_field <\n" + " field_a: ,\n" + " field_b ,\n" + " field_d: <12.5>,\n" + " field_e: OK,\n" + ">"); + + verifyFormat("msg_field: <\n" + " field_a: \"OK\"\n" + " msg_field: {field_b: OK}\n" + " field_g: OK\n" + " field_g: OK\n" + " field_g: OK\n" + ">"); + + verifyFormat("field_a {\n" + " field_d: ok\n" + " field_b: \n" + " field_d: ok\n" + " field_d: ok\n" + "}"); + + verifyFormat("field_a: {\n" + " field_d: ok\n" + " field_b: \n" + " field_d: ok\n" + " field_d: ok\n" + "}"); + + verifyFormat("field_a: >\n" + "field_b <\n" + " field_b1: <>\n" + " field_b2: ok,\n" + " field_b3: <\n" + " field_x {} // Comment\n" + " field_y: {field_z: 1}\n" + " field_w: ok\n" + " >\n" + " field {\n" + " field_x <> // Comment\n" + " field_y: \n" + " field_w: ok\n" + " msg_field: <\n" + " field: <>\n" + " field: \n" + " field: \n" + " field: \n" + " field: \n" + " field: ok\n" + " >\n" + " }\n" + ">\n" + "field: OK,\n" + "field_c >>"); +} +} // end namespace tooling +} // end namespace clang diff --git a/unittests/Tooling/RecursiveASTVisitorTest.cpp b/unittests/Tooling/RecursiveASTVisitorTest.cpp index 74c9c3db34d8..63dbd1475454 100644 --- a/unittests/Tooling/RecursiveASTVisitorTest.cpp +++ b/unittests/Tooling/RecursiveASTVisitorTest.cpp @@ -244,4 +244,62 @@ TEST(RecursiveASTVisitor, InitListExprIsPostOrderNoQueueVisitedTwice) { InitListExprPostOrderNoQueueVisitor::Lang_C)); } +// Check to ensure that nested name specifiers are visited. 
+class NestedNameSpecifiersVisitor + : public ExpectedLocationVisitor { +public: + bool VisitRecordTypeLoc(RecordTypeLoc RTL) { + if (!RTL) + return true; + Match(RTL.getDecl()->getName(), RTL.getNameLoc()); + return true; + } + + bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) { + if (!NNS) + return true; + if (const NamespaceDecl *ND = + NNS.getNestedNameSpecifier()->getAsNamespace()) + Match(ND->getName(), NNS.getLocalBeginLoc()); + return ExpectedLocationVisitor::TraverseNestedNameSpecifierLoc(NNS); + } +}; + +TEST(RecursiveASTVisitor, + NestedNameSpecifiersForTemplateSpecializationsAreVisited) { + StringRef Source = R"( +namespace ns { +struct Outer { + template + struct Nested { }; + + template + static T x; +}; +} + +template<> +struct ns::Outer::Nested; + +template<> +struct ns::Outer::Nested { }; + +template +struct ns::Outer::Nested { }; + +template<> +int ns::Outer::x = 0; +)"; + NestedNameSpecifiersVisitor Visitor; + Visitor.ExpectMatch("ns", 13, 8); + Visitor.ExpectMatch("ns", 16, 8); + Visitor.ExpectMatch("ns", 19, 8); + Visitor.ExpectMatch("ns", 22, 5); + Visitor.ExpectMatch("Outer", 13, 12); + Visitor.ExpectMatch("Outer", 16, 12); + Visitor.ExpectMatch("Outer", 19, 12); + Visitor.ExpectMatch("Outer", 22, 9); + EXPECT_TRUE(Visitor.runOver(Source, NestedNameSpecifiersVisitor::Lang_CXX14)); +} + } // end anonymous namespace diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp index 62fcccbacb55..49c1edce3220 100644 --- a/utils/TableGen/NeonEmitter.cpp +++ b/utils/TableGen/NeonEmitter.cpp @@ -860,10 +860,6 @@ void Type::applyModifier(char Mod) { Float = true; ElementBitwidth = 64; break; - case 'H': - Float = true; - ElementBitwidth = 16; - break; case 'g': if (AppliedQuad) Bitwidth /= 2; @@ -1010,7 +1006,7 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const { } static bool isFloatingPointProtoModifier(char Mod) { - return Mod == 'F' || Mod == 'f' || Mod == 'H'; + return Mod == 'F' || Mod == 'f'; } std::string Intrinsic::getBuiltinTypeStr() { diff --git a/utils/bash-autocomplete.sh b/utils/bash-autocomplete.sh index 775b1f45351a..1e38bcfcaad7 100755 --- a/utils/bash-autocomplete.sh +++ b/utils/bash-autocomplete.sh @@ -1,45 +1,75 @@ # Please add "source /path/to/bash-autocomplete.sh" to your .bashrc to use this. + +_clang_filedir() +{ + # _filedir function provided by recent versions of bash-completion package is + # better than "compgen -f" because the former honors spaces in pathnames while + # the latter doesn't. So we use compgen only when _filedir is not provided. + _filedir 2> /dev/null || COMPREPLY=( $( compgen -f ) ) +} + _clang() { - local cur prev words cword arg flags - _init_completion -n : || return + local cur prev words cword arg flags w1 w2 + # If latest bash-completion is not supported just initialize COMPREPLY and + # initialize variables by setting manualy. + _init_completion -n 2> /dev/null + if [[ "$?" != 0 ]]; then + COMPREPLY=() + cword=$COMP_CWORD + cur="${COMP_WORDS[$cword]}" + fi # bash always separates '=' as a token even if there's no space before/after '='. # On the other hand, '=' is just a regular character for clang options that # contain '='. For example, "-stdlib=" is defined as is, instead of "-stdlib" and "=". # So, we need to partially undo bash tokenization here for integrity. 
- local w1="${COMP_WORDS[$cword - 1]}" - local w2="${COMP_WORDS[$cword - 2]}" + w1="${COMP_WORDS[$cword - 1]}" + if [[ $cword > 1 ]]; then + w2="${COMP_WORDS[$cword - 2]}" + # Clang want to know if -cc1 or -Xclang option is specified or not, because we don't want to show + # cc1 options otherwise. + if [[ "${COMP_WORDS[1]}" == "-cc1" || "$w1" == "-Xclang" ]]; then + arg="#" + fi if [[ "$cur" == -* ]]; then # -foo - arg="$cur" + arg="$arg$cur" elif [[ "$w1" == -* && "$cur" == '=' ]]; then # -foo= - arg="$w1=," + arg="$arg$w1=," + elif [[ "$cur" == -*= ]]; then + # -foo= + arg="$arg$cur," elif [[ "$w1" == -* ]]; then # -foo or -foo bar - arg="$w1,$cur" + arg="$arg$w1,$cur" elif [[ "$w2" == -* && "$w1" == '=' ]]; then # -foo=bar - arg="$w2=,$cur" + arg="$arg$w2=,$cur" + elif [[ ${cur: -1} != '=' && ${cur/=} != $cur ]]; then + # -foo=bar + arg="$arg${cur%=*}=,${cur#*=}" fi - flags=$( "${COMP_WORDS[0]}" --autocomplete="$arg" 2>/dev/null ) + # expand ~ to $HOME + eval local path=${COMP_WORDS[0]} + flags=$( "$path" --autocomplete="$arg" 2>/dev/null ) # If clang is old that it does not support --autocomplete, # fall back to the filename completion. if [[ "$?" != 0 ]]; then - _filedir + _clang_filedir return fi if [[ "$cur" == '=' ]]; then COMPREPLY=( $( compgen -W "$flags" -- "") ) elif [[ "$flags" == "" || "$arg" == "" ]]; then - _filedir + _clang_filedir else # Bash automatically appends a space after '=' by default. # Disable it so that it works nicely for options in the form of -foo=bar. - [[ "${flags: -1}" == '=' ]] && compopt -o nospace + [[ "${flags: -1}" == '=' ]] && compopt -o nospace 2> /dev/null COMPREPLY=( $( compgen -W "$flags" -- "$cur" ) ) fi } diff --git a/utils/perf-training/lit.cfg b/utils/perf-training/lit.cfg index edae551bed3f..671d44f83b94 100644 --- a/utils/perf-training/lit.cfg +++ b/utils/perf-training/lit.cfg @@ -3,13 +3,14 @@ from lit import Test import lit.formats import lit.util +import subprocess def getSysrootFlagsOnDarwin(config, lit_config): # On Darwin, support relocatable SDKs by providing Clang with a # default system root path. if 'darwin' in config.target_triple: try: - out = lit.util.capture(['xcrun', '--show-sdk-path']).strip() + out = subprocess.check_output(['xcrun', '--show-sdk-path']).strip() res = 0 except OSError: res = -1 diff --git a/utils/perf-training/order-files.lit.cfg b/utils/perf-training/order-files.lit.cfg index a4fd81232a46..93904ec84a41 100644 --- a/utils/perf-training/order-files.lit.cfg +++ b/utils/perf-training/order-files.lit.cfg @@ -4,13 +4,14 @@ from lit import Test import lit.formats import lit.util import os +import subprocess def getSysrootFlagsOnDarwin(config, lit_config): # On Darwin, support relocatable SDKs by providing Clang with a # default system root path. if 'darwin' in config.target_triple: try: - out = lit.util.capture(['xcrun', '--show-sdk-path']).strip() + out = subprocess.check_output(['xcrun', '--show-sdk-path']).strip() res = 0 except OSError: res = -1 diff --git a/www/analyzer/checker_dev_manual.html b/www/analyzer/checker_dev_manual.html index 93c98913fdc2..cdc2496b99ca 100644 --- a/www/analyzer/checker_dev_manual.html +++ b/www/analyzer/checker_dev_manual.html @@ -279,8 +279,8 @@ void ento::registerSimpleStreamChecker(CheckerManager &mgr) { }
 • A package was selected for the checker and the checker was defined in the
-table of checkers at lib/StaticAnalyzer/Checkers/Checkers.td. Since all
-checkers should first be developed as "alpha", and the SimpleStreamChecker
+table of checkers at include/clang/StaticAnalyzer/Checkers/Checkers.td.
+Since all checkers should first be developed as "alpha", and the SimpleStreamChecker
 performs UNIX API checks, the correct package is "alpha.unix", and the
 following was added to the corresponding UnixAlpha section of Checkers.td:
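For reference, the Checkers.td listing that this paragraph points to sits outside the hunk above. A minimal sketch of such a UnixAlpha entry, in the TableGen syntax Checkers.td used at the time (the HelpText wording below is illustrative, not quoted from the file):

    let ParentPackage = UnixAlpha in {
      def SimpleStreamChecker : Checker<"SimpleStream">,
        HelpText<"Check stream handling functions">, // illustrative wording
        DescFile<"SimpleStreamChecker.cpp">;
    } // end "alpha.unix"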
    diff --git a/www/analyzer/scripts/expandcollapse.js b/www/analyzer/scripts/expandcollapse.js
    index 593a9831c8b0..c3ae2864670e 100644
    --- a/www/analyzer/scripts/expandcollapse.js
    +++ b/www/analyzer/scripts/expandcollapse.js
    @@ -81,7 +81,7 @@ function initExpandCollapse() {
           expander.onclick = function() {
             expandCollapse(this.id);
             // Hack for Opera - onmouseout callback is not invoked when page 
    -        // content changes dinamically and mouse pointer goes out of an element.
    +        // content changes dynamically and mouse pointer goes out of an element.
             this.src = imgPath + 
                        (getCellInfo(this.id).expanded ? "arrows_light.gif"
                                                       : "ellipses_light.gif");
    diff --git a/www/cxx_status.html b/www/cxx_status.html
    index c67bf65322a1..23d95bda67c0 100644
    --- a/www/cxx_status.html
    +++ b/www/cxx_status.html
    @@ -14,6 +14,7 @@
         span:target { background-color: #FFFFBB; outline: #DDDD55 solid thin; }
         th { background-color: #FFDDAA }
         td { vertical-align: middle }
    +    tt { white-space: nowrap }
       
     
     
@@ -25,7 +26,7 @@
 <h1>C++ Support in Clang</h1>
 
-<p>Last updated: $Date: 2017-05-28 19:35:23 +0200 (Sun, 28 May 2017) $</p>
+<p>Last updated: $Date: 2017-07-06 02:29:13 +0200 (Thu, 06 Jul 2017) $</p>
 
 <p>Clang fully implements all published ISO C++ standards (C++98 / C++03,
 [DRAFT TS] Coroutines
 N4663
-<tt>‑fcoroutines‑ts</tt><br><tt>-stdlib=libc++</tt>
+<tt>-fcoroutines-ts</tt><br><tt>-stdlib=libc++</tt>
 SVN
@@ -849,7 +850,7 @@ and library features that are not part of standard C++.
    [DRAFT TS] Modules - N4592 + N4667 -fmodules-ts WIP From 93c91e39b29142dec1d03a30df9f6e757f56c193 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 19 Jul 2017 07:02:10 +0000 Subject: [PATCH 2/4] Vendor import of llvm trunk r308421: https://llvm.org/svn/llvm-project/llvm/trunk@308421 --- RELEASE_TESTERS.TXT | 7 +- docs/AliasAnalysis.rst | 3 +- docs/CodingStandards.rst | 6 +- docs/CommandGuide/lit.rst | 7 + include/llvm/Analysis/DominanceFrontier.h | 55 +- include/llvm/Analysis/DominanceFrontierImpl.h | 36 +- .../llvm/Analysis/IteratedDominanceFrontier.h | 20 +- include/llvm/Analysis/LazyCallGraph.h | 28 +- include/llvm/Analysis/LoopInfo.h | 9 +- include/llvm/Analysis/LoopInfoImpl.h | 17 +- include/llvm/Analysis/PostDominators.h | 6 +- include/llvm/Analysis/ScalarEvolution.h | 48 +- include/llvm/Analysis/TargetTransformInfo.h | 11 + .../llvm/Analysis/TargetTransformInfoImpl.h | 6 + include/llvm/CodeGen/BasicTTIImpl.h | 12 + .../llvm/CodeGen/MachineDominanceFrontier.h | 31 +- include/llvm/CodeGen/MachineDominators.h | 15 +- include/llvm/CodeGen/MachinePostDominators.h | 2 +- .../llvm/DebugInfo/CodeView/CVTypeVisitor.h | 16 +- .../DebugInfo/CodeView/CodeViewRecordIO.h | 2 +- include/llvm/DebugInfo/CodeView/Formatters.h | 10 +- include/llvm/DebugInfo/CodeView/GUID.h | 55 + .../llvm/DebugInfo/CodeView/SymbolRecord.h | 2 +- include/llvm/DebugInfo/CodeView/TypeRecord.h | 13 +- .../DebugInfo/CodeView/TypeServerHandler.h | 38 - .../DebugInfo/CodeView/TypeStreamMerger.h | 14 +- include/llvm/DebugInfo/DWARF/DWARFUnit.h | 28 + include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 44 +- include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h | 2 +- include/llvm/DebugInfo/PDB/GenericError.h | 1 + include/llvm/DebugInfo/PDB/IPDBRawSymbol.h | 2 +- .../llvm/DebugInfo/PDB/Native/Formatters.h | 7 - .../llvm/DebugInfo/PDB/Native/InfoStream.h | 5 +- .../DebugInfo/PDB/Native/InfoStreamBuilder.h | 4 +- .../DebugInfo/PDB/Native/NativeExeSymbol.h | 2 +- .../DebugInfo/PDB/Native/NativeRawSymbol.h | 2 +- .../PDB/Native/PDBTypeServerHandler.h | 46 - include/llvm/DebugInfo/PDB/Native/RawTypes.h | 14 +- .../llvm/DebugInfo/PDB/Native/TpiHashing.h | 73 +- include/llvm/DebugInfo/PDB/PDBExtras.h | 1 - .../Orc/CompileOnDemandLayer.h | 24 +- .../ExecutionEngine/RTDyldMemoryManager.h | 5 +- include/llvm/IR/CallingConv.h | 14 +- include/llvm/IR/Constants.h | 8 + include/llvm/IR/DIBuilder.h | 37 +- include/llvm/IR/DebugInfoMetadata.h | 28 +- include/llvm/IR/Dominators.h | 43 +- include/llvm/IR/IntrinsicsHexagon.td | 26 +- include/llvm/IR/IntrinsicsSystemZ.td | 43 + include/llvm/MC/LaneBitmask.h | 3 + include/llvm/MC/MCFixup.h | 2 +- include/llvm/MC/MCInstrDesc.h | 9 + include/llvm/Object/COFFImportFile.h | 2 +- include/llvm/Object/COFFModuleDefinition.h | 8 +- include/llvm/ObjectYAML/CodeViewYAMLTypes.h | 2 + include/llvm/Support/AArch64TargetParser.def | 5 +- include/llvm/Support/BinaryItemStream.h | 39 +- include/llvm/Support/Format.h | 25 +- include/llvm/Support/GenericDomTree.h | 160 +- .../llvm/Support/GenericDomTreeConstruction.h | 688 +++- include/llvm/Support/TargetParser.h | 4 +- include/llvm/Support/YAMLTraits.h | 4 + .../Target/GlobalISel/SelectionDAGCompat.td | 1 + include/llvm/Target/TargetLowering.h | 29 + .../ToolDrivers/llvm-dlltool/DlltoolDriver.h | 24 + lib/Analysis/CGSCCPassManager.cpp | 9 +- lib/Analysis/DominanceFrontier.cpp | 3 +- lib/Analysis/InstCount.cpp | 8 - lib/Analysis/InstructionSimplify.cpp | 21 +- lib/Analysis/IteratedDominanceFrontier.cpp | 8 +- lib/Analysis/LazyCallGraph.cpp | 49 +- 
lib/Analysis/LoopInfo.cpp | 2 +- lib/Analysis/MemorySSA.cpp | 1 - lib/Analysis/PostDominators.cpp | 2 + lib/Analysis/ScalarEvolution.cpp | 385 +- lib/Analysis/TargetTransformInfo.cpp | 5 + lib/AsmParser/LLLexer.cpp | 2 +- lib/AsmParser/LLParser.cpp | 10 +- lib/AsmParser/LLToken.h | 2 +- lib/Bitcode/Reader/MetadataLoader.cpp | 8 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 1 + lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 5 +- lib/CodeGen/CodeGenPrepare.cpp | 79 +- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 3 + lib/CodeGen/MachineCombiner.cpp | 4 +- lib/CodeGen/MachineDominanceFrontier.cpp | 3 +- lib/CodeGen/MachineDominators.cpp | 6 +- lib/CodeGen/MachinePostDominators.cpp | 7 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 161 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 21 +- lib/CodeGen/XRayInstrumentation.cpp | 6 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 77 +- lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 9 +- lib/DebugInfo/CodeView/Formatters.cpp | 7 + lib/DebugInfo/CodeView/SymbolDumper.cpp | 2 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 2 +- lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 173 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 195 +- lib/DebugInfo/PDB/CMakeLists.txt | 1 - lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 12 +- lib/DebugInfo/PDB/GenericError.cpp | 2 + lib/DebugInfo/PDB/Native/InfoStream.cpp | 2 +- .../PDB/Native/InfoStreamBuilder.cpp | 2 +- lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 4 +- lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 4 +- .../PDB/Native/PDBTypeServerHandler.cpp | 126 - lib/DebugInfo/PDB/Native/TpiHashing.cpp | 124 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 1 - lib/DebugInfo/PDB/PDBExtras.cpp | 6 - .../RuntimeDyld/Targets/RuntimeDyldMachOARM.h | 1 - lib/Fuzzer/CMakeLists.txt | 1 - lib/Fuzzer/FuzzerCorpus.h | 40 +- lib/Fuzzer/FuzzerDriver.cpp | 20 +- lib/Fuzzer/FuzzerFlags.def | 3 + lib/Fuzzer/FuzzerInternal.h | 14 +- lib/Fuzzer/FuzzerLoop.cpp | 30 +- lib/Fuzzer/FuzzerMerge.cpp | 8 +- lib/Fuzzer/FuzzerMutate.cpp | 23 +- lib/Fuzzer/FuzzerMutate.h | 6 - lib/Fuzzer/FuzzerTracePC.cpp | 83 + lib/Fuzzer/FuzzerTracePC.h | 23 + lib/Fuzzer/FuzzerTraceState.cpp | 181 - lib/Fuzzer/FuzzerUtil.cpp | 7 + lib/Fuzzer/FuzzerUtil.h | 10 + lib/Fuzzer/afl/afl_driver.cpp | 4 +- lib/Fuzzer/test/CMakeLists.txt | 3 +- lib/Fuzzer/test/FlagsTest.cpp | 32 + lib/Fuzzer/test/FuzzerUnittest.cpp | 31 +- lib/Fuzzer/test/fuzzer-flags.test | 17 +- lib/Fuzzer/test/fuzzer-traces-hooks.test | 2 +- lib/Fuzzer/test/reduce_inputs.test | 3 +- lib/IR/AsmWriter.cpp | 3 +- lib/IR/Constants.cpp | 86 +- lib/IR/Core.cpp | 1 - lib/IR/DIBuilder.cpp | 26 +- lib/IR/DebugInfoMetadata.cpp | 9 +- lib/IR/Dominators.cpp | 40 +- lib/IR/LLVMContextImpl.h | 16 +- lib/IR/LegacyPassManager.cpp | 51 +- lib/IR/Module.cpp | 2 +- lib/Object/ArchiveWriter.cpp | 3 +- lib/Object/COFFImportFile.cpp | 144 +- lib/Object/COFFModuleDefinition.cpp | 36 +- lib/Object/COFFObjectFile.cpp | 5 +- lib/ObjectYAML/CodeViewYAMLTypes.cpp | 27 + lib/Option/OptTable.cpp | 25 +- lib/Support/ErrorHandling.cpp | 22 + lib/Support/Host.cpp | 43 +- lib/Support/Path.cpp | 2 - lib/Support/TargetParser.cpp | 2 + lib/Support/YAMLTraits.cpp | 8 + lib/Support/raw_ostream.cpp | 29 +- lib/Target/AArch64/AArch64.h | 4 + lib/Target/AArch64/AArch64.td | 4 + .../AArch64/AArch64CallingConvention.td | 7 + .../AArch64DeadRegisterDefinitionsPass.cpp | 4 + lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 790 ++++ lib/Target/AArch64/AArch64FastISel.cpp | 1 + lib/Target/AArch64/AArch64FrameLowering.cpp | 12 +- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 
15 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 64 +- lib/Target/AArch64/AArch64ISelLowering.h | 16 + lib/Target/AArch64/AArch64InstrAtomics.td | 10 + lib/Target/AArch64/AArch64InstrInfo.cpp | 13 +- lib/Target/AArch64/AArch64InstrInfo.h | 12 +- lib/Target/AArch64/AArch64InstrInfo.td | 2 + lib/Target/AArch64/AArch64LegalizerInfo.cpp | 2 +- lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 +- lib/Target/AArch64/AArch64Subtarget.cpp | 7 +- lib/Target/AArch64/AArch64Subtarget.h | 13 + lib/Target/AArch64/AArch64TargetMachine.cpp | 23 +- lib/Target/AArch64/AArch64TargetMachine.h | 2 + .../AArch64/AsmParser/AArch64AsmParser.cpp | 21 +- lib/Target/AArch64/CMakeLists.txt | 1 + .../MCTargetDesc/AArch64AsmBackend.cpp | 6 +- .../AArch64WinCOFFObjectWriter.cpp | 61 +- lib/Target/AMDGPU/AMDGPU.h | 2 +- .../AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 278 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 9 - lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 4 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +- .../Disassembler/AMDGPUDisassembler.cpp | 5 + .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + lib/Target/AMDGPU/SIFoldOperands.cpp | 1 + lib/Target/AMDGPU/SIFrameLowering.cpp | 11 +- lib/Target/AMDGPU/SIISelLowering.cpp | 119 +- lib/Target/AMDGPU/SIISelLowering.h | 2 + lib/Target/AMDGPU/SIInstrInfo.cpp | 24 +- lib/Target/AMDGPU/SIInstrInfo.h | 19 +- lib/Target/AMDGPU/SIInstrInfo.td | 2 +- lib/Target/AMDGPU/SIInstructions.td | 2 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 68 +- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +- lib/Target/AMDGPU/SIRegisterInfo.cpp | 4 + lib/Target/AMDGPU/SIRegisterInfo.td | 17 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 +- lib/Target/AMDGPU/VOP2Instructions.td | 11 +- lib/Target/AMDGPU/VOP3Instructions.td | 11 +- lib/Target/AMDGPU/VOP3PInstructions.td | 10 +- lib/Target/AMDGPU/VOPCInstructions.td | 24 +- lib/Target/AMDGPU/VOPInstructions.td | 2 + lib/Target/ARM/ARM.td | 429 ++- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 4 +- lib/Target/ARM/ARMFastISel.cpp | 43 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 20 +- lib/Target/ARM/ARMInstructionSelector.cpp | 23 + lib/Target/ARM/ARMLegalizerInfo.cpp | 8 +- lib/Target/ARM/ARMMCInstLower.cpp | 4 +- lib/Target/ARM/ARMRegisterBankInfo.cpp | 9 + lib/Target/ARM/ARMTargetMachine.h | 2 + lib/Target/BPF/BPFISelLowering.cpp | 45 +- lib/Target/BPF/BPFInstrInfo.td | 5 + lib/Target/Hexagon/HexagonBitSimplify.cpp | 4 +- lib/Target/Hexagon/HexagonDepInstrInfo.td | 4 + lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 4 +- lib/Target/Hexagon/HexagonExpandCondsets.cpp | 4 +- lib/Target/Hexagon/HexagonFrameLowering.cpp | 4 +- lib/Target/Hexagon/HexagonGenInsert.cpp | 4 +- lib/Target/Hexagon/HexagonGenPredicate.cpp | 4 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 147 +- lib/Target/Hexagon/HexagonISelLowering.h | 4 +- lib/Target/Hexagon/HexagonIntrinsics.td | 12 + lib/Target/Hexagon/HexagonOptAddrMode.cpp | 4 +- lib/Target/Hexagon/HexagonPatterns.td | 86 +- .../Hexagon/HexagonTargetObjectFile.cpp | 47 + lib/Target/Hexagon/HexagonTargetObjectFile.h | 6 + lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 219 -- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 15 - lib/Target/Mips/Mips.td | 3 + lib/Target/Mips/MipsISelLowering.cpp | 146 +- lib/Target/Mips/MipsInstrFPU.td | 9 + lib/Target/Mips/MipsMTInstrFormats.td | 21 - lib/Target/Mips/MipsMTInstrInfo.td | 110 - lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 99 +- 
lib/Target/Mips/MipsSEISelDAGToDAG.h | 3 +- lib/Target/Mips/MipsSEISelLowering.cpp | 203 +- lib/Target/Mips/MipsSchedule.td | 4 - lib/Target/Mips/MipsSubtarget.h | 5 + lib/Target/Mips/MipsTargetStreamer.h | 3 - .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 3 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 28 +- lib/Target/PowerPC/PPCISelLowering.cpp | 18 +- lib/Target/PowerPC/PPCISelLowering.h | 2 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 10 +- lib/Target/PowerPC/PPCInstrInfo.td | 22 +- lib/Target/PowerPC/PPCInstrVSX.td | 88 +- lib/Target/PowerPC/PPCRegisterInfo.cpp | 30 +- lib/Target/PowerPC/PPCTargetMachine.h | 2 + lib/Target/Sparc/Sparc.td | 6 +- lib/Target/Sparc/SparcISelLowering.cpp | 13 + lib/Target/Sparc/SparcInstrInfo.td | 3 + lib/Target/Sparc/SparcSubtarget.cpp | 1 + lib/Target/Sparc/SparcSubtarget.h | 2 + .../SystemZ/AsmParser/SystemZAsmParser.cpp | 15 +- lib/Target/SystemZ/LLVMBuild.txt | 2 +- lib/Target/SystemZ/SystemZFeatures.td | 58 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 104 +- lib/Target/SystemZ/SystemZISelLowering.h | 7 + lib/Target/SystemZ/SystemZInstrFP.td | 88 +- lib/Target/SystemZ/SystemZInstrFormats.td | 330 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 32 + lib/Target/SystemZ/SystemZInstrInfo.td | 68 +- lib/Target/SystemZ/SystemZInstrSystem.td | 4 + lib/Target/SystemZ/SystemZInstrVector.td | 366 +- lib/Target/SystemZ/SystemZOperators.td | 20 + lib/Target/SystemZ/SystemZPatterns.td | 7 + lib/Target/SystemZ/SystemZProcessors.td | 3 + lib/Target/SystemZ/SystemZRegisterInfo.td | 14 +- lib/Target/SystemZ/SystemZSchedule.td | 3 +- lib/Target/SystemZ/SystemZScheduleZ14.td | 1611 ++++++++ lib/Target/SystemZ/SystemZScheduleZ196.td | 163 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 161 +- lib/Target/SystemZ/SystemZShortenInst.cpp | 40 + lib/Target/SystemZ/SystemZSubtarget.cpp | 4 + lib/Target/SystemZ/SystemZSubtarget.h | 34 + lib/Target/SystemZ/SystemZTargetMachine.cpp | 4 +- .../SystemZ/SystemZTargetTransformInfo.cpp | 3 + .../SystemZ/SystemZTargetTransformInfo.h | 4 + lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.h | 3 + lib/Target/X86/X86.td | 6 +- lib/Target/X86/X86CallingConv.td | 4 +- lib/Target/X86/X86CmovConversion.cpp | 611 ++++ lib/Target/X86/X86FastISel.cpp | 4 +- lib/Target/X86/X86FixupBWInsts.cpp | 2 +- lib/Target/X86/X86ISelLowering.cpp | 66 +- lib/Target/X86/X86InstrAVX512.td | 231 +- lib/Target/X86/X86RegisterInfo.cpp | 6 +- lib/Target/X86/X86Schedule.td | 1 + lib/Target/X86/X86ScheduleBtVer2.td | 16 + lib/Target/X86/X86ScheduleZnver1.td | 223 ++ lib/Target/X86/X86Subtarget.h | 2 +- lib/Target/X86/X86TargetMachine.cpp | 1 + lib/Target/X86/X86TargetMachine.h | 2 + lib/ToolDrivers/CMakeLists.txt | 1 + lib/ToolDrivers/LLVMBuild.txt | 2 +- lib/ToolDrivers/llvm-dlltool/CMakeLists.txt | 9 + .../llvm-dlltool/DlltoolDriver.cpp | 160 + lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt | 22 + lib/ToolDrivers/llvm-dlltool/Options.td | 26 + lib/Transforms/IPO/GlobalOpt.cpp | 18 + lib/Transforms/IPO/Inliner.cpp | 2 +- lib/Transforms/IPO/SampleProfile.cpp | 11 +- .../InstCombine/InstCombineAndOrXor.cpp | 56 +- .../InstCombine/InstCombineCompares.cpp | 36 +- .../InstCombineLoadStoreAlloca.cpp | 5 +- .../InstCombineSimplifyDemanded.cpp | 6 +- .../InstCombine/InstructionCombining.cpp | 76 +- .../Instrumentation/AddressSanitizer.cpp | 38 + .../Instrumentation/MemorySanitizer.cpp | 4 +- .../Instrumentation/SanitizerCoverage.cpp | 10 + lib/Transforms/Scalar/EarlyCSE.cpp | 16 +- lib/Transforms/Scalar/GVN.cpp | 1 + .../Scalar/InductiveRangeCheckElimination.cpp | 72 +- 
lib/Transforms/Scalar/JumpThreading.cpp | 82 +- lib/Transforms/Scalar/LoopInterchange.cpp | 119 +- .../Scalar/TailRecursionElimination.cpp | 15 +- lib/Transforms/Utils/LoopUnrollRuntime.cpp | 30 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 64 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 30 +- runtimes/CMakeLists.txt | 9 +- test/Analysis/CostModel/SystemZ/fp-arith.ll | 53 +- test/Assembler/diimportedentity.ll | 4 +- test/Bitcode/DIGlobalVariableExpression.ll | 2 +- test/Bitcode/compatibility-3.6.ll | 6 +- test/Bitcode/compatibility-3.7.ll | 6 +- test/Bitcode/compatibility-3.8.ll | 6 +- test/Bitcode/compatibility-3.9.ll | 6 +- test/Bitcode/compatibility-4.0.ll | 6 +- test/Bitcode/compatibility.ll | 6 +- test/Bitcode/upgrade-importedentity.ll | 15 + test/Bitcode/upgrade-importedentity.ll.bc | Bin 0 -> 1216 bytes test/CMakeLists.txt | 3 + .../CodeGen/AArch64/GlobalISel/select-fma.mir | 41 + .../CodeGen/AArch64/aarch64_win64cc_vararg.ll | 74 + test/CodeGen/AArch64/arm64-abi-varargs.ll | 3 +- test/CodeGen/AArch64/arm64-abi_align.ll | 32 +- .../arm64-alloca-frame-pointer-offset.ll | 6 +- test/CodeGen/AArch64/arm64-extern-weak.ll | 2 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 10 + test/CodeGen/AArch64/arm64-platform-reg.ll | 1 + test/CodeGen/AArch64/arm64-vext.ll | 8 +- test/CodeGen/AArch64/atomic-ops-lse.ll | 161 + .../CodeGen/AArch64/dag-combine-invaraints.ll | 2 +- test/CodeGen/AArch64/extern-weak.ll | 2 +- test/CodeGen/AArch64/falkor-hwpf-fix.ll | 67 + test/CodeGen/AArch64/falkor-hwpf-fix.mir | 52 + test/CodeGen/AArch64/falkor-hwpf.ll | 106 + .../AArch64/preferred-function-alignment.ll | 2 +- test/CodeGen/AArch64/swifterror.ll | 12 +- test/CodeGen/AArch64/win64_vararg.ll | 95 + .../annotate-kernel-features-hsa-call.ll | 312 ++ .../AMDGPU/annotate-kernel-features-hsa.ll | 11 + .../attr-amdgpu-flat-work-group-size.ll | 2 +- .../AMDGPU/attr-amdgpu-waves-per-eu.ll | 14 +- .../AMDGPU/fcanonicalize-elimination.ll | 62 +- test/CodeGen/AMDGPU/function-args.ll | 16 + test/CodeGen/AMDGPU/hsa.ll | 10 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 12 + test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 4 +- test/CodeGen/AMDGPU/move-to-valu-worklist.ll | 29 + test/CodeGen/AMDGPU/mubuf-offset-private.ll | 26 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/parallelorifcollapse.ll | 2 +- .../AMDGPU/private-access-no-objects.ll | 10 +- ...ename-independent-subregs-mac-operands.mir | 2 +- test/CodeGen/AMDGPU/scratch-simple.ll | 72 +- test/CodeGen/AMDGPU/sdwa-peephole-instr.mir | 35 +- test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir | 4 +- test/CodeGen/AMDGPU/trap.ll | 8 +- .../AMDGPU/vccz-corrupt-bug-workaround.mir | 2 +- ...vgpr-spill-emergency-stack-slot-compute.ll | 36 +- .../AMDGPU/vgpr-spill-emergency-stack-slot.ll | 6 +- .../ARM/GlobalISel/arm-instruction-select.mir | 39 + .../CodeGen/ARM/GlobalISel/arm-isel-divmod.ll | 52 + test/CodeGen/ARM/GlobalISel/arm-isel.ll | 39 + .../ARM/GlobalISel/arm-legalize-divmod.mir | 174 + test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 36 + .../ARM/GlobalISel/arm-regbankselect.mir | 30 + test/CodeGen/ARM/atomic-op.ll | 15 + test/CodeGen/AVR/branch-relaxation.ll | 4 +- test/CodeGen/BPF/select_ri.ll | 27 + test/CodeGen/BPF/setcc.ll | 4 +- .../Generic/2003-07-29-BadConstSbyte.ll | 1 - .../Generic/2011-07-07-ScheduleDAGCrash.ll | 3 - test/CodeGen/Generic/print-mul-exp.ll | 1 + test/CodeGen/Generic/print-mul.ll | 1 + test/CodeGen/Generic/print-shift.ll | 1 + test/CodeGen/Generic/v-split.ll | 3 - 
test/CodeGen/Generic/vector-redux.ll | 3 - test/CodeGen/Generic/vector.ll | 3 - .../CodeGen/Hexagon/intrinsics/system_user.ll | 76 +- .../Hexagon/switch-lut-explicit-section.ll | 32 + .../Hexagon/switch-lut-function-section.ll | 30 + .../Hexagon/switch-lut-multiple-functions.ll | 42 + .../Hexagon/switch-lut-text-section.ll | 27 + test/CodeGen/Hexagon/v6vec-vprint.ll | 2 +- test/CodeGen/Hexagon/vect/vect-load-v4i16.ll | 23 + .../vect/{vect-loadv4i16.ll => vect-v4i16.ll} | 0 .../MIR/AArch64/target-memoperands.mir | 4 + test/CodeGen/MIR/AMDGPU/fold-multiple.mir | 40 + test/CodeGen/MSP430/vararg.ll | 4 +- test/CodeGen/Mips/2008-06-05-Carry.ll | 13 +- test/CodeGen/Mips/dins.ll | 4 +- test/CodeGen/Mips/dsp-patterns.ll | 4 +- test/CodeGen/Mips/llcarry.ll | 11 +- test/CodeGen/Mips/llvm-ir/add.ll | 380 +- test/CodeGen/Mips/llvm-ir/sub.ll | 170 +- test/CodeGen/Mips/long-calls.ll | 57 + test/CodeGen/Mips/madd-msub.ll | 81 +- test/CodeGen/Mips/msa/f16-llvm-ir.ll | 12 +- test/CodeGen/PowerPC/PR33671.ll | 32 + test/CodeGen/PowerPC/build-vector-tests.ll | 40 +- test/CodeGen/PowerPC/ppc64-i128-abi.ll | 6 +- test/CodeGen/PowerPC/swaps-le-6.ll | 8 +- test/CodeGen/PowerPC/vsx-p9.ll | 48 +- test/CodeGen/SPARC/soft-mul-div.ll | 65 + test/CodeGen/SystemZ/branch-11.ll | 56 + test/CodeGen/SystemZ/fp-abs-03.ll | 43 + test/CodeGen/SystemZ/fp-abs-04.ll | 46 + test/CodeGen/SystemZ/fp-add-01.ll | 6 +- test/CodeGen/SystemZ/fp-add-04.ll | 17 + test/CodeGen/SystemZ/fp-cmp-01.ll | 102 +- test/CodeGen/SystemZ/fp-cmp-06.ll | 33 + test/CodeGen/SystemZ/fp-const-11.ll | 40 + test/CodeGen/SystemZ/fp-conv-15.ll | 50 + test/CodeGen/SystemZ/fp-conv-16.ll | 99 + test/CodeGen/SystemZ/fp-copysign-02.ll | 81 + test/CodeGen/SystemZ/fp-div-01.ll | 6 +- test/CodeGen/SystemZ/fp-div-04.ll | 17 + test/CodeGen/SystemZ/fp-move-13.ll | 46 + test/CodeGen/SystemZ/fp-mul-01.ll | 6 +- test/CodeGen/SystemZ/fp-mul-06.ll | 31 +- test/CodeGen/SystemZ/fp-mul-08.ll | 31 +- test/CodeGen/SystemZ/fp-mul-10.ll | 43 + test/CodeGen/SystemZ/fp-mul-11.ll | 32 + test/CodeGen/SystemZ/fp-mul-12.ll | 72 + test/CodeGen/SystemZ/fp-neg-02.ll | 41 + test/CodeGen/SystemZ/fp-round-03.ll | 207 ++ test/CodeGen/SystemZ/fp-sqrt-01.ll | 8 +- test/CodeGen/SystemZ/fp-sqrt-04.ll | 17 + test/CodeGen/SystemZ/fp-sub-01.ll | 6 +- test/CodeGen/SystemZ/fp-sub-04.ll | 17 + test/CodeGen/SystemZ/int-add-17.ll | 95 + test/CodeGen/SystemZ/int-mul-09.ll | 95 + test/CodeGen/SystemZ/int-mul-10.ll | 165 + test/CodeGen/SystemZ/int-mul-11.ll | 32 + test/CodeGen/SystemZ/int-sub-10.ll | 95 + test/CodeGen/SystemZ/tdc-07.ll | 18 + test/CodeGen/SystemZ/vec-abs-06.ll | 47 + test/CodeGen/SystemZ/vec-add-02.ll | 24 + test/CodeGen/SystemZ/vec-and-04.ll | 47 + test/CodeGen/SystemZ/vec-cmp-07.ll | 349 ++ test/CodeGen/SystemZ/vec-ctpop-02.ll | 45 + test/CodeGen/SystemZ/vec-div-02.ll | 24 + ...vec-intrinsics.ll => vec-intrinsics-01.ll} | 0 test/CodeGen/SystemZ/vec-intrinsics-02.ll | 441 +++ test/CodeGen/SystemZ/vec-max-05.ll | 175 + test/CodeGen/SystemZ/vec-min-05.ll | 175 + test/CodeGen/SystemZ/vec-move-18.ll | 24 + test/CodeGen/SystemZ/vec-mul-03.ll | 24 + test/CodeGen/SystemZ/vec-mul-04.ll | 31 + test/CodeGen/SystemZ/vec-mul-05.ll | 63 + test/CodeGen/SystemZ/vec-neg-02.ll | 23 + test/CodeGen/SystemZ/vec-or-03.ll | 91 + test/CodeGen/SystemZ/vec-round-02.ll | 118 + test/CodeGen/SystemZ/vec-sqrt-02.ll | 23 + test/CodeGen/SystemZ/vec-sub-02.ll | 31 + test/CodeGen/SystemZ/vec-xor-02.ll | 47 + test/CodeGen/Thumb/litpoolremat.ll | 28 + test/CodeGen/Thumb/select.ll | 4 +- 
test/CodeGen/WebAssembly/indirect-import.ll | 9 +- test/CodeGen/WebAssembly/userstack.ll | 10 +- test/CodeGen/X86/2008-01-08-SchedulerCrash.ll | 2 +- .../X86/2009-06-03-Win64DisableRedZone.ll | 2 +- test/CodeGen/X86/2011-10-19-widen_vselect.ll | 7 +- .../X86/DynamicCalleeSavedRegisters.ll | 2 +- test/CodeGen/X86/alias-static-alloca.ll | 37 + test/CodeGen/X86/atomic-minmax-i6432.ll | 16 +- test/CodeGen/X86/atomic128.ll | 64 +- test/CodeGen/X86/avx-schedule.ll | 508 +-- test/CodeGen/X86/avx2-arith.ll | 8 +- test/CodeGen/X86/avx2-schedule.ll | 116 +- test/CodeGen/X86/avx2-vector-shifts.ll | 4 +- test/CodeGen/X86/avx512-cvt.ll | 2 +- test/CodeGen/X86/avx512-mask-op.ll | 5 +- test/CodeGen/X86/avx512-rotate.ll | 256 ++ test/CodeGen/X86/avx512-shift.ll | 148 +- test/CodeGen/X86/bmi-schedule.ll | 529 +++ test/CodeGen/X86/bmi2-schedule.ll | 180 + test/CodeGen/X86/bool-ext-inc.ll | 8 +- test/CodeGen/X86/bswap-rotate.ll | 27 + test/CodeGen/X86/clobber-fi0.ll | 14 +- test/CodeGen/X86/combine-rotates.ll | 27 +- test/CodeGen/X86/combine-shl.ll | 12 +- test/CodeGen/X86/combine-srl.ll | 8 +- test/CodeGen/X86/combine-udiv.ll | 2 +- test/CodeGen/X86/combine-urem.ll | 8 +- test/CodeGen/X86/f16c-schedule.ll | 144 + test/CodeGen/X86/fast-isel-x86-64.ll | 2 +- test/CodeGen/X86/hipe-cc.ll | 6 +- test/CodeGen/X86/hipe-cc64.ll | 6 +- test/CodeGen/X86/lea32-schedule.ll | 653 ++++ test/CodeGen/X86/lea64-schedule.ll | 534 +++ test/CodeGen/X86/legalize-shift-64.ll | 8 +- test/CodeGen/X86/lzcnt-schedule.ll | 119 + .../CodeGen/X86/machine-outliner-debuginfo.ll | 1 + test/CodeGen/X86/machine-outliner.ll | 1 + test/CodeGen/X86/memcmp-minsize.ll | 721 ++++ test/CodeGen/X86/memcmp-optsize.ll | 871 +++++ test/CodeGen/X86/memcmp.ll | 827 +++-- test/CodeGen/X86/pmul.ll | 6 +- test/CodeGen/X86/popcnt-schedule.ll | 167 + test/CodeGen/X86/pr32282.ll | 104 + test/CodeGen/X86/pr32515.ll | 29 + test/CodeGen/X86/pr33772.ll | 15 + test/CodeGen/X86/pr33828.ll | 48 + test/CodeGen/X86/regparm.ll | 2 +- test/CodeGen/X86/rotate_vec.ll | 54 + test/CodeGen/X86/sibcall-win64.ll | 22 +- test/CodeGen/X86/sse-schedule.ll | 327 +- test/CodeGen/X86/sse2-schedule.ll | 824 ++++- test/CodeGen/X86/sse3-schedule.ll | 64 +- test/CodeGen/X86/sse41-schedule.ll | 311 +- test/CodeGen/X86/sse42-schedule.ll | 81 +- test/CodeGen/X86/sse4a-schedule.ll | 40 +- test/CodeGen/X86/ssse3-schedule.ll | 98 +- test/CodeGen/X86/statepoint-invoke.ll | 2 +- test/CodeGen/X86/statepoint-stack-usage.ll | 42 +- test/CodeGen/X86/statepoint-vector.ll | 4 +- test/CodeGen/X86/vec_cmp_uint-128.ll | 8 +- test/CodeGen/X86/vector-idiv-sdiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-sdiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-128.ll | 6 +- test/CodeGen/X86/vector-idiv-udiv-256.ll | 6 +- test/CodeGen/X86/vector-idiv.ll | 2 +- test/CodeGen/X86/vector-rotate-128.ll | 201 +- test/CodeGen/X86/vector-rotate-256.ll | 256 +- test/CodeGen/X86/vector-rotate-512.ll | 831 +++++ test/CodeGen/X86/vector-shift-ashr-256.ll | 10 +- test/CodeGen/X86/vector-tzcnt-128.ll | 4 +- test/CodeGen/X86/vector-tzcnt-256.ll | 8 +- test/CodeGen/X86/vector-tzcnt-512.ll | 8 +- test/CodeGen/X86/vselect-avx.ll | 8 +- test/CodeGen/X86/widen_arith-2.ll | 15 +- test/CodeGen/X86/widen_cast-4.ll | 34 +- test/CodeGen/X86/win64-nosse-csrs.ll | 2 +- test/CodeGen/X86/win64_nonvol.ll | 2 +- test/CodeGen/X86/win64_params.ll | 2 +- test/CodeGen/X86/win_chkstk.ll | 2 +- test/CodeGen/X86/win_coreclr_chkstk.ll | 4 +- test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 14 +- test/CodeGen/X86/x86-cmov-converter.ll | 321 ++ 
test/CodeGen/XCore/varargs.ll | 8 +- test/DebugInfo/Generic/namespace.ll | 41 +- test/DebugInfo/PDB/pdbdump-headers.test | 184 +- test/DebugInfo/X86/DIModule.ll | 2 +- test/DebugInfo/X86/DIModuleContext.ll | 2 +- test/DebugInfo/X86/fission-inline.ll | 2 +- test/DebugInfo/X86/gnu-public-names.ll | 4 +- .../X86/lexical-block-file-inline.ll | 2 +- test/DebugInfo/X86/pr19307.ll | 12 +- test/DllTool/coff-exports.def | 13 + test/DllTool/coff-weak-exports.def | 19 + test/DllTool/lit.local.cfg | 1 + test/FileCheck/regex-scope.txt | 2 +- .../Instrumentation/AddressSanitizer/basic.ll | 20 + .../stack-poisoning-byval-args.ll | 48 + .../unordered_atomic_mem_intrins.ll | 37 + .../EfficiencySanitizer/working_set_basic.ll | 33 + .../EfficiencySanitizer/working_set_slow.ll | 32 + .../MemorySanitizer/msan_basic.ll | 35 + .../cmp-tracing-api-x86_32.ll | 22 + .../cmp-tracing-api-x86_64.ll | 22 + test/Linker/pr26037.ll | 4 +- test/MC/AArch64/coff-relocations.s | 52 + .../AArch64/invalid-instructions-spellcheck.s | 37 + test/MC/AMDGPU/gfx9_asm_all.s | 351 ++ test/MC/AMDGPU/vop3-errs.s | 38 +- test/MC/ARM/virtexts-thumb.s | 2 +- test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt | 405 ++ test/MC/Disassembler/Mips/mt/valid-r2-el.txt | 21 +- test/MC/Disassembler/Mips/mt/valid-r2.txt | 21 +- test/MC/Disassembler/SystemZ/insns-z14.txt | 3253 +++++++++++++++++ test/MC/Mips/mt/invalid-wrong-error.s | 3 - test/MC/Mips/mt/invalid.s | 38 +- .../mftr-mttr-aliases-invalid-wrong-error.s | 18 - test/MC/Mips/mt/mftr-mttr-aliases-invalid.s | 23 - test/MC/Mips/mt/mftr-mttr-aliases.s | 47 - test/MC/Mips/mt/mftr-mttr-reserved-valid.s | 8 - test/MC/Mips/mt/valid.s | 42 +- test/MC/SystemZ/insn-bad-z13.s | 705 ++++ test/MC/SystemZ/insn-bad-z14.s | 752 ++++ test/MC/SystemZ/insn-good-z14.s | 2674 ++++++++++++++ .../SystemZ/invalid-instructions-spellcheck.s | 66 + test/MC/X86/pr22028.s | 4 +- test/Object/no-section-table.test | 2 +- test/Object/readobj-shared-object.test | 12 +- test/ObjectYAML/CodeView/guid.yaml | 59 + test/Other/cgscc-libcall-update.ll | 61 + test/Other/new-pass-manager.ll | 2 + test/ThinLTO/X86/debuginfo-cu-import.ll | 4 +- test/Transforms/CodeGenPrepare/X86/memcmp.ll | 1635 ++++++++- .../CodeGenPrepare/X86/sink-addrmode.ll | 35 +- .../EarlyCSE/globalsaa-memoryssa.ll | 25 + .../GVN/PRE/2017-06-28-pre-load-dbgloc.ll | 79 + test/Transforms/GVN/PRE/phi-translate.ll | 4 +- test/Transforms/GlobalOpt/pr33686.ll | 17 + test/Transforms/IRCE/eq_ne.ll | 257 ++ test/Transforms/IRCE/pre_post_loops.ll | 117 + test/Transforms/Inline/AArch64/ext.ll | 249 ++ test/Transforms/Inline/PowerPC/ext.ll | 140 + test/Transforms/Inline/PowerPC/lit.local.cfg | 3 + test/Transforms/Inline/X86/ext.ll | 201 + .../InstCombine/2017-07-07-UMul-ZExt.ll | 24 +- test/Transforms/InstCombine/and-not-or.ll | 34 - test/Transforms/InstCombine/and.ll | 192 + test/Transforms/InstCombine/and2.ll | 85 +- .../InstCombine/element-atomic-memintrins.ll | 98 + test/Transforms/InstCombine/icmp-logical.ll | 165 +- test/Transforms/InstCombine/or-xor.ll | 28 +- test/Transforms/InstCombine/or.ll | 291 +- test/Transforms/InstCombine/pr33765.ll | 32 + test/Transforms/JumpThreading/select.ll | 77 +- .../current-limitations-lcssa.ll | 76 + .../interchange-flow-dep-outer.ll | 118 + .../interchange-not-profitable.ll | 66 + .../interchange-output-dependencies.ll | 86 + .../interchange-simple-count-down.ll | 69 + .../interchange-simple-count-up.ll | 86 + .../Transforms/LoopInterchange/interchange.ll | 749 ---- .../loop-interchange-optimization-remarks.ll | 220 ++ 
.../not-interchanged-dependencies-1.ll | 64 + .../not-interchanged-loop-nest-3.ll | 87 + .../not-interchanged-tightly-nested.ll | 143 + .../runtime-loop-multiexit-dom-verify.ll | 126 + .../LoopVectorize/X86/float-induction-x86.ll | 6 +- test/Transforms/LoopVectorize/debugloc.ll | 2 +- .../LoopVectorize/first-order-recurrence.ll | 8 +- .../LoopVectorize/float-induction.ll | 14 +- .../LoopVectorize/if-conversion-nest.ll | 25 +- .../LoopVectorize/induction-step.ll | 4 +- test/Transforms/LoopVectorize/induction.ll | 4 +- .../interleaved-accesses-pred-stores.ll | 6 +- .../LoopVectorize/interleaved-accesses.ll | 10 +- .../LoopVectorize/iv_outside_user.ll | 2 +- test/Transforms/LoopVectorize/miniters.ll | 4 +- .../pr30654-phiscev-sext-trunc.ll | 240 ++ .../LoopVectorize/runtime-check-readonly.ll | 1 - .../Transforms/LoopVectorize/runtime-check.ll | 2 +- test/tools/llvm-cov/showTabsHTML.cpp | 4 +- .../llvm-dwarfdump/X86/verify_debug_info.s | 193 + .../X86/verify_unit_header_chain.s | 81 + test/tools/llvm-mt/help.test | 7 + .../Inputs/reloc-addend.obj.macho-aarch64 | Bin 0 -> 424 bytes .../AArch64/macho-reloc-addend.test | 6 + .../llvm-readobj/Inputs/dynamic-table-so.x86 | Bin 8280 -> 8256 bytes .../tools/llvm-readobj/Inputs/dynamic-table.c | 4 +- test/tools/llvm-readobj/dynamic.test | 39 +- test/tools/llvm-readobj/gnu-sections.test | 10 +- tools/llvm-ar/CMakeLists.txt | 2 + tools/llvm-ar/llvm-ar.cpp | 6 +- tools/llvm-mt/CMakeLists.txt | 13 + tools/llvm-mt/LLVMBuild.txt | 22 + tools/llvm-mt/Opts.td | 29 + tools/llvm-mt/llvm-mt.cpp | 117 + tools/llvm-objdump/llvm-objdump.cpp | 5 +- tools/llvm-pdbutil/DumpOutputStyle.cpp | 4 +- tools/llvm-pdbutil/MinimalSymbolDumper.cpp | 5 +- tools/llvm-pdbutil/MinimalTypeDumper.cpp | 20 +- tools/llvm-pdbutil/MinimalTypeDumper.h | 4 +- tools/llvm-pdbutil/PdbYaml.cpp | 35 - tools/llvm-pdbutil/PdbYaml.h | 2 +- tools/llvm-pdbutil/llvm-pdbutil.cpp | 4 +- tools/llvm-readobj/CMakeLists.txt | 2 + tools/llvm-readobj/COFFDumper.cpp | 3 +- tools/llvm-readobj/ELFDumper.cpp | 14 +- tools/llvm-readobj/llvm-readobj.cpp | 22 +- tools/opt-viewer/opt-diff.py | 27 +- tools/opt-viewer/opt-stats.py | 14 +- tools/opt-viewer/opt-viewer.py | 14 +- tools/opt-viewer/optpmap.py | 1 + tools/opt-viewer/optrecord.py | 20 +- unittests/Analysis/CGSCCPassManagerTest.cpp | 2 + unittests/Analysis/LazyCallGraphTest.cpp | 47 +- .../CodeView/RandomAccessVisitorTest.cpp | 3 +- unittests/DebugInfo/PDB/CMakeLists.txt | 3 +- .../DebugInfo/PDB/TypeServerHandlerTest.cpp | 183 - unittests/IR/CFGBuilder.cpp | 269 ++ unittests/IR/CFGBuilder.h | 94 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/DominatorTreeTest.cpp | 300 +- unittests/IR/IRBuilderTest.cpp | 11 +- unittests/IR/MetadataTest.cpp | 24 +- unittests/Support/TargetParserTest.cpp | 5 +- unittests/Support/YAMLIOTest.cpp | 16 + unittests/Support/raw_ostream_test.cpp | 5 + utils/TableGen/CodeGenRegisters.cpp | 12 +- utils/lit/lit/LitConfig.py | 4 +- utils/lit/lit/TestRunner.py | 2 + utils/lit/lit/main.py | 12 +- utils/vim/syntax/llvm.vim | 4 +- 706 files changed, 40042 insertions(+), 7075 deletions(-) create mode 100644 include/llvm/DebugInfo/CodeView/GUID.h delete mode 100644 include/llvm/DebugInfo/CodeView/TypeServerHandler.h delete mode 100644 include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h create mode 100644 include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h delete mode 100644 lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp delete mode 100644 lib/Fuzzer/FuzzerTraceState.cpp create mode 100644 lib/Fuzzer/test/FlagsTest.cpp create mode 
100644 lib/Target/AArch64/AArch64FalkorHWPFFix.cpp create mode 100644 lib/Target/SystemZ/SystemZScheduleZ14.td create mode 100644 lib/Target/X86/X86CmovConversion.cpp create mode 100644 lib/Target/X86/X86ScheduleZnver1.td create mode 100644 lib/ToolDrivers/llvm-dlltool/CMakeLists.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp create mode 100644 lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-dlltool/Options.td create mode 100644 test/Bitcode/upgrade-importedentity.ll create mode 100644 test/Bitcode/upgrade-importedentity.ll.bc create mode 100644 test/CodeGen/AArch64/GlobalISel/select-fma.mir create mode 100644 test/CodeGen/AArch64/aarch64_win64cc_vararg.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.ll create mode 100644 test/CodeGen/AArch64/falkor-hwpf-fix.mir create mode 100644 test/CodeGen/AArch64/falkor-hwpf.ll create mode 100644 test/CodeGen/AArch64/win64_vararg.ll create mode 100644 test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll create mode 100644 test/CodeGen/AMDGPU/move-to-valu-worklist.ll create mode 100644 test/CodeGen/BPF/select_ri.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-explicit-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-function-section.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-multiple-functions.ll create mode 100644 test/CodeGen/Hexagon/switch-lut-text-section.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-load-v4i16.ll rename test/CodeGen/Hexagon/vect/{vect-loadv4i16.ll => vect-v4i16.ll} (100%) create mode 100644 test/CodeGen/MIR/AMDGPU/fold-multiple.mir create mode 100644 test/CodeGen/Mips/long-calls.ll create mode 100644 test/CodeGen/PowerPC/PR33671.ll create mode 100644 test/CodeGen/SPARC/soft-mul-div.ll create mode 100644 test/CodeGen/SystemZ/branch-11.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-03.ll create mode 100644 test/CodeGen/SystemZ/fp-abs-04.ll create mode 100644 test/CodeGen/SystemZ/fp-add-04.ll create mode 100644 test/CodeGen/SystemZ/fp-cmp-06.ll create mode 100644 test/CodeGen/SystemZ/fp-const-11.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-15.ll create mode 100644 test/CodeGen/SystemZ/fp-conv-16.ll create mode 100644 test/CodeGen/SystemZ/fp-copysign-02.ll create mode 100644 test/CodeGen/SystemZ/fp-div-04.ll create mode 100644 test/CodeGen/SystemZ/fp-move-13.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-10.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-11.ll create mode 100644 test/CodeGen/SystemZ/fp-mul-12.ll create mode 100644 test/CodeGen/SystemZ/fp-neg-02.ll create mode 100644 test/CodeGen/SystemZ/fp-round-03.ll create mode 100644 test/CodeGen/SystemZ/fp-sqrt-04.ll create mode 100644 test/CodeGen/SystemZ/fp-sub-04.ll create mode 100644 test/CodeGen/SystemZ/int-add-17.ll create mode 100644 test/CodeGen/SystemZ/int-mul-09.ll create mode 100644 test/CodeGen/SystemZ/int-mul-10.ll create mode 100644 test/CodeGen/SystemZ/int-mul-11.ll create mode 100644 test/CodeGen/SystemZ/int-sub-10.ll create mode 100644 test/CodeGen/SystemZ/tdc-07.ll create mode 100644 test/CodeGen/SystemZ/vec-abs-06.ll create mode 100644 test/CodeGen/SystemZ/vec-add-02.ll create mode 100644 test/CodeGen/SystemZ/vec-and-04.ll create mode 100644 test/CodeGen/SystemZ/vec-cmp-07.ll create mode 100644 test/CodeGen/SystemZ/vec-ctpop-02.ll create mode 100644 test/CodeGen/SystemZ/vec-div-02.ll rename test/CodeGen/SystemZ/{vec-intrinsics.ll => vec-intrinsics-01.ll} (100%) create mode 100644 test/CodeGen/SystemZ/vec-intrinsics-02.ll create mode 100644 
test/CodeGen/SystemZ/vec-max-05.ll create mode 100644 test/CodeGen/SystemZ/vec-min-05.ll create mode 100644 test/CodeGen/SystemZ/vec-move-18.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-03.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-04.ll create mode 100644 test/CodeGen/SystemZ/vec-mul-05.ll create mode 100644 test/CodeGen/SystemZ/vec-neg-02.ll create mode 100644 test/CodeGen/SystemZ/vec-or-03.ll create mode 100644 test/CodeGen/SystemZ/vec-round-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sqrt-02.ll create mode 100644 test/CodeGen/SystemZ/vec-sub-02.ll create mode 100644 test/CodeGen/SystemZ/vec-xor-02.ll create mode 100644 test/CodeGen/Thumb/litpoolremat.ll create mode 100644 test/CodeGen/X86/alias-static-alloca.ll create mode 100644 test/CodeGen/X86/avx512-rotate.ll create mode 100644 test/CodeGen/X86/bmi-schedule.ll create mode 100644 test/CodeGen/X86/bmi2-schedule.ll create mode 100644 test/CodeGen/X86/bswap-rotate.ll create mode 100644 test/CodeGen/X86/f16c-schedule.ll create mode 100644 test/CodeGen/X86/lea32-schedule.ll create mode 100644 test/CodeGen/X86/lea64-schedule.ll create mode 100644 test/CodeGen/X86/lzcnt-schedule.ll create mode 100644 test/CodeGen/X86/memcmp-minsize.ll create mode 100644 test/CodeGen/X86/memcmp-optsize.ll create mode 100644 test/CodeGen/X86/popcnt-schedule.ll create mode 100644 test/CodeGen/X86/pr32282.ll create mode 100644 test/CodeGen/X86/pr32515.ll create mode 100644 test/CodeGen/X86/pr33772.ll create mode 100644 test/CodeGen/X86/pr33828.ll create mode 100644 test/CodeGen/X86/rotate_vec.ll create mode 100644 test/CodeGen/X86/vector-rotate-512.ll create mode 100644 test/CodeGen/X86/x86-cmov-converter.ll create mode 100644 test/DllTool/coff-exports.def create mode 100644 test/DllTool/coff-weak-exports.def create mode 100644 test/DllTool/lit.local.cfg create mode 100644 test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll create mode 100644 test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll create mode 100644 test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll create mode 100644 test/MC/AArch64/coff-relocations.s create mode 100644 test/MC/AArch64/invalid-instructions-spellcheck.s create mode 100644 test/MC/Disassembler/SystemZ/insns-z14.txt delete mode 100644 test/MC/Mips/mt/invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases-invalid.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-aliases.s delete mode 100644 test/MC/Mips/mt/mftr-mttr-reserved-valid.s create mode 100644 test/MC/SystemZ/insn-bad-z14.s create mode 100644 test/MC/SystemZ/insn-good-z14.s create mode 100644 test/MC/SystemZ/invalid-instructions-spellcheck.s create mode 100644 test/ObjectYAML/CodeView/guid.yaml create mode 100644 test/Other/cgscc-libcall-update.ll create mode 100644 test/Transforms/EarlyCSE/globalsaa-memoryssa.ll create mode 100644 test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll create mode 100644 test/Transforms/GlobalOpt/pr33686.ll create mode 100644 test/Transforms/IRCE/eq_ne.ll create mode 100644 test/Transforms/IRCE/pre_post_loops.ll create mode 100644 test/Transforms/Inline/AArch64/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/ext.ll create mode 100644 test/Transforms/Inline/PowerPC/lit.local.cfg create mode 100644 test/Transforms/Inline/X86/ext.ll delete mode 100644 test/Transforms/InstCombine/and-not-or.ll 
create mode 100644 test/Transforms/InstCombine/element-atomic-memintrins.ll create mode 100644 test/Transforms/InstCombine/pr33765.ll create mode 100644 test/Transforms/LoopInterchange/current-limitations-lcssa.ll create mode 100644 test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll create mode 100644 test/Transforms/LoopInterchange/interchange-not-profitable.ll create mode 100644 test/Transforms/LoopInterchange/interchange-output-dependencies.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-down.ll create mode 100644 test/Transforms/LoopInterchange/interchange-simple-count-up.ll delete mode 100644 test/Transforms/LoopInterchange/interchange.ll create mode 100644 test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-dependencies-1.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-loop-nest-3.ll create mode 100644 test/Transforms/LoopInterchange/not-interchanged-tightly-nested.ll create mode 100644 test/Transforms/LoopUnroll/runtime-loop-multiexit-dom-verify.ll create mode 100644 test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll create mode 100644 test/tools/llvm-dwarfdump/X86/verify_debug_info.s create mode 100644 test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s create mode 100644 test/tools/llvm-mt/help.test create mode 100644 test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 create mode 100644 test/tools/llvm-objdump/AArch64/macho-reloc-addend.test create mode 100644 tools/llvm-mt/CMakeLists.txt create mode 100644 tools/llvm-mt/LLVMBuild.txt create mode 100644 tools/llvm-mt/Opts.td create mode 100644 tools/llvm-mt/llvm-mt.cpp delete mode 100644 unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp create mode 100644 unittests/IR/CFGBuilder.cpp create mode 100644 unittests/IR/CFGBuilder.h diff --git a/RELEASE_TESTERS.TXT b/RELEASE_TESTERS.TXT index 7bfa88c6cf0e..9a01c725fb51 100644 --- a/RELEASE_TESTERS.TXT +++ b/RELEASE_TESTERS.TXT @@ -41,14 +41,9 @@ E: hans@chromium.org T: x86 O: Windows -N: Renato Golin -E: renato.golin@linaro.org -T: ARM -O: Linux - N: Diana Picus E: diana.picus@linaro.org -T: AArch64 +T: ARM, AArch64 O: Linux N: Simon Dardis diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index e201333f3007..0a5cb00a48d3 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -132,7 +132,8 @@ The ``MayAlias`` response is used whenever the two pointers might refer to the same object. The ``PartialAlias`` response is used when the two memory objects are known to -be overlapping in some way, but do not start at the same address. +be overlapping in some way, regardless of whether they start at the same address +or not. The ``MustAlias`` response may only be returned if the two memory objects are guaranteed to always start at exactly the same location. A ``MustAlias`` diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 722718bf4f16..fa41198755fd 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -34,10 +34,10 @@ There are some conventions that are not uniformly followed in the code base (e.g. the naming convention). This is because they are relatively new, and a lot of code was written before they were put in place. Our long term goal is for the entire codebase to follow the convention, but we explicitly *do not* -want patches that do large-scale reformating of existing code.
On the other +want patches that do large-scale reformatting of existing code. On the other hand, it is reasonable to rename the methods of a class if you're about to -change it in some other way. Just do the reformating as a separate commit from -the functionality change. +change it in some other way. Just do the reformatting as a separate commit +from the functionality change. The ultimate goal of these guidelines is to increase the readability and maintainability of our common source base. If you have suggestions for topics to diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst index b4d15ef57b73..fbe1a9ab1843 100644 --- a/docs/CommandGuide/lit.rst +++ b/docs/CommandGuide/lit.rst @@ -80,6 +80,13 @@ OUTPUT OPTIONS Show more information on test failures, for example the entire test output instead of just the test result. +.. option:: -vv, --echo-all-commands + + Echo all commands to stdout, as they are being executed. + This can be valuable for debugging test failures, as the last echoed command + will be the one which has failed. + This option implies ``--verbose``. + .. option:: -a, --show-all Show more information about all tests, for example the entire test diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h index 8cae63c3c869..b566aeaf1fd6 100644 --- a/include/llvm/Analysis/DominanceFrontier.h +++ b/include/llvm/Analysis/DominanceFrontier.h @@ -29,9 +29,9 @@ namespace llvm { /// DominanceFrontierBase - Common base class for computing forward and inverse /// dominance frontiers for a function. /// -template +template class DominanceFrontierBase { -public: + public: typedef std::set DomSetType; // Dom set for a bb typedef std::map DomSetMapType; // Dom set map @@ -40,10 +40,10 @@ class DominanceFrontierBase { DomSetMapType Frontiers; std::vector Roots; - const bool IsPostDominators; + static constexpr bool IsPostDominators = IsPostDom; -public: - DominanceFrontierBase(bool isPostDom) : IsPostDominators(isPostDom) {} + public: + DominanceFrontierBase() {} /// getRoots - Return the root blocks of the current CFG. This may include /// multiple blocks if we are computing post dominators. For forward @@ -96,7 +96,7 @@ class DominanceFrontierBase { /// compare - Return true if the other dominance frontier base matches /// this dominance frontier base. Otherwise return false. - bool compare(DominanceFrontierBase &Other) const; + bool compare(DominanceFrontierBase &Other) const; /// print - Convert to human readable form /// @@ -113,22 +113,21 @@ class DominanceFrontierBase { /// used to compute a forward dominator frontiers. 
/// template -class ForwardDominanceFrontierBase : public DominanceFrontierBase { -private: +class ForwardDominanceFrontierBase + : public DominanceFrontierBase { + private: typedef GraphTraits BlockTraits; public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef typename DominanceFrontierBase::DomSetType DomSetType; + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef typename DominanceFrontierBase::DomSetType DomSetType; - ForwardDominanceFrontierBase() : DominanceFrontierBase(false) {} - - void analyze(DomTreeT &DT) { - this->Roots = DT.getRoots(); - assert(this->Roots.size() == 1 && - "Only one entry block for forward domfronts!"); - calculate(DT, DT[this->Roots[0]]); + void analyze(DomTreeT &DT) { + this->Roots = DT.getRoots(); + assert(this->Roots.size() == 1 && + "Only one entry block for forward domfronts!"); + calculate(DT, DT[this->Roots[0]]); } const DomSetType &calculate(const DomTreeT &DT, const DomTreeNodeT *Node); @@ -136,15 +135,16 @@ class ForwardDominanceFrontierBase : public DominanceFrontierBase { class DominanceFrontier : public ForwardDominanceFrontierBase { public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef DominanceFrontierBase::DomSetType DomSetType; - typedef DominanceFrontierBase::iterator iterator; - typedef DominanceFrontierBase::const_iterator const_iterator; + typedef DomTreeBase DomTreeT; + typedef DomTreeNodeBase DomTreeNodeT; + typedef DominanceFrontierBase::DomSetType DomSetType; + typedef DominanceFrontierBase::iterator iterator; + typedef DominanceFrontierBase::const_iterator + const_iterator; - /// Handle invalidation explicitly. - bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &); + /// Handle invalidation explicitly. + bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); }; class DominanceFrontierWrapperPass : public FunctionPass { @@ -168,7 +168,8 @@ class DominanceFrontierWrapperPass : public FunctionPass { void dump() const; }; -extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; +extern template class DominanceFrontierBase; extern template class ForwardDominanceFrontierBase; /// \brief Analysis pass which computes a \c DominanceFrontier. 
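The DominanceFrontier.h hunk above replaces the runtime isPostDom constructor flag with a bool template parameter exposed as a static constexpr member. Since the template parameter lists were garbled in transit here, the following is a minimal standalone sketch of the same pattern with hypothetical names (not LLVM's own):

    #include <cassert>

    // Hypothetical mirror of the refactoring: the post-dominance flag becomes
    // a compile-time template parameter, so forward and post-dominator
    // frontiers are distinct types and the flag needs no per-object storage.
    template <class BlockT, bool IsPostDom>
    class FrontierBase {
    public:
      static constexpr bool IsPostDominators = IsPostDom;
      FrontierBase() = default; // no runtime flag to thread through
    };

    struct Block {};
    using ForwardFrontier = FrontierBase<Block, false>;
    using PostDomFrontier = FrontierBase<Block, true>;

    static_assert(!ForwardFrontier::IsPostDominators, "forward variant");
    static_assert(PostDomFrontier::IsPostDominators, "post-dominator variant");

    int main() { return 0; }

This is also why the single extern template declaration at the end of the hunk becomes a pair: both the forward and the post-dominator instantiations now have to be declared explicitly.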
diff --git a/include/llvm/Analysis/DominanceFrontierImpl.h b/include/llvm/Analysis/DominanceFrontierImpl.h index 9f8cacc24f2c..5093b975e709 100644 --- a/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/include/llvm/Analysis/DominanceFrontierImpl.h @@ -39,33 +39,33 @@ class DFCalculateWorkObject { const DomTreeNodeT *parentNode; }; -template -void DominanceFrontierBase::removeBlock(BlockT *BB) { +template +void DominanceFrontierBase::removeBlock(BlockT *BB) { assert(find(BB) != end() && "Block is not in DominanceFrontier!"); for (iterator I = begin(), E = end(); I != E; ++I) I->second.erase(BB); Frontiers.erase(BB); } -template -void DominanceFrontierBase::addToFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::addToFrontier(iterator I, + BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -void DominanceFrontierBase::removeFromFrontier(iterator I, - BlockT *Node) { +template +void DominanceFrontierBase::removeFromFrontier( + iterator I, BlockT *Node) { assert(I != end() && "BB is not in DominanceFrontier!"); assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB"); I->second.erase(Node); } -template -bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, - const DomSetType &DS2) const { +template +bool DominanceFrontierBase::compareDomSet( + DomSetType &DS1, const DomSetType &DS2) const { std::set tmpSet; for (BlockT *BB : DS2) tmpSet.insert(BB); @@ -88,9 +88,9 @@ bool DominanceFrontierBase::compareDomSet(DomSetType &DS1, return false; } -template -bool DominanceFrontierBase::compare( - DominanceFrontierBase &Other) const { +template +bool DominanceFrontierBase::compare( + DominanceFrontierBase &Other) const { DomSetMapType tmpFrontiers; for (typename DomSetMapType::const_iterator I = Other.begin(), E = Other.end(); @@ -118,8 +118,8 @@ bool DominanceFrontierBase::compare( return false; } -template -void DominanceFrontierBase::print(raw_ostream &OS) const { +template +void DominanceFrontierBase::print(raw_ostream &OS) const { for (const_iterator I = begin(), E = end(); I != E; ++I) { OS << " DomFrontier for BB "; if (I->first) @@ -142,8 +142,8 @@ void DominanceFrontierBase::print(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -template -void DominanceFrontierBase::dump() const { +template +void DominanceFrontierBase::dump() const { print(dbgs()); } #endif diff --git a/include/llvm/Analysis/IteratedDominanceFrontier.h b/include/llvm/Analysis/IteratedDominanceFrontier.h index bd74d6bd14c3..edaf4e9025bc 100644 --- a/include/llvm/Analysis/IteratedDominanceFrontier.h +++ b/include/llvm/Analysis/IteratedDominanceFrontier.h @@ -42,11 +42,11 @@ namespace llvm { /// By default, liveness is not used to prune the IDF computation. /// The template parameters should be either BasicBlock* or Inverse, depending on if you want the forward or reverse IDF. -template +template class IDFCalculator { - -public: - IDFCalculator(DominatorTreeBase &DT) : DT(DT), useLiveIn(false) {} + public: + IDFCalculator(DominatorTreeBase &DT) + : DT(DT), useLiveIn(false) {} /// \brief Give the IDF calculator the set of blocks in which the value is /// defined. 
This is equivalent to the set of starting blocks it should be @@ -84,12 +84,12 @@ class IDFCalculator { void calculate(SmallVectorImpl &IDFBlocks); private: - DominatorTreeBase &DT; - bool useLiveIn; - const SmallPtrSetImpl *LiveInBlocks; - const SmallPtrSetImpl *DefBlocks; + DominatorTreeBase &DT; + bool useLiveIn; + const SmallPtrSetImpl *LiveInBlocks; + const SmallPtrSetImpl *DefBlocks; }; -typedef IDFCalculator ForwardIDFCalculator; -typedef IDFCalculator> ReverseIDFCalculator; +typedef IDFCalculator ForwardIDFCalculator; +typedef IDFCalculator, true> ReverseIDFCalculator; } #endif diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 3a052761ad7d..a025f2275fb4 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -43,6 +43,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -908,7 +909,7 @@ class LazyCallGraph { /// This sets up the graph and computes all of the entry points of the graph. /// No function definitions are scanned until their nodes in the graph are /// requested during traversal. - LazyCallGraph(Module &M); + LazyCallGraph(Module &M, TargetLibraryInfo &TLI); LazyCallGraph(LazyCallGraph &&G); LazyCallGraph &operator=(LazyCallGraph &&RHS); @@ -966,6 +967,22 @@ class LazyCallGraph { return insertInto(F, N); } + /// Get the sequence of known and defined library functions. + /// + /// These functions, because they are known to LLVM, can have calls + /// introduced out of thin air from arbitrary IR. + ArrayRef getLibFunctions() const { + return LibFunctions.getArrayRef(); + } + + /// Test whether a function is a known and defined library function tracked by + /// the call graph. + /// + /// Because these functions are known to LLVM they are specially modeled in + /// the call graph and even when all IR-level references have been removed + /// remain active and reachable. + bool isLibFunction(Function &F) const { return LibFunctions.count(&F); } + ///@{ /// \name Pre-SCC Mutation API /// @@ -1100,6 +1117,11 @@ class LazyCallGraph { /// These are all of the RefSCCs which have no children. SmallVector LeafRefSCCs; + /// Defined functions that are also known library functions which the + /// optimizer can reason about and therefore might introduce calls to out of + /// thin air. + SmallSetVector LibFunctions; + /// Helper to insert a new function, with an already looked-up entry in /// the NodeMap. Node &insertInto(Function &F, Node *&MappedN); @@ -1216,8 +1238,8 @@ class LazyCallGraphAnalysis : public AnalysisInfoMixin { /// /// This just builds the set of entry points to the call graph. The rest is /// built lazily as it is walked. 
- LazyCallGraph run(Module &M, ModuleAnalysisManager &) { - return LazyCallGraph(M); + LazyCallGraph run(Module &M, ModuleAnalysisManager &AM) { + return LazyCallGraph(M, AM.getResult(M)); } }; diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 096df1e421a7..70ce9a870517 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -56,7 +56,8 @@ class Loop; class MDNode; class PHINode; class raw_ostream; -template class DominatorTreeBase; +template +class DominatorTreeBase; template class LoopInfoBase; template class LoopBase; @@ -663,12 +664,12 @@ class LoopInfoBase { } /// Create the loop forest using a stable algorithm. - void analyze(const DominatorTreeBase &DomTree); + void analyze(const DominatorTreeBase &DomTree); // Debugging void print(raw_ostream &OS) const; - void verify(const DominatorTreeBase &DomTree) const; + void verify(const DominatorTreeBase &DomTree) const; }; // Implementation in LoopInfoImpl.h @@ -683,7 +684,7 @@ class LoopInfo : public LoopInfoBase { LoopInfo(const LoopInfo &) = delete; public: LoopInfo() {} - explicit LoopInfo(const DominatorTreeBase &DomTree); + explicit LoopInfo(const DominatorTreeBase &DomTree); LoopInfo(LoopInfo &&Arg) : BaseT(std::move(static_cast(Arg))) {} LoopInfo &operator=(LoopInfo &&RHS) { diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 372fc8b21745..e9177e68ed77 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -340,10 +340,10 @@ void LoopBase::print(raw_ostream &OS, unsigned Depth, /// Discover a subloop with the specified backedges such that: All blocks within /// this loop are mapped to this loop or a subloop. And all subloops within this /// loop have their parent loop set to this loop or a subloop. -template -static void discoverAndMapSubloop(LoopT *L, ArrayRef Backedges, - LoopInfoBase *LI, - const DominatorTreeBase &DomTree) { +template +static void discoverAndMapSubloop( + LoopT *L, ArrayRef Backedges, LoopInfoBase *LI, + const DomTreeBase &DomTree) { typedef GraphTraits > InvBlockTraits; unsigned NumBlocks = 0; @@ -462,10 +462,9 @@ void PopulateLoopsDFS::insertIntoLoop(BlockT *Block) { /// /// The Block vectors are inclusive, so step 3 requires loop-depth number of /// insertions per block. -template -void LoopInfoBase:: -analyze(const DominatorTreeBase &DomTree) { - +template +void LoopInfoBase::analyze( + const DomTreeBase &DomTree) { // Postorder traversal of the dominator tree. const DomTreeNodeBase *DomRoot = DomTree.getRootNode(); for (auto DomNode : post_order(DomRoot)) { @@ -607,7 +606,7 @@ static void compareLoops(const LoopT *L, const LoopT *OtherL, template void LoopInfoBase::verify( - const DominatorTreeBase &DomTree) const { + const DomTreeBase &DomTree) const { DenseSet Loops; for (iterator I = begin(), E = end(); I != E; ++I) { assert(!(*I)->getParentLoop() && "Top-level loop has a parent!"); diff --git a/include/llvm/Analysis/PostDominators.h b/include/llvm/Analysis/PostDominators.h index 94ee3b03bb86..17f2e8eaf4a2 100644 --- a/include/llvm/Analysis/PostDominators.h +++ b/include/llvm/Analysis/PostDominators.h @@ -22,10 +22,8 @@ namespace llvm { /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used to /// compute the post-dominator tree. 
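Two related LazyCallGraph changes meet in the hunks above: the constructor now takes a TargetLibraryInfo so the graph can record defined library functions, and LazyCallGraphAnalysis::run obtains that TLI through the analysis manager. The angle-bracketed argument of getResult was lost in transit; it is plausibly TargetLibraryAnalysis, which produces the TLI the constructor requires. A hedged sketch of both pieces (assumes an LLVM tree at this revision; the helper name is invented, and getLibFunctions is assumed to yield Function pointers):

    #include "llvm/Analysis/LazyCallGraph.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Likely body of LazyCallGraphAnalysis::run after this change:
    //   return LazyCallGraph(M, AM.getResult<TargetLibraryAnalysis>(M));

    // Hypothetical client walking the specially modeled library functions.
    static unsigned countTrackedLibFunctions(Module &M, TargetLibraryInfo &TLI) {
      LazyCallGraph CG(M, TLI);
      unsigned N = 0;
      for (Function *F : CG.getLibFunctions())
        if (CG.isLibFunction(*F)) // membership query over the same set
          ++N;
      return N;
    }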
/// -struct PostDominatorTree : public DominatorTreeBase { - typedef DominatorTreeBase Base; - - PostDominatorTree() : DominatorTreeBase(true) {} +struct PostDominatorTree : public PostDomTreeBase { + typedef PostDomTreeBase Base; /// Handle invalidation explicitly. bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index c7accfae78b0..d1b182755cf8 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -237,17 +237,15 @@ struct FoldingSetTrait : DefaultFoldingSetTrait { }; /// This class represents an assumption that two SCEV expressions are equal, -/// and this can be checked at run-time. We assume that the left hand side is -/// a SCEVUnknown and the right hand side a constant. +/// and this can be checked at run-time. class SCEVEqualPredicate final : public SCEVPredicate { - /// We assume that LHS == RHS, where LHS is a SCEVUnknown and RHS a - /// constant. - const SCEVUnknown *LHS; - const SCEVConstant *RHS; + /// We assume that LHS == RHS. + const SCEV *LHS; + const SCEV *RHS; public: - SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEVUnknown *LHS, - const SCEVConstant *RHS); + SCEVEqualPredicate(const FoldingSetNodeIDRef ID, const SCEV *LHS, + const SCEV *RHS); /// Implementation of the SCEVPredicate interface bool implies(const SCEVPredicate *N) const override; @@ -256,10 +254,10 @@ class SCEVEqualPredicate final : public SCEVPredicate { const SCEV *getExpr() const override; /// Returns the left hand side of the equality. - const SCEVUnknown *getLHS() const { return LHS; } + const SCEV *getLHS() const { return LHS; } /// Returns the right hand side of the equality. - const SCEVConstant *getRHS() const { return RHS; } + const SCEV *getRHS() const { return RHS; } /// Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const SCEVPredicate *P) { @@ -1241,6 +1239,14 @@ class ScalarEvolution { SmallVector NewOp(Operands.begin(), Operands.end()); return getAddRecExpr(NewOp, L, Flags); } + + /// Checks if \p SymbolicPHI can be rewritten as an AddRecExpr under some + /// Predicates. If successful return these ; + /// The function is intended to be called from PSCEV (the caller will decide + /// whether to actually add the predicates and carry out the rewrites). + Optional>> + createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI); + /// Returns an expression for a GEP /// /// \p GEP The GEP. The indices contained in the GEP itself are ignored, @@ -1675,8 +1681,7 @@ class ScalarEvolution { return F.getParent()->getDataLayout(); } - const SCEVPredicate *getEqualPredicate(const SCEVUnknown *LHS, - const SCEVConstant *RHS); + const SCEVPredicate *getEqualPredicate(const SCEV *LHS, const SCEV *RHS); const SCEVPredicate * getWrapPredicate(const SCEVAddRecExpr *AR, @@ -1692,6 +1697,19 @@ class ScalarEvolution { SmallPtrSetImpl &Preds); private: + /// Similar to createAddRecFromPHI, but with the additional flexibility of + /// suggesting runtime overflow checks in case casts are encountered. + /// If successful, the analysis records that for this loop, \p SymbolicPHI, + /// which is the UnknownSCEV currently representing the PHI, can be rewritten + /// into an AddRec, assuming some predicates; The function then returns the + /// AddRec and the predicates as a pair, and caches this pair in + /// PredicatedSCEVRewrites. 
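The SCEVEqualPredicate hunk above drops the old restriction that the left-hand side be a SCEVUnknown and the right-hand side a SCEVConstant; both sides are now arbitrary SCEV expressions. A hedged sketch of client code under the loosened contract (hypothetical helper, assuming this revision's ScalarEvolution API and that A and B have SCEVable types):

    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Before this change, getEqualPredicate required a
    // (SCEVUnknown, SCEVConstant) pair; now any two SCEVs are accepted.
    static const SCEVPredicate *makeEqual(ScalarEvolution &SE,
                                          Value *A, Value *B) {
      return SE.getEqualPredicate(SE.getSCEV(A), SE.getSCEV(B));
    }

That generalization is what createAddRecFromPHIWithCasts (documented just above, and continued below) builds on: per its comment, it returns the rewritten AddRec together with the vector of predicates as a pair, and caches the pair in PredicatedSCEVRewrites.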
+ /// If the analysis is not successful, a mapping from the \p SymbolicPHI to + /// itself (with no predicates) is recorded, and a nullptr with an empty + /// predicates vector is returned as a pair. + Optional>> + createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI); + /// Compute the backedge taken count knowing the interval difference, the /// stride and presence of the equality in the comparison. const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride, @@ -1722,6 +1740,12 @@ class ScalarEvolution { FoldingSet UniquePreds; BumpPtrAllocator SCEVAllocator; + /// Cache tentative mappings from UnknownSCEVs in a Loop, to a SCEV expression + /// they can be rewritten into under certain predicates. + DenseMap, + std::pair>> + PredicatedSCEVRewrites; + /// The head of a linked list of all SCEVUnknown values that have been /// allocated. This is used by releaseMemory to locate them all and call /// their destructors. diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index dfb525e3de7a..24edd3826a2e 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -155,6 +155,13 @@ class TargetTransformInfo { int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) const; + /// \brief Estimate the cost of a EXT operation when lowered. + /// + /// The contract for this function is the same as \c getOperationCost except + /// that it supports an interface that provides extra information specific to + /// the EXT operation. + int getExtCost(const Instruction *I, const Value *Src) const; + /// \brief Estimate the cost of a function call when lowered. /// /// The contract for this is the same as \c getOperationCost except that it @@ -849,6 +856,7 @@ class TargetTransformInfo::Concept { virtual int getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) = 0; virtual int getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands) = 0; + virtual int getExtCost(const Instruction *I, const Value *Src) = 0; virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0; virtual int getCallCost(const Function *F, int NumArgs) = 0; virtual int getCallCost(const Function *F, @@ -1022,6 +1030,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ArrayRef Operands) override { return Impl.getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) override { + return Impl.getExtCost(I, Src); + } int getCallCost(FunctionType *FTy, int NumArgs) override { return Impl.getCallCost(FTy, NumArgs); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 8740ee92eed5..0b07fe9aa232 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -120,6 +120,10 @@ class TargetTransformInfoImplBase { return SI.getNumCases(); } + int getExtCost(const Instruction *I, const Value *Src) { + return TTI::TCC_Basic; + } + unsigned getCallCost(FunctionType *FTy, int NumArgs) { assert(FTy && "FunctionType must be provided to this routine."); @@ -728,6 +732,8 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { // nop on most sane targets. 
if (isa(CI->getOperand(0))) return TTI::TCC_Free; + if (isa(CI) || isa(CI) || isa(CI)) + return static_cast(this)->getExtCost(CI, Operands.back()); } return static_cast(this)->getOperationCost( diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index b59fd60e8aed..633107024792 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -155,6 +155,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return BaseT::getGEPCost(PointeeType, Ptr, Operands); } + int getExtCost(const Instruction *I, const Value *Src) { + if (getTLI()->isExtFree(I)) + return TargetTransformInfo::TCC_Free; + + if (isa(I) || isa(I)) + if (const LoadInst *LI = dyn_cast(Src)) + if (getTLI()->isExtLoad(LI, I, DL)) + return TargetTransformInfo::TCC_Free; + + return TargetTransformInfo::TCC_Basic; + } + unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Arguments) { return BaseT::getIntrinsicCost(IID, RetTy, Arguments); diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h index 370ffbe4862e..6efeefd9a721 100644 --- a/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -23,27 +23,24 @@ class MachineDominanceFrontier : public MachineFunctionPass { ForwardDominanceFrontierBase Base; public: - using DomTreeT = DominatorTreeBase; - using DomTreeNodeT = DomTreeNodeBase; - using DomSetType = DominanceFrontierBase::DomSetType; - using iterator = DominanceFrontierBase::iterator; - using const_iterator = - DominanceFrontierBase::const_iterator; + using DomTreeT = DomTreeBase; + using DomTreeNodeT = DomTreeNodeBase; + using DomSetType = DominanceFrontierBase::DomSetType; + using iterator = DominanceFrontierBase::iterator; + using const_iterator = + DominanceFrontierBase::const_iterator; - MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; - MachineDominanceFrontier & - operator=(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier &operator=(const MachineDominanceFrontier &) = delete; - static char ID; + static char ID; - MachineDominanceFrontier(); + MachineDominanceFrontier(); - DominanceFrontierBase &getBase() { - return Base; - } + DominanceFrontierBase &getBase() { return Base; } - inline const std::vector &getRoots() const { - return Base.getRoots(); + inline const std::vector &getRoots() const { + return Base.getRoots(); } MachineBasicBlock *getRoot() const { @@ -98,7 +95,7 @@ class MachineDominanceFrontier : public MachineFunctionPass { return Base.compareDomSet(DS1, DS2); } - bool compare(DominanceFrontierBase &Other) const { + bool compare(DominanceFrontierBase &Other) const { return Base.compare(Other); } diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 74a7c3ea04ae..8bf98f606495 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -28,13 +28,15 @@ namespace llvm { -template<> -inline void DominatorTreeBase::addRoot(MachineBasicBlock* MBB) { +template <> +inline void DominatorTreeBase::addRoot( + MachineBasicBlock *MBB) { this->Roots.push_back(MBB); } extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree using MachineDomTreeNode = DomTreeNodeBase; @@ -65,7 +67,7 @@ 
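The getExtCost hook threads through three layers above: a TargetTransformInfo entry point, a default in TargetTransformInfoImplBase returning TCC_Basic, and the real logic in BasicTTIImplBase, where an extension is free if the target folds it outright or if a zext/sext of a load folds into an extending load. A standalone, hypothetical mirror of that decision (names invented; the real code queries TargetLowering via isExtFree and isExtLoad):

    // TCC values mirror TargetTransformInfo's cost constants.
    enum Cost { TCC_Free = 0, TCC_Basic = 1 };

    struct TargetModel {
      bool extIsFree;    // target folds the extension outright (isExtFree)
      bool foldsExtLoad; // ext(load) becomes one extending load (isExtLoad)
    };

    static int extCost(const TargetModel &TM, bool sourceIsLoad) {
      if (TM.extIsFree)
        return TCC_Free;
      if (sourceIsLoad && TM.foldsExtLoad)
        return TCC_Free; // no separate extend instruction is emitted
      return TCC_Basic;  // otherwise one basic-op cost, as in the default
    }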
class MachineDominatorTree : public MachineFunctionPass { mutable SmallSet NewBBs; /// The DominatorTreeBase that is used to compute a normal dominator tree - std::unique_ptr> DT; + std::unique_ptr> DT; /// \brief Apply all the recorded critical edges to the DT. /// This updates the underlying DT information in a way that uses @@ -79,9 +81,8 @@ class MachineDominatorTree : public MachineFunctionPass { MachineDominatorTree(); - DominatorTreeBase &getBase() { - if (!DT) - DT.reset(new DominatorTreeBase(false)); + DomTreeBase &getBase() { + if (!DT) DT.reset(new DomTreeBase()); applySplitCriticalEdges(); return *DT; } diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h index 70bdb191ad34..d29d2d85cb0a 100644 --- a/include/llvm/CodeGen/MachinePostDominators.h +++ b/include/llvm/CodeGen/MachinePostDominators.h @@ -26,7 +26,7 @@ namespace llvm { /// struct MachinePostDominatorTree : public MachineFunctionPass { private: - DominatorTreeBase *DT; + PostDomTreeBase *DT; public: static char ID; diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 70ccc867cd38..df55e181364c 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -17,7 +17,6 @@ namespace llvm { namespace codeview { class TypeCollection; -class TypeServerHandler; class TypeVisitorCallbacks; enum VisitorDataSource { @@ -31,11 +30,9 @@ enum VisitorDataSource { Error visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); Error visitMemberRecord(CVMemberRecord Record, TypeVisitorCallbacks &Callbacks, VisitorDataSource Source = VDS_BytesPresent); @@ -46,12 +43,9 @@ Error visitMemberRecordStream(ArrayRef FieldList, TypeVisitorCallbacks &Callbacks); Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source = VDS_BytesPresent, - TypeServerHandler *TS = nullptr); -Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); -Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS = nullptr); + VisitorDataSource Source = VDS_BytesPresent); +Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks); +Error visitTypeStream(TypeCollection &Types, TypeVisitorCallbacks &Callbacks); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h index db944c7057f7..94f104ff772c 100644 --- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h +++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h @@ -84,7 +84,7 @@ class CodeViewRecordIO { Error mapEncodedInteger(uint64_t &Value); Error mapEncodedInteger(APSInt &Value); Error mapStringZ(StringRef &Value); - Error mapGuid(StringRef &Guid); + Error mapGuid(GUID &Guid); Error mapStringZVectorZ(std::vector &Value); diff --git a/include/llvm/DebugInfo/CodeView/Formatters.h b/include/llvm/DebugInfo/CodeView/Formatters.h index 0842c1e373db..278ad02a39cd 100644 --- a/include/llvm/DebugInfo/CodeView/Formatters.h +++ 
b/include/llvm/DebugInfo/CodeView/Formatters.h @@ -12,6 +12,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/FormatAdapters.h" #include "llvm/Support/FormatVariadic.h" @@ -31,7 +32,7 @@ class GuidAdapter final : public FormatAdapter> { explicit GuidAdapter(ArrayRef Guid); explicit GuidAdapter(StringRef Guid); - void format(raw_ostream &Stream, StringRef Style) override ; + void format(raw_ostream &Stream, StringRef Style) override; }; } // end namespace detail @@ -60,6 +61,13 @@ template <> struct format_provider { } }; +template <> struct format_provider { + static void format(const codeview::GUID &V, llvm::raw_ostream &Stream, + StringRef Style) { + Stream << V; + } +}; + } // end namespace llvm #endif // LLVM_DEBUGINFO_CODEVIEW_FORMATTERS_H diff --git a/include/llvm/DebugInfo/CodeView/GUID.h b/include/llvm/DebugInfo/CodeView/GUID.h new file mode 100644 index 000000000000..a055ce9e2e45 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/GUID.h @@ -0,0 +1,55 @@ +//===- GUID.h ---------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_GUID_H +#define LLVM_DEBUGINFO_CODEVIEW_GUID_H + +#include +#include + +namespace llvm { +class raw_ostream; + +namespace codeview { + +/// This represents the 'GUID' type from windows.h. +struct GUID { + uint8_t Guid[16]; +}; + +inline bool operator==(const GUID &LHS, const GUID &RHS) { + return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); +} + +inline bool operator<(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) < 0; +} + +inline bool operator<=(const GUID &LHS, const GUID &RHS) { + return ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)) <= 0; +} + +inline bool operator>(const GUID &LHS, const GUID &RHS) { + return !(LHS <= RHS); +} + +inline bool operator>=(const GUID &LHS, const GUID &RHS) { + return !(LHS < RHS); +} + +inline bool operator!=(const GUID &LHS, const GUID &RHS) { + return !(LHS == RHS); +} + +raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid); + +} // namespace codeview +} // namespace llvm + +#endif diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h index cdfc1745cea5..f3086cf3dbb9 100644 --- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h +++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h @@ -848,7 +848,7 @@ class BuildInfoSym : public SymbolRecord { : SymbolRecord(SymbolRecordKind::BuildInfoSym), RecordOffset(RecordOffset) {} - uint32_t BuildId; + TypeIndex BuildId; uint32_t RecordOffset; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 2efeb1b3cefd..7942c0c0bc21 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -18,6 +18,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/DebugInfo/CodeView/CVRecord.h" #include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/Endian.h" @@ -539,15 +540,17 @@ class TypeServer2Record : public TypeRecord 
{ public: TypeServer2Record() = default; explicit TypeServer2Record(TypeRecordKind Kind) : TypeRecord(Kind) {} - TypeServer2Record(StringRef Guid, uint32_t Age, StringRef Name) - : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age), - Name(Name) {} + TypeServer2Record(StringRef GuidStr, uint32_t Age, StringRef Name) + : TypeRecord(TypeRecordKind::TypeServer2), Age(Age), Name(Name) { + assert(GuidStr.size() == 16 && "guid isn't 16 bytes"); + ::memcpy(Guid.Guid, GuidStr.data(), 16); + } - StringRef getGuid() const { return Guid; } + const GUID &getGuid() const { return Guid; } uint32_t getAge() const { return Age; } StringRef getName() const { return Name; } - StringRef Guid; + GUID Guid; uint32_t Age; StringRef Name; }; diff --git a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h b/include/llvm/DebugInfo/CodeView/TypeServerHandler.h deleted file mode 100644 index e96baad9ceae..000000000000 --- a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h +++ /dev/null @@ -1,38 +0,0 @@ -//===- TypeServerHandler.h --------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H -#define LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H - -#include "llvm/Support/Error.h" - -namespace llvm { -namespace codeview { - -class TypeServer2Record; -class TypeVisitorCallbacks; - -class TypeServerHandler { -public: - virtual ~TypeServerHandler() = default; - - /// Handle a TypeServer record. If the implementation returns true - /// the record will not be processed by the top-level visitor. If - /// it returns false, it will be processed. If it returns an Error, - /// then the top-level visitor will fail. - virtual Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - return false; - } -}; - -} // end namespace codeview -} // end namespace llvm - -#endif // LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 3ad2b4e9c92f..d78fab47db66 100644 --- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -19,7 +19,6 @@ namespace llvm { namespace codeview { class TypeIndex; -class TypeServerHandler; class TypeTableBuilder; /// \brief Merge one set of type records into another. This method assumes @@ -31,16 +30,13 @@ class TypeTableBuilder; /// type stream, that contains the index of the corresponding type record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param Types The collection of types to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an /// appropriate error code. Error mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, const CVTypeArray &Types); + const CVTypeArray &Types); /// \brief Merge one set of id records into another. This method assumes /// that all records are id records, and there are no Type records present. @@ -65,7 +61,7 @@ Error mergeTypeRecords(TypeTableBuilder &Dest, /// appropriate error code. 
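With TypeServerHandler gone, the merge entry points take only the destination builder(s), the source-to-destination index map, and the source array. A minimal sketch of a call under the new signatures (assumes this revision's CodeView headers; all surrounding setup is elided):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/DebugInfo/CodeView/TypeIndex.h"
    #include "llvm/DebugInfo/CodeView/TypeRecord.h"
    #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
    using namespace llvm;
    using namespace llvm::codeview;

    static Error mergeAll(TypeTableBuilder &Dest, const CVTypeArray &Types) {
      SmallVector<TypeIndex, 128> SourceToDest; // source index -> merged index
      return mergeTypeRecords(Dest, SourceToDest, Types); // no handler argument
    }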
Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, SmallVectorImpl &SourceToDest, - const CVTypeArray &Ids); /// \brief Merge a unified set of type and id records, splitting them into /// separate output streams. @@ -78,9 +74,6 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, /// id stream, that contains the index of the corresponding id record /// in the destination stream. /// -/// \param Handler (optional) If non-null, an interface that gets invoked -/// to handle type server records. -/// /// \param IdsAndTypes The collection of id records to merge in. /// /// \returns Error::success() if the operation succeeded, otherwise an @@ -88,8 +81,7 @@ Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, Error mergeTypeAndIdRecords(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index ea36ab7ab5b6..056c1b77c65d 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -238,6 +238,34 @@ class DWARFUnit { uint8_t getUnitType() const { return UnitType; } + static bool isValidUnitType(uint8_t UnitType) { + return UnitType == dwarf::DW_UT_compile || UnitType == dwarf::DW_UT_type || + UnitType == dwarf::DW_UT_partial || + UnitType == dwarf::DW_UT_skeleton || + UnitType == dwarf::DW_UT_split_compile || + UnitType == dwarf::DW_UT_split_type; + } + + /// \brief Return the number of bytes for the header of a unit of + /// UnitType type. + /// + /// This function must be called with a valid unit type which in + /// DWARF5 is defined as one of the following six types. + static uint32_t getDWARF5HeaderSize(uint8_t UnitType) { + switch (UnitType) { + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + return 12; + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + return 20; + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: + return 24; + } + llvm_unreachable("Invalid UnitType."); + } + uint64_t getBaseAddress() const { return BaseAddr; } void setBaseAddress(uint64_t base_addr) { diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 9eb5c45faba8..c0291a83ed97 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -21,6 +21,7 @@ class DWARFContext; class DWARFDie; class DWARFUnit; class DWARFAcceleratorTable; +class DWARFDataExtractor; /// A class that verifies DWARF debug information given a DWARF Context. class DWARFVerifier { @@ -30,10 +31,35 @@ class DWARFVerifier { /// can verify each reference points to a valid DIE and not an offset that /// lies between two valid DIEs. std::map> ReferenceToDIEOffsets; - uint32_t NumDebugInfoErrors = 0; uint32_t NumDebugLineErrors = 0; uint32_t NumAppleNamesErrors = 0; + /// Verifies the header of a unit in the .debug_info section. + /// + /// This function currently checks for: + /// - Unit is in 32-bit DWARF format. The function can be modified to + /// support 64-bit format.
+ /// - The DWARF version is valid + /// - The unit type is valid (if unit is in version >=5) + /// - The unit doesn't extend beyond .debug_info section + /// - The address size is valid + /// - The offset in the .debug_abbrev section is valid + /// + /// \param DebugInfoData The .debug_info section data + /// \param Offset A reference to the offset start of the unit. The offset will + /// be updated to point to the next unit in .debug_info + /// \param UnitIndex The index of the unit to be verified + /// \param UnitType A reference to the type of the unit + /// \param isUnitDWARF64 A reference to a flag that shows whether the unit is + /// in 64-bit format. + /// + /// \returns true if the header is verified successfully, false otherwise. + bool verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, uint8_t &UnitType, + bool &isUnitDWARF64); + + + bool verifyUnitContents(DWARFUnit Unit); /// Verifies the attribute's DWARF attribute and its value. /// /// This function currently checks for: @@ -42,7 +68,11 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occurred during verification of + /// attributes' values in a .debug_info section unit + unsigned verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue); /// Verifies the attribute's DWARF form. /// @@ -53,7 +83,10 @@ class DWARFVerifier { /// /// \param Die The DWARF DIE that owns the attribute value /// \param AttrValue The DWARF attribute value to check - void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); + /// + /// \returns NumErrors The number of errors occurred during verification of + /// attributes' forms in a .debug_info section unit + unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); /// Verifies all the valid references that were found when iterating through /// all of the DIE attributes. @@ -62,7 +95,10 @@ class DWARFVerifier { /// offset matches. This helps to ensure that, if a DWARF link phase moved things /// around, it doesn't create invalid references by failing to relocate /// CU relative and absolute references. - void verifyDebugInfoReferences(); + /// + /// \returns NumErrors The number of errors occurred during verification of + /// references for the .debug_info section + unsigned verifyDebugInfoReferences(); /// Verify the DW_AT_stmt_list encoding and value and ensure that no two /// compile units have the same DW_AT_stmt_list value.
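Two threads run through the DWARFVerifier changes above: per-stage checks now return their error counts instead of bumping the deleted NumDebugInfoErrors member, and unit headers are validated against the fixed DWARF5 sizes that getDWARF5HeaderSize encodes. The 12/20/24 constants follow directly from the 32-bit DWARF5 layout; a hypothetical standalone rendering of that arithmetic, for reference only:

    #include <cstdint>

    // 32-bit DWARF5 unit header: unit_length(4) + version(2) + unit_type(1)
    // + address_size(1) + debug_abbrev_offset(4) = 12 bytes for compile and
    // partial units. Skeleton and split-compile units append an 8-byte
    // dwo_id (20); type and split-type units append an 8-byte type signature
    // plus a 4-byte type offset (24). The two flags are mutually exclusive.
    static uint32_t dwarf5HeaderSize(bool carriesDwoId, bool isTypeUnit) {
      uint32_t Size = 4 + 2 + 1 + 1 + 4; // common prefix, 12 bytes
      if (carriesDwoId)
        Size += 8;     // dwo_id
      if (isTypeUnit)
        Size += 8 + 4; // type_signature + type_offset
      return Size;
    }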
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h index 3710eb29e7f9..d37b48540ffa 100644 --- a/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h +++ b/include/llvm/DebugInfo/PDB/DIA/DIARawSymbol.h @@ -106,7 +106,7 @@ class DIARawSymbol : public IPDBRawSymbol { getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h index 466cb455651b..03205a986f1a 100644 --- a/include/llvm/DebugInfo/PDB/GenericError.h +++ b/include/llvm/DebugInfo/PDB/GenericError.h @@ -19,6 +19,7 @@ namespace pdb { enum class generic_error_code { invalid_path = 1, dia_sdk_not_present, + type_server_not_found, unspecified, }; diff --git a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h index fab086c62c72..eefc36518728 100644 --- a/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/IPDBRawSymbol.h @@ -118,7 +118,7 @@ class IPDBRawSymbol { virtual uint32_t getVirtualTableShapeId() const = 0; virtual PDB_DataKind getDataKind() const = 0; virtual PDB_SymType getSymTag() const = 0; - virtual PDB_UniqueId getGuid() const = 0; + virtual codeview::GUID getGuid() const = 0; virtual int32_t getOffset() const = 0; virtual int32_t getThisAdjust() const = 0; virtual int32_t getVirtualBasePointerOffset() const = 0; diff --git a/include/llvm/DebugInfo/PDB/Native/Formatters.h b/include/llvm/DebugInfo/PDB/Native/Formatters.h index 183f0ad8307e..7d5eab2e2a09 100644 --- a/include/llvm/DebugInfo/PDB/Native/Formatters.h +++ b/include/llvm/DebugInfo/PDB/Native/Formatters.h @@ -23,13 +23,6 @@ break; namespace llvm { -template <> struct format_provider { - static void format(const pdb::PDB_UniqueId &V, llvm::raw_ostream &Stream, - StringRef Style) { - codeview::fmt_guid(V.Guid).format(Stream, Style); - } -}; - template <> struct format_provider { static void format(const pdb::PdbRaw_ImplVer &V, llvm::raw_ostream &Stream, StringRef Style) { diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/include/llvm/DebugInfo/PDB/Native/InfoStream.h index 37bf5f3b573c..fb8271cb5ebc 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStream.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStream.h @@ -12,6 +12,7 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringMap.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" @@ -39,7 +40,7 @@ class InfoStream { PdbRaw_ImplVer getVersion() const; uint32_t getSignature() const; uint32_t getAge() const; - PDB_UniqueId getGuid() const; + codeview::GUID getGuid() const; uint32_t getNamedStreamMapByteSize() const; PdbRaw_Features getFeatures() const; @@ -71,7 +72,7 @@ class InfoStream { // Due to the aforementioned limitations with `Signature`, this is a new // signature present on VC70 and higher PDBs which is guaranteed to be // universally unique. 
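The "universally unique" signature described here is what the member change continuing below stores: the PDB-side PDB_UniqueId gives way to the codeview::GUID value type introduced earlier in this patch, whose six comparison operators reduce to memcmp over the 16 raw bytes and which gained a format_provider for formatv. A small hypothetical usage sketch (assumes this revision's headers):

    #include "llvm/DebugInfo/CodeView/GUID.h"
    #include "llvm/Support/FormatVariadic.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void reportGuidMismatch(const codeview::GUID &A,
                                   const codeview::GUID &B) {
      if (A != B) // equality/ordering are plain memcmp over Guid[16]
        outs() << formatv("guid mismatch: {0} != {1}\n", A, B);
    }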
- PDB_UniqueId Guid; + codeview::GUID Guid; BinarySubstreamRef SubNamedStreams; diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h index 90c28a90d252..c6cb0e221e70 100644 --- a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h @@ -37,7 +37,7 @@ class InfoStreamBuilder { void setVersion(PdbRaw_ImplVer V); void setSignature(uint32_t S); void setAge(uint32_t A); - void setGuid(PDB_UniqueId G); + void setGuid(codeview::GUID G); void addFeature(PdbRaw_FeatureSig Sig); uint32_t finalize(); @@ -54,7 +54,7 @@ class InfoStreamBuilder { PdbRaw_ImplVer Ver; uint32_t Sig; uint32_t Age; - PDB_UniqueId Guid; + codeview::GUID Guid; NamedStreamMap &NamedStreams; }; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h index ddb7f811da38..587c7ff2b092 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h @@ -27,7 +27,7 @@ class NativeExeSymbol : public NativeRawSymbol { uint32_t getAge() const override; std::string getSymbolsFileName() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; bool hasCTypes() const override; bool hasPrivateSymbols() const override; diff --git a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h index 66a9eae28e23..2c6548dcce21 100644 --- a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h +++ b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h @@ -111,7 +111,7 @@ class NativeRawSymbol : public IPDBRawSymbol { getVirtualBaseTableType() const override; PDB_DataKind getDataKind() const override; PDB_SymType getSymTag() const override; - PDB_UniqueId getGuid() const override; + codeview::GUID getGuid() const override; int32_t getOffset() const override; int32_t getThisAdjust() const override; int32_t getVirtualBasePointerOffset() const override; diff --git a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h deleted file mode 100644 index 196ba4d6ffbd..000000000000 --- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- PDBTypeServerHandler.h -----------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H
-#define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H
-
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
-#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
-#include "llvm/DebugInfo/PDB/PDBTypes.h"
-
-#include 
-#include 
-
-namespace llvm {
-namespace pdb {
-class NativeSession;
-
-class PDBTypeServerHandler : public codeview::TypeServerHandler {
-public:
-  PDBTypeServerHandler(bool RevisitAlways = false);
-
-  void addSearchPath(StringRef Path);
-  Expected handle(codeview::TypeServer2Record &TS,
-                  codeview::TypeVisitorCallbacks &Callbacks) override;
-
-private:
-  Expected handleInternal(PDBFile &File,
-                          codeview::TypeVisitorCallbacks &Callbacks);
-
-  bool RevisitAlways;
-  std::unique_ptr Session;
-  StringSet<> SearchPaths;
-};
-}
-}
-
-#endif
diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index a3cdd3f09a44..b6321cbf45a8 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H
 #define LLVM_DEBUGINFO_PDB_RAW_RAWTYPES_H

+#include "llvm/DebugInfo/CodeView/GUID.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/Support/Endian.h"

@@ -268,17 +269,6 @@ struct PublicsStreamHeader {
   support::ulittle32_t NumSections;
 };

-/// Defines a 128-bit unique identifier. This maps to a GUID on Windows, but
-/// is abstracted here for the purposes of non-Windows platforms that don't have
-/// the GUID structure defined.
-struct PDB_UniqueId {
-  uint8_t Guid[16];
-};
-
-inline bool operator==(const PDB_UniqueId &LHS, const PDB_UniqueId &RHS) {
-  return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid));
-}
-
 // The header preceding the global TPI stream.
 // This corresponds to `HDR` in PDB/dbi/tpi.h.
 struct TpiStreamHeader {
@@ -312,7 +302,7 @@ struct InfoStreamHeader {
   support::ulittle32_t Version;
   support::ulittle32_t Signature;
   support::ulittle32_t Age;
-  PDB_UniqueId Guid;
+  codeview::GUID Guid;
 };

 /// The header preceding the /names stream.
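The struct removed above is the whole story of this migration: PDB_UniqueId was a bare 16-byte buffer with byte-wise equality, and the patch replaces every use of it with the shared codeview::GUID type. A minimal stand-in showing the same shape (illustrative names, not the LLVM type):

    // Sketch of the 128-bit identifier that moved from pdb::PDB_UniqueId to
    // codeview::GUID; the struct and field names here are stand-ins.
    #include <cstdint>
    #include <cstring>

    struct Guid128 {
      uint8_t Bytes[16];
    };

    // Same byte-wise comparison the removed PDB_UniqueId operator== used.
    inline bool operator==(const Guid128 &LHS, const Guid128 &RHS) {
      return 0 == std::memcmp(LHS.Bytes, RHS.Bytes, sizeof(LHS.Bytes));
    }
    inline bool operator!=(const Guid128 &LHS, const Guid128 &RHS) {
      return !(LHS == RHS);
    }

Centralizing the type under llvm/DebugInfo/CodeView lets the PDB, CodeView, and YAML layers share one definition instead of converting between per-library 16-byte structs.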
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h index 156abb59a6be..c1edec7a26fe 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h @@ -10,84 +10,13 @@ #ifndef LLVM_DEBUGINFO_PDB_TPIHASHING_H #define LLVM_DEBUGINFO_PDB_TPIHASHING_H -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" -#include "llvm/Support/BinaryStreamArray.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" -#include -#include namespace llvm { namespace pdb { -class TpiHashUpdater : public codeview::TypeVisitorCallbacks { -public: - TpiHashUpdater() = default; - -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - virtual Error visitKnownRecord(codeview::CVType &CVR, \ - codeview::Name##Record &Record) override { \ - visitKnownRecordImpl(CVR, Record); \ - return Error::success(); \ - } -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD(EnumName, EnumVal, Name) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/CodeViewTypes.def" - -private: - template - void visitKnownRecordImpl(codeview::CVType &CVR, RecordKind &Record) { - CVR.Hash = 0; - } - - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::ClassRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::EnumRecord &Rec); - void visitKnownRecordImpl(codeview::CVType &CVR, codeview::UnionRecord &Rec); -}; - -class TpiHashVerifier : public codeview::TypeVisitorCallbacks { -public: - TpiHashVerifier(FixedStreamArray &HashValues, - uint32_t NumHashBuckets) - : HashValues(HashValues), NumHashBuckets(NumHashBuckets) {} - - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UdtModSourceLineRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::ClassRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::EnumRecord &Rec) override; - Error visitKnownRecord(codeview::CVType &CVR, - codeview::UnionRecord &Rec) override; - Error visitTypeBegin(codeview::CVType &CVR) override; - -private: - Error verifySourceLine(codeview::TypeIndex TI); - - Error errorInvalidHash() { - return make_error( - raw_error_code::invalid_tpi_hash, - "Type index is 0x" + - utohexstr(codeview::TypeIndex::FirstNonSimpleIndex + Index)); - } - - FixedStreamArray HashValues; - codeview::CVType RawRecord; - uint32_t NumHashBuckets; - uint32_t Index = -1; -}; +Expected hashTypeRecord(const llvm::codeview::CVType &Type); } // end namespace pdb } // end namespace llvm diff --git a/include/llvm/DebugInfo/PDB/PDBExtras.h b/include/llvm/DebugInfo/PDB/PDBExtras.h index 3a38f21b94c8..778121c8eb79 100644 --- a/include/llvm/DebugInfo/PDB/PDBExtras.h +++ b/include/llvm/DebugInfo/PDB/PDBExtras.h @@ -32,7 +32,6 @@ raw_ostream &operator<<(raw_ostream &OS, const PDB_Checksum &Checksum); raw_ostream &operator<<(raw_ostream &OS, const PDB_Lang &Lang); raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag); raw_ostream 
&operator<<(raw_ostream &OS, const PDB_MemberAccess &Access); -raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Guid); raw_ostream &operator<<(raw_ostream &OS, const PDB_UdtType &Type); raw_ostream &operator<<(raw_ostream &OS, const PDB_Machine &Machine); diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index c1acca386820..27b5457fc8ff 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -22,6 +22,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" @@ -289,21 +290,21 @@ class CompileOnDemandLayer { // FIXME: We should track and free associated resources (unused compile // callbacks, uncompiled IR, and no-longer-needed/reachable function // implementations). - // FIXME: Return Error once the JIT APIs are Errorized. - bool updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { + Error updatePointer(std::string FuncName, JITTargetAddress FnBodyAddr) { //Find out which logical dylib contains our symbol auto LDI = LogicalDylibs.begin(); for (auto LDE = LogicalDylibs.end(); LDI != LDE; ++LDI) { - if (auto LMResources = LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { + if (auto LMResources = + LDI->getLogicalModuleResourcesForSymbol(FuncName, false)) { Module &SrcM = LMResources->SourceModule->getResource(); std::string CalledFnName = mangle(FuncName, SrcM.getDataLayout()); - if (auto EC = LMResources->StubsMgr->updatePointer(CalledFnName, FnBodyAddr)) - return false; - else - return true; + if (auto Err = LMResources->StubsMgr->updatePointer(CalledFnName, + FnBodyAddr)) + return Err; + return Error::success(); } } - return false; + return make_error(FuncName); } private: @@ -363,11 +364,8 @@ class CompileOnDemandLayer { }); } - auto EC = LD.StubsMgr->createStubs(StubInits); - (void)EC; - // FIXME: This should be propagated back to the user. Stub creation may - // fail for remote JITs. - assert(!EC && "Error generating stubs"); + if (auto Err = LD.StubsMgr->createStubs(StubInits)) + return Err; } // If this module doesn't contain any globals, aliases, or module flags then diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index a9778514b9f1..0c1862c5c3ea 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -135,12 +135,13 @@ class RTDyldMemoryManager : public MCJITMemoryManager, virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); -private: +protected: struct EHFrame { uint8_t *Addr; size_t Size; }; - std::vector EHFrames; + typedef std::vector EHFrameInfos; + EHFrameInfos EHFrames; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 801e88aba4d1..850964afc307 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -143,11 +143,15 @@ namespace CallingConv { /// System V ABI, used on most non-Windows systems. X86_64_SysV = 78, - /// \brief The C convention as implemented on Windows/x86-64. 
This - /// convention differs from the more common \c X86_64_SysV convention - /// in a number of ways, most notably in that XMM registers used to pass - /// arguments are shadowed by GPRs, and vice versa. - X86_64_Win64 = 79, + /// \brief The C convention as implemented on Windows/x86-64 and + /// AArch64. This convention differs from the more common + /// \c X86_64_SysV convention in a number of ways, most notably in + /// that XMM registers used to pass arguments are shadowed by GPRs, + /// and vice versa. + /// On AArch64, this is identical to the normal C (AAPCS) calling + /// convention for normal functions, but floats are passed in integer + /// registers to variadic functions. + Win64 = 79, /// \brief MSVC calling convention that passes vectors and vector aggregates /// in SSE registers. diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index 2e72c41ccee3..0094fd54992a 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -598,6 +598,10 @@ class ConstantDataSequential : public ConstantData { /// specified element in the low bits of a uint64_t. uint64_t getElementAsInteger(unsigned i) const; + /// If this is a sequential container of integers (of any size), return the + /// specified element as an APInt. + APInt getElementAsAPInt(unsigned i) const; + /// If this is a sequential container of floating point type, return the /// specified element as an APFloat. APFloat getElementAsAPFloat(unsigned i) const; @@ -761,6 +765,10 @@ class ConstantDataVector final : public ConstantDataSequential { /// i32/i64/float/double) and must be a ConstantFP or ConstantInt. static Constant *getSplat(unsigned NumElts, Constant *Elt); + /// Returns true if this is a splat constant, meaning that all elements have + /// the same value. + bool isSplat() const; + /// If this is a splat constant, meaning that all of the elements have the /// same value, return that value. Otherwise return NULL. Constant *getSplatValue() const; diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 8e6bb4baccaf..6a14f783005d 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -674,32 +674,37 @@ namespace llvm { /// Create a descriptor for an imported module. /// \param Context The scope this module is imported into - /// \param NS The namespace being imported here - /// \param Line Line number + /// \param NS The namespace being imported here. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, DINamespace *NS, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param NS An aliased namespace - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param NS An aliased namespace. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, - DIImportedEntity *NS, unsigned Line); + DIImportedEntity *NS, DIFile *File, + unsigned Line); /// Create a descriptor for an imported module. - /// \param Context The scope this module is imported into - /// \param M The module being imported here - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param M The module being imported here + /// \param File File where the declaration is located. 
+ /// \param Line Line number of the declaration. DIImportedEntity *createImportedModule(DIScope *Context, DIModule *M, - unsigned Line); + DIFile *File, unsigned Line); /// Create a descriptor for an imported function. - /// \param Context The scope this module is imported into - /// \param Decl The declaration (or definition) of a function, type, or - /// variable - /// \param Line Line number + /// \param Context The scope this module is imported into. + /// \param Decl The declaration (or definition) of a function, type, or + /// variable. + /// \param File File where the declaration is located. + /// \param Line Line number of the declaration. DIImportedEntity *createImportedDeclaration(DIScope *Context, DINode *Decl, - unsigned Line, + DIFile *File, unsigned Line, StringRef Name = ""); /// Insert a new llvm.dbg.declare intrinsic call. diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 9374fe4fae76..678a43ae7926 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -435,10 +435,10 @@ class DIScope : public DINode { /// Return the raw underlying file. /// - /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file - /// (it\em is the file). If \c this is an \a DIFile, we need to return \c - /// this. Otherwise, return the first operand, which is where all other - /// subclasses store their file pointer. + /// A \a DIFile is a \a DIScope, but it doesn't point at a separate file (it + /// \em is the file). If \c this is an \a DIFile, we need to return \c this. + /// Otherwise, return the first operand, which is where all other subclasses + /// store their file pointer. Metadata *getRawFile() const { return isa(this) ? const_cast(this) : static_cast(getOperand(0)); @@ -2551,32 +2551,32 @@ class DIImportedEntity : public DINode { static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name, + DIFile *File, unsigned Line, StringRef Name, StorageType Storage, bool ShouldCreate = true) { - return getImpl(Context, Tag, Scope, Entity, Line, + return getImpl(Context, Tag, Scope, Entity, File, Line, getCanonicalMDString(Context, Name), Storage, ShouldCreate); } static DIImportedEntity *getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate = true); TempDIImportedEntity cloneImpl() const { return getTemporary(getContext(), getTag(), getScope(), getEntity(), - getLine(), getName()); + getFile(), getLine(), getName()); } public: DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, DIScope *Scope, DINodeRef Entity, - unsigned Line, StringRef Name = ""), - (Tag, Scope, Entity, Line, Name)) + DIFile *File, unsigned Line, StringRef Name = ""), + (Tag, Scope, Entity, File, Line, Name)) DEFINE_MDNODE_GET(DIImportedEntity, (unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name), - (Tag, Scope, Entity, Line, Name)) + Metadata *File, unsigned Line, MDString *Name), + (Tag, Scope, Entity, File, Line, Name)) TempDIImportedEntity clone() const { return cloneImpl(); } @@ -2584,10 +2584,12 @@ class DIImportedEntity : public DINode { DIScope *getScope() const { return cast_or_null(getRawScope()); } DINodeRef getEntity() const { return DINodeRef(getRawEntity()); } StringRef getName() const { return getStringOperand(2); } + DIFile *getFile() const { return 
cast_or_null(getRawFile()); } Metadata *getRawScope() const { return getOperand(0); } Metadata *getRawEntity() const { return getOperand(1); } MDString *getRawName() const { return getOperandAs(2); } + Metadata *getRawFile() const { return getOperand(3); } static bool classof(const Metadata *MD) { return MD->getMetadataID() == DIImportedEntityKind; diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h index e10d14c19793..5b21a2c83e4a 100644 --- a/include/llvm/IR/Dominators.h +++ b/include/llvm/IR/Dominators.h @@ -34,22 +34,31 @@ class Module; class raw_ostream; extern template class DomTreeNodeBase; -extern template class DominatorTreeBase; +extern template class DominatorTreeBase; // DomTree +extern template class DominatorTreeBase; // PostDomTree namespace DomTreeBuilder { -extern template void Calculate( - DominatorTreeBaseByGraphTraits> &DT, Function &F); +using BBDomTree = DomTreeBase; +using BBPostDomTree = PostDomTreeBase; -extern template void Calculate>( - DominatorTreeBaseByGraphTraits>> &DT, - Function &F); +extern template void Calculate(BBDomTree &DT, Function &F); +extern template void Calculate(BBPostDomTree &DT, + Function &F); -extern template bool Verify( - const DominatorTreeBaseByGraphTraits> &DT); +extern template void InsertEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void InsertEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); -extern template bool Verify>( - const DominatorTreeBaseByGraphTraits>> - &DT); +extern template void DeleteEdge(BBDomTree &DT, BasicBlock *From, + BasicBlock *To); +extern template void DeleteEdge(BBPostDomTree &DT, + BasicBlock *From, + BasicBlock *To); + +extern template bool Verify(const BBDomTree &DT); +extern template bool Verify(const BBPostDomTree &DT); } // namespace DomTreeBuilder using DomTreeNode = DomTreeNodeBase; @@ -122,14 +131,12 @@ template <> struct DenseMapInfo { /// the dominator tree is initially constructed may still exist in the tree, /// even if the tree is properly updated. Calling code should not rely on the /// preceding statements; this is stated only to assist human understanding. -class DominatorTree : public DominatorTreeBase { -public: - using Base = DominatorTreeBase; +class DominatorTree : public DominatorTreeBase { + public: + using Base = DominatorTreeBase; - DominatorTree() : DominatorTreeBase(false) {} - explicit DominatorTree(Function &F) : DominatorTreeBase(false) { - recalculate(F); - } + DominatorTree() = default; + explicit DominatorTree(Function &F) { recalculate(F); } /// Handle invalidation explicitly. 
bool invalidate(Function &F, const PreservedAnalyses &PA, diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td index 8ac56e03be6a..098245344725 100644 --- a/include/llvm/IR/IntrinsicsHexagon.td +++ b/include/llvm/IR/IntrinsicsHexagon.td @@ -32,16 +32,6 @@ class Hexagon_qi_mem_Intrinsic : Hexagon_Intrinsic; - -// -// DEF_FUNCTION_TYPE_1(void_ftype_SI,BT_VOID,BT_INT) -> -// Hexagon_void_si_Intrinsic -// -class Hexagon_void_si_Intrinsic - : Hexagon_Intrinsic; - // // DEF_FUNCTION_TYPE_1(HI_ftype_SI,BT_I16,BT_INT) -> // Hexagon_hi_si_Intrinsic @@ -4959,11 +4949,25 @@ Hexagon_di_di_Intrinsic<"HEXAGON_S2_interleave">; // def int_hexagon_S2_deinterleave : Hexagon_di_di_Intrinsic<"HEXAGON_S2_deinterleave">; + // // BUILTIN_INFO(HEXAGON.dcfetch_A,v_ftype_DI*,1) // def int_hexagon_prefetch : -Hexagon_void_si_Intrinsic<"HEXAGON_prefetch">; +Hexagon_Intrinsic<"HEXAGON_prefetch", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleana : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleana", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dccleaninva : +Hexagon_Intrinsic<"HEXAGON_Y2_dccleaninva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dcinva : +Hexagon_Intrinsic<"HEXAGON_Y2_dcinva", [], [llvm_ptr_ty], []>; +def int_hexagon_Y2_dczeroa : +Hexagon_Intrinsic<"HEXAGON_Y2_dczeroa", [], [llvm_ptr_ty], + [IntrWriteMem, IntrArgMemOnly, IntrHasSideEffects]>; +def int_hexagon_Y4_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y4_l2fetch", [], [llvm_ptr_ty, llvm_i32_ty], []>; +def int_hexagon_Y5_l2fetch : +Hexagon_Intrinsic<"HEXAGON_Y5_l2fetch", [], [llvm_ptr_ty, llvm_i64_ty], []>; def llvm_ptr32_ty : LLVMPointerType; def llvm_ptr64_ty : LLVMPointerType; diff --git a/include/llvm/IR/IntrinsicsSystemZ.td b/include/llvm/IR/IntrinsicsSystemZ.td index 9be37d3645b2..98065bc51d99 100644 --- a/include/llvm/IR/IntrinsicsSystemZ.td +++ b/include/llvm/IR/IntrinsicsSystemZ.td @@ -373,6 +373,49 @@ let TargetPrefix = "s390" in { def int_s390_vfidb : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Instructions from the Vector Enhancements Facility 1 + def int_s390_vbperm : SystemZBinaryConv<"vbperm", llvm_v2i64_ty, + llvm_v16i8_ty>; + + def int_s390_vmslg : GCCBuiltin<"__builtin_s390_vmslg">, + Intrinsic<[llvm_v16i8_ty], + [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty, + llvm_i32_ty], [IntrNoMem]>; + + def int_s390_vfmaxdb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmindb : Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfmaxsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_s390_vfminsb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem]>; + + def int_s390_vfcesbs : SystemZBinaryConvCC; + def int_s390_vfchsbs : SystemZBinaryConvCC; + def int_s390_vfchesbs : SystemZBinaryConvCC; + + def int_s390_vftcisb : SystemZBinaryConvIntCC; + + def int_s390_vfisb : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + + // Instructions from the Vector Packed Decimal Facility + def int_s390_vlrl : GCCBuiltin<"__builtin_s390_vlrl">, + Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_s390_vstrl : GCCBuiltin<"__builtin_s390_vstrl">, + Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], + // In fact write-only but there's no property + // for that. 
+                           [IntrArgMemOnly]>;
 }

//===----------------------------------------------------------------------===//
diff --git a/include/llvm/MC/LaneBitmask.h b/include/llvm/MC/LaneBitmask.h
index 5ca06d1148e2..73b987b074db 100644
--- a/include/llvm/MC/LaneBitmask.h
+++ b/include/llvm/MC/LaneBitmask.h
@@ -75,6 +75,9 @@ namespace llvm {
     static LaneBitmask getNone() { return LaneBitmask(0); }
     static LaneBitmask getAll() { return ~LaneBitmask(0); }
+    static LaneBitmask getLane(unsigned Lane) {
+      return LaneBitmask(Type(1) << Lane);
+    }

   private:
     Type Mask = 0;
diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h
index b493ca0b0ea7..b83086c327f2 100644
--- a/include/llvm/MC/MCFixup.h
+++ b/include/llvm/MC/MCFixup.h
@@ -69,7 +69,7 @@ class MCFixup {
   /// an instruction or an assembler directive.
   const MCExpr *Value;

-  /// The byte index of start of the relocation inside the encoded instruction.
+  /// The byte index of the start of the relocation inside the MCFragment.
   uint32_t Offset;

   /// The target dependent kind of fixup item this is. The kind is used to
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index 340d8253b8c9..9150a8b5c80a 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -209,6 +209,15 @@ class MCInstrDesc {
   /// well.
   unsigned getNumOperands() const { return NumOperands; }

+  using const_opInfo_iterator = const MCOperandInfo *;
+
+  const_opInfo_iterator opInfo_begin() const { return OpInfo; }
+  const_opInfo_iterator opInfo_end() const { return OpInfo + NumOperands; }
+
+  iterator_range operands() const {
+    return make_range(opInfo_begin(), opInfo_end());
+  }
+
   /// \brief Return the number of MachineOperands that are register
   /// definitions. Register definitions always occur at the start of the
   /// machine operand list. This is the number of "outs" in the .td file,
diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h
index 060f965233e1..8e215b565fc4 100644
--- a/include/llvm/Object/COFFImportFile.h
+++ b/include/llvm/Object/COFFImportFile.h
@@ -95,7 +95,7 @@ struct COFFShortExport {
   }
 };

-std::error_code writeImportLibrary(StringRef DLLName,
+std::error_code writeImportLibrary(StringRef ImportName,
                                    StringRef Path,
                                    ArrayRef Exports,
                                    COFF::MachineTypes Machine);
diff --git a/include/llvm/Object/COFFModuleDefinition.h b/include/llvm/Object/COFFModuleDefinition.h
index a0e8eacdb7a3..be139a2833b0 100644
--- a/include/llvm/Object/COFFModuleDefinition.h
+++ b/include/llvm/Object/COFFModuleDefinition.h
@@ -16,7 +16,6 @@
 //
 //===----------------------------------------------------------------------===//

-
 #ifndef LLVM_OBJECT_COFF_MODULE_DEFINITION_H
 #define LLVM_OBJECT_COFF_MODULE_DEFINITION_H

@@ -29,6 +28,7 @@ namespace object {
 struct COFFModuleDefinition {
   std::vector Exports;
   std::string OutputFile;
+  std::string ImportName;
   uint64_t ImageBase = 0;
   uint64_t StackReserve = 0;
   uint64_t StackCommit = 0;
@@ -40,8 +40,12 @@ struct COFFModuleDefinition {
   uint32_t MinorOSVersion = 0;
 };

+// mingw and wine def files do not mangle _ for x86, which is a consequence
+// of legacy binutils' dlltool functionality. This MingwDef flag should be
+// removed once mingw stops this practice.
 Expected
-parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine);
+parseCOFFModuleDefinition(MemoryBufferRef MB, COFF::MachineTypes Machine,
+                          bool MingwDef = false);

 } // End namespace object.
 } // End namespace llvm.
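The MCInstrDesc::operands() accessor above is the usual LLVM pattern for exposing a raw array through a range: publish begin/end pointers and wrap them so callers can range-for over the table. A freestanding sketch of the same pattern, with stand-in types rather than the MC classes:

    // Sketch: a begin/end pointer pair wrapped as a range, like the
    // operands() accessor added above. All types here are illustrative.
    #include <cstdio>

    template <typename T> struct PtrRange {
      const T *BeginP, *EndP;
      const T *begin() const { return BeginP; }
      const T *end() const { return EndP; }
    };

    struct OpInfo { int OperandType; };

    struct InstrDesc {
      const OpInfo *OpInfoTable; // borrowed table, like MCInstrDesc::OpInfo
      unsigned NumOperands;
      PtrRange<OpInfo> operands() const {
        return {OpInfoTable, OpInfoTable + NumOperands};
      }
    };

    int main() {
      static const OpInfo Ops[] = {{0}, {1}, {2}};
      InstrDesc D{Ops, 3};
      for (const OpInfo &OI : D.operands()) // reads like `for (auto &Op : MCID.operands())`
        std::printf("%d\n", OI.OperandType);
    }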
diff --git a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h index 6746fd60b6cb..88a5668f0a14 100644 --- a/include/llvm/ObjectYAML/CodeViewYAMLTypes.h +++ b/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -60,6 +60,8 @@ ArrayRef toDebugT(ArrayRef, BumpPtrAllocator &Alloc); } // end namespace llvm +LLVM_YAML_DECLARE_SCALAR_TRAITS(codeview::GUID, true) + LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::LeafRecord) LLVM_YAML_DECLARE_MAPPING_TRAITS(CodeViewYAML::MemberRecord) diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def index 8eccebcd932a..09f9602a24d9 100644 --- a/include/llvm/Support/AArch64TargetParser.def +++ b/include/llvm/Support/AArch64TargetParser.def @@ -43,8 +43,9 @@ AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto") AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8") AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon") AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16") -AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") -AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe") +AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras") +AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve") #undef AARCH64_ARCH_EXT_NAME #ifndef AARCH64_CPU_NAME diff --git a/include/llvm/Support/BinaryItemStream.h b/include/llvm/Support/BinaryItemStream.h index f4b319217819..fe7e6caeaafb 100644 --- a/include/llvm/Support/BinaryItemStream.h +++ b/include/llvm/Support/BinaryItemStream.h @@ -62,32 +62,45 @@ class BinaryItemStream : public BinaryStream { return Error::success(); } - void setItems(ArrayRef ItemArray) { Items = ItemArray; } + void setItems(ArrayRef ItemArray) { + Items = ItemArray; + computeItemOffsets(); + } uint32_t getLength() override { - uint32_t Size = 0; - for (const auto &Item : Items) - Size += Traits::length(Item); - return Size; + return ItemEndOffsets.empty() ? 0 : ItemEndOffsets.back(); } private: - Expected translateOffsetIndex(uint32_t Offset) const { + void computeItemOffsets() { + ItemEndOffsets.clear(); + ItemEndOffsets.reserve(Items.size()); uint32_t CurrentOffset = 0; - uint32_t CurrentIndex = 0; for (const auto &Item : Items) { - if (CurrentOffset >= Offset) - break; - CurrentOffset += Traits::length(Item); - ++CurrentIndex; + uint32_t Len = Traits::length(Item); + assert(Len > 0 && "no empty items"); + CurrentOffset += Len; + ItemEndOffsets.push_back(CurrentOffset); } - if (CurrentOffset != Offset) + } + + Expected translateOffsetIndex(uint32_t Offset) { + // Make sure the offset is somewhere in our items array. + if (Offset >= getLength()) return make_error(stream_error_code::stream_too_short); - return CurrentIndex; + ++Offset; + auto Iter = + std::lower_bound(ItemEndOffsets.begin(), ItemEndOffsets.end(), Offset); + size_t Idx = std::distance(ItemEndOffsets.begin(), Iter); + assert(Idx < Items.size() && "binary search for offset failed"); + return Idx; } llvm::support::endianness Endian; ArrayRef Items; + + // Sorted vector of offsets to accelerate lookup. + std::vector ItemEndOffsets; }; } // end namespace llvm diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h index 017b4973f1ff..bcbd2bec5722 100644 --- a/include/llvm/Support/Format.h +++ b/include/llvm/Support/Format.h @@ -125,30 +125,39 @@ inline format_object format(const char *Fmt, const Ts &... 
Vals) {
  return format_object(Fmt, Vals...);
}

-/// This is a helper class used for left_justify() and right_justify().
+/// This is a helper class for left_justify, right_justify, and center_justify.
 class FormattedString {
+public:
+  enum Justification { JustifyNone, JustifyLeft, JustifyRight, JustifyCenter };
+  FormattedString(StringRef S, unsigned W, Justification J)
+      : Str(S), Width(W), Justify(J) {}
+
+private:
   StringRef Str;
   unsigned Width;
-  bool RightJustify;
+  Justification Justify;
   friend class raw_ostream;
-
-public:
-    FormattedString(StringRef S, unsigned W, bool R)
-      : Str(S), Width(W), RightJustify(R) { }
 };

 /// left_justify - append spaces after string so total output is
 /// \p Width characters. If \p Str is larger than \p Width, full string
 /// is written with no padding.
 inline FormattedString left_justify(StringRef Str, unsigned Width) {
-  return FormattedString(Str, Width, false);
+  return FormattedString(Str, Width, FormattedString::JustifyLeft);
 }

 /// right_justify - add spaces before string so total output is
 /// \p Width characters. If \p Str is larger than \p Width, full string
 /// is written with no padding.
 inline FormattedString right_justify(StringRef Str, unsigned Width) {
-  return FormattedString(Str, Width, true);
+  return FormattedString(Str, Width, FormattedString::JustifyRight);
+}
+
+/// center_justify - add spaces before and after string so total output is
+/// \p Width characters. If \p Str is larger than \p Width, full string
+/// is written with no padding.
+inline FormattedString center_justify(StringRef Str, unsigned Width) {
+  return FormattedString(Str, Width, FormattedString::JustifyCenter);
 }

 /// This is a helper class used for format_hex() and format_decimal().
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h
index 394a45387d8a..706320fed9a7 100644
--- a/include/llvm/Support/GenericDomTree.h
+++ b/include/llvm/Support/GenericDomTree.h
@@ -41,27 +41,21 @@ namespace llvm {

-template class DominatorTreeBase;
+template
+class DominatorTreeBase;

-namespace detail {
-
-template struct DominatorTreeBaseTraits {
-  static_assert(std::is_pointer::value,
-                "Currently NodeRef must be a pointer type.");
-  using type = DominatorTreeBase<
-      typename std::remove_pointer::type>;
-};
-
-} // end namespace detail
-
-template
-using DominatorTreeBaseByGraphTraits =
-    typename detail::DominatorTreeBaseTraits::type;
+namespace DomTreeBuilder {
+template
+struct SemiNCAInfo;
+} // namespace DomTreeBuilder

 /// \brief Base class for the actual dominator tree node.
 template class DomTreeNodeBase {
   friend struct PostDominatorTree;
-  template friend class DominatorTreeBase;
+  friend class DominatorTreeBase;
+  friend class DominatorTreeBase;
+  friend struct DomTreeBuilder::SemiNCAInfo>;
+  friend struct DomTreeBuilder::SemiNCAInfo>;

   NodeT *TheBB;
   DomTreeNodeBase *IDom;
@@ -192,58 +186,69 @@ void PrintDomTree(const DomTreeNodeBase *N, raw_ostream &O,
 }

 namespace DomTreeBuilder {
-template
-struct SemiNCAInfo;
+// The routines below are provided in a separate header but referenced here.
+template
+void Calculate(DomTreeT &DT, FuncT &F);

-// The calculate routine is provided in a separate header but referenced here.
-template
-void Calculate(DominatorTreeBaseByGraphTraits> &DT, FuncT &F);
+template
+void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
+                typename DomTreeT::NodePtr To);

-// The verify function is provided in a separate header but referenced here.
-template -bool Verify(const DominatorTreeBaseByGraphTraits> &DT); +template +void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From, + typename DomTreeT::NodePtr To); + +template +bool Verify(const DomTreeT &DT); } // namespace DomTreeBuilder /// \brief Core dominator tree base class. /// /// This class is a generic template over graph nodes. It is instantiated for /// various graphs in the LLVM IR or in the code generator. -template class DominatorTreeBase { +template +class DominatorTreeBase { protected: std::vector Roots; - bool IsPostDominators; using DomTreeNodeMapType = DenseMap>>; DomTreeNodeMapType DomTreeNodes; DomTreeNodeBase *RootNode; + using ParentPtr = decltype(std::declval()->getParent()); + ParentPtr Parent = nullptr; mutable bool DFSInfoValid = false; mutable unsigned int SlowQueries = 0; - friend struct DomTreeBuilder::SemiNCAInfo; - using SNCAInfoTy = DomTreeBuilder::SemiNCAInfo; + friend struct DomTreeBuilder::SemiNCAInfo; public: - explicit DominatorTreeBase(bool isPostDom) : IsPostDominators(isPostDom) {} + static_assert(std::is_pointer::NodeRef>::value, + "Currently DominatorTreeBase supports only pointer nodes"); + using NodeType = NodeT; + using NodePtr = NodeT *; + static constexpr bool IsPostDominator = IsPostDom; + + DominatorTreeBase() {} DominatorTreeBase(DominatorTreeBase &&Arg) : Roots(std::move(Arg.Roots)), - IsPostDominators(Arg.IsPostDominators), DomTreeNodes(std::move(Arg.DomTreeNodes)), - RootNode(std::move(Arg.RootNode)), - DFSInfoValid(std::move(Arg.DFSInfoValid)), - SlowQueries(std::move(Arg.SlowQueries)) { + RootNode(Arg.RootNode), + Parent(Arg.Parent), + DFSInfoValid(Arg.DFSInfoValid), + SlowQueries(Arg.SlowQueries) { Arg.wipe(); } DominatorTreeBase &operator=(DominatorTreeBase &&RHS) { Roots = std::move(RHS.Roots); - IsPostDominators = RHS.IsPostDominators; DomTreeNodes = std::move(RHS.DomTreeNodes); - RootNode = std::move(RHS.RootNode); - DFSInfoValid = std::move(RHS.DFSInfoValid); - SlowQueries = std::move(RHS.SlowQueries); + RootNode = RHS.RootNode; + Parent = RHS.Parent; + DFSInfoValid = RHS.DFSInfoValid; + SlowQueries = RHS.SlowQueries; RHS.wipe(); return *this; } @@ -259,11 +264,12 @@ template class DominatorTreeBase { /// isPostDominator - Returns true if analysis based of postdoms /// - bool isPostDominator() const { return IsPostDominators; } + bool isPostDominator() const { return IsPostDominator; } /// compare - Return false if the other dominator tree base matches this /// dominator tree base. Otherwise return true. bool compare(const DominatorTreeBase &Other) const { + if (Parent != Other.Parent) return true; const DomTreeNodeMapType &OtherDomTreeNodes = Other.DomTreeNodes; if (DomTreeNodes.size() != OtherDomTreeNodes.size()) @@ -443,10 +449,50 @@ template class DominatorTreeBase { const_cast(B)); } + bool isVirtualRoot(const DomTreeNodeBase *A) const { + return isPostDominator() && !A->getBlock(); + } + //===--------------------------------------------------------------------===// // API to update (Post)DominatorTree information based on modifications to // the CFG... + /// Inform the dominator tree about a CFG edge insertion and update the tree. + /// + /// This function has to be called just before or just after making the update + /// on the actual CFG. There cannot be any other updates that the dominator + /// tree doesn't know about. + /// + /// Note that for postdominators it automatically takes care of inserting + /// a reverse edge internally (so there's no need to swap the parameters). 
+  ///
+  void insertEdge(NodeT *From, NodeT *To) {
+    assert(From);
+    assert(To);
+    assert(From->getParent() == Parent);
+    assert(To->getParent() == Parent);
+    DomTreeBuilder::InsertEdge(*this, From, To);
+  }
+
+  /// Inform the dominator tree about a CFG edge deletion and update the tree.
+  ///
+  /// This function has to be called just after making the update
+  /// on the actual CFG. There cannot be any other updates that the dominator
+  /// tree doesn't know about. The only exception is when the deletion that the
+  /// tree is informed about makes some (dominator) subtree unreachable -- in
+  /// this case, it is fine to perform deletions within this subtree.
+  ///
+  /// Note that for postdominators it automatically takes care of deleting
+  /// a reverse edge internally (so there's no need to swap the parameters).
+  ///
+  void deleteEdge(NodeT *From, NodeT *To) {
+    assert(From);
+    assert(To);
+    assert(From->getParent() == Parent);
+    assert(To->getParent() == Parent);
+    DomTreeBuilder::DeleteEdge(*this, From, To);
+  }
+
   /// Add a new node to the dominator tree information.
   ///
   /// This creates a new node as a child of DomBB dominator node, linking it
@@ -530,7 +576,7 @@ template class DominatorTreeBase {
   /// splitBlock - BB is split and now it has one successor. Update dominator
   /// tree to reflect this change.
   void splitBlock(NodeT *NewBB) {
-    if (this->IsPostDominators)
+    if (IsPostDominator)
       Split>(NewBB);
     else
       Split(NewBB);
@@ -607,37 +653,33 @@ template class DominatorTreeBase {
   template void recalculate(FT &F) {
     using TraitsTy = GraphTraits;
     reset();
+    Parent = &F;

-    if (!this->IsPostDominators) {
+    if (!IsPostDominator) {
       // Initialize root
       NodeT *entry = TraitsTy::getEntryNode(&F);
       addRoot(entry);
-
-      DomTreeBuilder::Calculate(*this, F);
     } else {
       // Initialize the roots list
       for (auto *Node : nodes(&F))
         if (TraitsTy::child_begin(Node) == TraitsTy::child_end(Node))
           addRoot(Node);
-
-      DomTreeBuilder::Calculate>(*this, F);
     }
+
+    DomTreeBuilder::Calculate(*this, F);
   }

   /// verify - check parent and sibling property
-  bool verify() const {
-    return this->isPostDominator()
-               ? DomTreeBuilder::Verify>(*this)
-               : DomTreeBuilder::Verify(*this);
-  }
+  bool verify() const { return DomTreeBuilder::Verify(*this); }

 protected:
   void addRoot(NodeT *BB) { this->Roots.push_back(BB); }

   void reset() {
     DomTreeNodes.clear();
-    this->Roots.clear();
+    Roots.clear();
     RootNode = nullptr;
+    Parent = nullptr;
     DFSInfoValid = false;
     SlowQueries = 0;
   }
@@ -719,13 +761,21 @@ template class DominatorTreeBase {
   void wipe() {
     DomTreeNodes.clear();
     RootNode = nullptr;
+    Parent = nullptr;
   }
 };

+template
+using DomTreeBase = DominatorTreeBase;
+
+template
+using PostDomTreeBase = DominatorTreeBase;
+
 // These two functions are declared out of line as a workaround for building
 // with old (< r147295) versions of clang because of pr11642.
-template
-bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const {
+template
+bool DominatorTreeBase::dominates(const NodeT *A,
+                                  const NodeT *B) const {
   if (A == B)
     return true;

@@ -735,9 +785,9 @@ bool DominatorTreeBase::dominates(const NodeT *A, const NodeT *B) const {
   return dominates(getNode(const_cast(A)), getNode(const_cast(B)));
 }

-template
-bool DominatorTreeBase::properlyDominates(const NodeT *A,
-                                          const NodeT *B) const {
+template
+bool DominatorTreeBase::properlyDominates(
+    const NodeT *A, const NodeT *B) const {
   if (A == B)
     return false;

diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index a0fec668e05c..be90afa4c3c8 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -20,15 +20,28 @@
 /// out that the theoretically slower O(n*log(n)) implementation is actually
 /// faster than the almost-linear O(n*alpha(n)) version, even for large CFGs.
 ///
+/// The file uses the Depth Based Search algorithm to perform incremental
+/// updates (insertions and deletions). The implemented algorithm is based on
+/// this publication:
+///
+///   An Experimental Study of Dynamic Dominators
+///   Loukas Georgiadis, et al., April 12 2016, pp. 5-7, 9-10:
+///   https://arxiv.org/pdf/1604.02711.pdf
+///
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
 #define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H

+#include 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/GenericDomTree.h"

+#define DEBUG_TYPE "dom-tree-builder"
+
 namespace llvm {
 namespace DomTreeBuilder {

@@ -46,13 +59,14 @@ struct ChildrenGetter {
   }
 };

-// Information record used by Semi-NCA during tree construction.
-template
+template
 struct SemiNCAInfo {
-  using NodePtr = NodeT *;
-  using DomTreeT = DominatorTreeBase;
+  using NodePtr = typename DomTreeT::NodePtr;
+  using NodeT = typename DomTreeT::NodeType;
   using TreeNodePtr = DomTreeNodeBase *;
+  static constexpr bool IsPostDom = DomTreeT::IsPostDominator;

+  // Information record used by Semi-NCA during tree construction.
   struct InfoRec {
     unsigned DFSNum = 0;
     unsigned Parent = 0;
@@ -62,11 +76,13 @@ struct SemiNCAInfo {
     SmallVector ReverseChildren;
   };

-  std::vector NumToNode;
+  // Number to node mapping is 1-based. Initialize the mapping to start with
+  // a dummy element.
+  std::vector NumToNode = {nullptr};
   DenseMap NodeToInfo;

   void clear() {
-    NumToNode.clear();
+    NumToNode = {nullptr}; // Restore to initial state with a dummy start node.
     NodeToInfo.clear();
   }

@@ -90,12 +106,28 @@ struct SemiNCAInfo {
     // Add a new tree node for this NodeT, and link it as a child of
     // IDomNode
     return (DT.DomTreeNodes[BB] = IDomNode->addChild(
-                llvm::make_unique>(BB, IDomNode)))
+                llvm::make_unique>(BB, IDomNode)))
         .get();
   }

   static bool AlwaysDescend(NodePtr, NodePtr) { return true; }

+  struct BlockNamePrinter {
+    NodePtr N;
+
+    BlockNamePrinter(NodePtr Block) : N(Block) {}
+    BlockNamePrinter(TreeNodePtr TN) : N(TN ? TN->getBlock() : nullptr) {}
+
+    friend raw_ostream &operator<<(raw_ostream &O, const BlockNamePrinter &BP) {
+      if (!BP.N)
+        O << "nullptr";
+      else
+        BP.N->printAsOperand(O, false);
+
+      return O;
+    }
+  };
+
   // Custom DFS implementation which can skip nodes based on a provided
   // predicate.
It also collects ReverseChildren so that we don't have to spend // time getting predecessors in SemiNCA. @@ -177,44 +209,42 @@ struct SemiNCAInfo { return VInInfo.Label; } - template - void runSemiNCA(DomTreeT &DT, unsigned NumBlocks) { - // Step #1: Number blocks in depth-first order and initialize variables used - // in later stages of the algorithm. - const unsigned N = doFullDFSWalk(DT, AlwaysDescend); - - // It might be that some blocks did not get a DFS number (e.g., blocks of - // infinite loops). In these cases an artificial exit node is required. - const bool MultipleRoots = - DT.Roots.size() > 1 || (DT.isPostDominator() && N != NumBlocks); - + // This function requires DFS to be run before calling it. + void runSemiNCA(DomTreeT &DT, const unsigned MinLevel = 0) { + const unsigned NextDFSNum(NumToNode.size()); // Initialize IDoms to spanning tree parents. - for (unsigned i = 1; i <= N; ++i) { + for (unsigned i = 1; i < NextDFSNum; ++i) { const NodePtr V = NumToNode[i]; auto &VInfo = NodeToInfo[V]; VInfo.IDom = NumToNode[VInfo.Parent]; } - // Step #2: Calculate the semidominators of all vertices. - for (unsigned i = N; i >= 2; --i) { + // Step #1: Calculate the semidominators of all vertices. + for (unsigned i = NextDFSNum - 1; i >= 2; --i) { NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; // Initialize the semi dominator to point to the parent node. WInfo.Semi = WInfo.Parent; - for (const auto &N : WInfo.ReverseChildren) - if (NodeToInfo.count(N)) { // Only if this predecessor is reachable! - unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; - if (SemiU < WInfo.Semi) - WInfo.Semi = SemiU; - } + for (const auto &N : WInfo.ReverseChildren) { + if (NodeToInfo.count(N) == 0) // Skip unreachable predecessors. + continue; + + const TreeNodePtr TN = DT.getNode(N); + // Skip predecessors whose level is above the subtree we are processing. + if (TN && TN->getLevel() < MinLevel) + continue; + + unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi; + if (SemiU < WInfo.Semi) WInfo.Semi = SemiU; + } } - // Step #3: Explicitly define the immediate dominator of each vertex. + // Step #2: Explicitly define the immediate dominator of each vertex. // IDom[i] = NCA(SDom[i], SpanningTreeParent(i)). // Note that the parents were stored in IDoms and later got invalidated // during path compression in Eval. - for (unsigned i = 2; i <= N; ++i) { + for (unsigned i = 2; i < NextDFSNum; ++i) { const NodePtr W = NumToNode[i]; auto &WInfo = NodeToInfo[W]; const unsigned SDomNum = NodeToInfo[NumToNode[WInfo.Semi]].DFSNum; @@ -224,46 +254,11 @@ struct SemiNCAInfo { WInfo.IDom = WIDomCandidate; } - - if (DT.Roots.empty()) return; - - // Add a node for the root. This node might be the actual root, if there is - // one exit block, or it may be the virtual exit (denoted by - // (BasicBlock *)0) which postdominates all real exits if there are multiple - // exit blocks, or an infinite loop. - NodePtr Root = !MultipleRoots ? DT.Roots[0] : nullptr; - - DT.RootNode = - (DT.DomTreeNodes[Root] = - llvm::make_unique>(Root, nullptr)) - .get(); - - // Loop over all of the reachable blocks in the function... - for (unsigned i = 2; i <= N; ++i) { - NodePtr W = NumToNode[i]; - - // Don't replace this with 'count', the insertion side effect is important - if (DT.DomTreeNodes[W]) - continue; // Haven't calculated this node yet? 
- - NodePtr ImmDom = getIDom(W); - - assert(ImmDom || DT.DomTreeNodes[nullptr]); - - // Get or calculate the node for the immediate dominator - TreeNodePtr IDomNode = getNodeForBlock(ImmDom, DT); - - // Add a new tree node for this BasicBlock, and link it as a child of - // IDomNode - DT.DomTreeNodes[W] = IDomNode->addChild( - llvm::make_unique>(W, IDomNode)); - } } template unsigned doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) { unsigned Num = 0; - NumToNode.push_back(nullptr); if (DT.Roots.size() > 1) { auto &BBInfo = NodeToInfo[nullptr]; @@ -283,11 +278,257 @@ struct SemiNCAInfo { return Num; } - static void PrintBlockOrNullptr(raw_ostream &O, NodePtr Obj) { - if (!Obj) - O << "nullptr"; + void calculateFromScratch(DomTreeT &DT, const unsigned NumBlocks) { + // Step #0: Number blocks in depth-first order and initialize variables used + // in later stages of the algorithm. + const unsigned LastDFSNum = doFullDFSWalk(DT, AlwaysDescend); + + runSemiNCA(DT); + + if (DT.Roots.empty()) return; + + // Add a node for the root. This node might be the actual root, if there is + // one exit block, or it may be the virtual exit (denoted by + // (BasicBlock *)0) which postdominates all real exits if there are multiple + // exit blocks, or an infinite loop. + // It might be that some blocks did not get a DFS number (e.g., blocks of + // infinite loops). In these cases an artificial exit node is required. + const bool MultipleRoots = DT.Roots.size() > 1 || (DT.isPostDominator() && + LastDFSNum != NumBlocks); + NodePtr Root = !MultipleRoots ? DT.Roots[0] : nullptr; + + DT.RootNode = (DT.DomTreeNodes[Root] = + llvm::make_unique>(Root, nullptr)) + .get(); + attachNewSubtree(DT, DT.RootNode); + } + + void attachNewSubtree(DomTreeT& DT, const TreeNodePtr AttachTo) { + // Attach the first unreachable block to AttachTo. + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + // Loop over all of the discovered blocks in the function... + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { + NodePtr W = NumToNode[i]; + DEBUG(dbgs() << "\tdiscovered a new reachable node " + << BlockNamePrinter(W) << "\n"); + + // Don't replace this with 'count', the insertion side effect is important + if (DT.DomTreeNodes[W]) continue; // Haven't calculated this node yet? + + NodePtr ImmDom = getIDom(W); + + // Get or calculate the node for the immediate dominator + TreeNodePtr IDomNode = getNodeForBlock(ImmDom, DT); + + // Add a new tree node for this BasicBlock, and link it as a child of + // IDomNode + DT.DomTreeNodes[W] = IDomNode->addChild( + llvm::make_unique>(W, IDomNode)); + } + } + + void reattachExistingSubtree(DomTreeT &DT, const TreeNodePtr AttachTo) { + NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock(); + for (size_t i = 1, e = NumToNode.size(); i != e; ++i) { + const NodePtr N = NumToNode[i]; + const TreeNodePtr TN = DT.getNode(N); + assert(TN); + const TreeNodePtr NewIDom = DT.getNode(NodeToInfo[N].IDom); + TN->setIDom(NewIDom); + } + } + + // Helper struct used during edge insertions. + struct InsertionInfo { + using BucketElementTy = std::pair; + struct DecreasingLevel { + bool operator()(const BucketElementTy &First, + const BucketElementTy &Second) const { + return First.first > Second.first; + } + }; + + std::priority_queue, + DecreasingLevel> + Bucket; // Queue of tree nodes sorted by level in descending order. 
+ SmallDenseSet Affected; + SmallDenseSet Visited; + SmallVector AffectedQueue; + SmallVector VisitedNotAffectedQueue; + }; + + static void InsertEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) { + assert(From && To && "Cannot connect nullptrs"); + DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> " + << BlockNamePrinter(To) << "\n"); + const TreeNodePtr FromTN = DT.getNode(From); + + // Ignore edges from unreachable nodes. + if (!FromTN) return; + + DT.DFSInfoValid = false; + + const TreeNodePtr ToTN = DT.getNode(To); + if (!ToTN) + InsertUnreachable(DT, FromTN, To); else - Obj->printAsOperand(O, false); + InsertReachable(DT, FromTN, ToTN); + } + + // Handles insertion to a node already in the dominator tree. + static void InsertReachable(DomTreeT &DT, const TreeNodePtr From, + const TreeNodePtr To) { + DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock()) + << " -> " << BlockNamePrinter(To->getBlock()) << "\n"); + const NodePtr NCDBlock = + DT.findNearestCommonDominator(From->getBlock(), To->getBlock()); + assert(NCDBlock || DT.isPostDominator()); + const TreeNodePtr NCD = DT.getNode(NCDBlock); + assert(NCD); + + DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n"); + const TreeNodePtr ToIDom = To->getIDom(); + + // Nothing affected -- NCA property holds. + // (Based on the lemma 2.5 from the second paper.) + if (NCD == To || NCD == ToIDom) return; + + // Identify and collect affected nodes. + InsertionInfo II; + DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n"); + II.Affected.insert(To); + const unsigned ToLevel = To->getLevel(); + DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n"); + II.Bucket.push({ToLevel, To}); + + while (!II.Bucket.empty()) { + const TreeNodePtr CurrentNode = II.Bucket.top().second; + II.Bucket.pop(); + DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: " + << BlockNamePrinter(CurrentNode) << "\n"); + II.Visited.insert(CurrentNode); + II.AffectedQueue.push_back(CurrentNode); + + // Discover and collect affected successors of the current node. + VisitInsertion(DT, CurrentNode, CurrentNode->getLevel(), NCD, II); + } + + // Finish by updating immediate dominators and levels. + UpdateInsertion(DT, NCD, II); + } + + // Visits an affected node and collect its affected successors. + static void VisitInsertion(DomTreeT &DT, const TreeNodePtr TN, + const unsigned RootLevel, const TreeNodePtr NCD, + InsertionInfo &II) { + const unsigned NCDLevel = NCD->getLevel(); + DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n"); + + assert(TN->getBlock()); + for (const NodePtr Succ : + ChildrenGetter::Get(TN->getBlock())) { + const TreeNodePtr SuccTN = DT.getNode(Succ); + assert(SuccTN && "Unreachable successor found at reachable insertion"); + const unsigned SuccLevel = SuccTN->getLevel(); + + DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) + << ", level = " << SuccLevel << "\n"); + + // Succ dominated by subtree From -- not affected. + // (Based on the lemma 2.5 from the second paper.) 
+      if (SuccLevel > RootLevel) {
+        DEBUG(dbgs() << "\t\tDominated by subtree From\n");
+        if (II.Visited.count(SuccTN) != 0) continue;
+
+        DEBUG(dbgs() << "\t\tMarking visited not affected "
+                     << BlockNamePrinter(Succ) << "\n");
+        II.Visited.insert(SuccTN);
+        II.VisitedNotAffectedQueue.push_back(SuccTN);
+        VisitInsertion(DT, SuccTN, RootLevel, NCD, II);
+      } else if ((SuccLevel > NCDLevel + 1) && II.Affected.count(SuccTN) == 0) {
+        DEBUG(dbgs() << "\t\tMarking affected and adding "
+                     << BlockNamePrinter(Succ) << " to a Bucket\n");
+        II.Affected.insert(SuccTN);
+        II.Bucket.push({SuccLevel, SuccTN});
+      }
+    }
+  }
+
+  // Updates immediate dominators and levels after insertion.
+  static void UpdateInsertion(DomTreeT &DT, const TreeNodePtr NCD,
+                              InsertionInfo &II) {
+    DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n");
+
+    for (const TreeNodePtr TN : II.AffectedQueue) {
+      DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(TN)
+                   << ") = " << BlockNamePrinter(NCD) << "\n");
+      TN->setIDom(NCD);
+    }
+
+    UpdateLevelsAfterInsertion(II);
+  }
+
+  static void UpdateLevelsAfterInsertion(InsertionInfo &II) {
+    DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n");
+
+    for (const TreeNodePtr TN : II.VisitedNotAffectedQueue) {
+      DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(TN) << ") = ("
+                   << BlockNamePrinter(TN->getIDom()) << ") "
+                   << TN->getIDom()->getLevel() << " + 1\n");
+      TN->UpdateLevel();
+    }
+  }
+
+  // Handles insertion to previously unreachable nodes.
+  static void InsertUnreachable(DomTreeT &DT, const TreeNodePtr From,
+                                const NodePtr To) {
+    DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From)
+                 << " -> (unreachable) " << BlockNamePrinter(To) << "\n");
+
+    // Collect discovered edges to already reachable nodes.
+    SmallVector, 8> DiscoveredEdgesToReachable;
+    // Discover and connect nodes that became reachable with the insertion.
+    ComputeUnreachableDominators(DT, To, From, DiscoveredEdgesToReachable);
+
+    DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From)
+                 << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n");
+
+    DEBUG(DT.print(dbgs()));
+
+    // Use the discovered edges and insert discovered connecting (incoming)
+    // edges.
+    for (const auto &Edge : DiscoveredEdgesToReachable) {
+      DEBUG(dbgs() << "\tInserting discovered connecting edge "
+                   << BlockNamePrinter(Edge.first) << " -> "
+                   << BlockNamePrinter(Edge.second) << "\n");
+      InsertReachable(DT, DT.getNode(Edge.first), Edge.second);
+    }
+  }
+
+  // Connects nodes that become reachable with an insertion.
+  static void ComputeUnreachableDominators(
+      DomTreeT &DT, const NodePtr Root, const TreeNodePtr Incoming,
+      SmallVectorImpl>
+          &DiscoveredConnectingEdges) {
+    assert(!DT.getNode(Root) && "Root must not be reachable");
+
+    // Visit only previously unreachable nodes.
+    auto UnreachableDescender = [&DT, &DiscoveredConnectingEdges](NodePtr From,
+                                                                  NodePtr To) {
+      const TreeNodePtr ToTN = DT.getNode(To);
+      if (!ToTN) return true;
+
+      DiscoveredConnectingEdges.push_back({From, ToTN});
+      return false;
+    };
+
+    SemiNCAInfo SNCA;
+    SNCA.runDFS(Root, 0, UnreachableDescender, 0);
+    SNCA.runSemiNCA(DT);
+    SNCA.attachNewSubtree(DT, Incoming);
+
+    DEBUG(dbgs() << "After adding unreachable nodes\n");
+    DEBUG(DT.print(dbgs()));
+  }

   // Checks if the tree contains all reachable nodes in the input graph.
@@ -298,12 +539,23 @@ struct SemiNCAInfo {
     for (auto &NodeToTN : DT.DomTreeNodes) {
       const TreeNodePtr TN = NodeToTN.second.get();
       const NodePtr BB = TN->getBlock();
-      if (!BB) continue;
+
+      // Virtual root has a corresponding virtual CFG node.
+      if (DT.isVirtualRoot(TN)) continue;
 
       if (NodeToInfo.count(BB) == 0) {
-        errs() << "DomTree node ";
-        PrintBlockOrNullptr(errs(), BB);
-        errs() << " not found by DFS walk!\n";
+        errs() << "DomTree node " << BlockNamePrinter(BB)
+               << " not found by DFS walk!\n";
+        errs().flush();
+
+        return false;
+      }
+    }
+
+    for (const NodePtr N : NumToNode) {
+      if (N && !DT.getNode(N)) {
+        errs() << "CFG node " << BlockNamePrinter(N)
+               << " not found in the DomTree!\n";
         errs().flush();
 
         return false;
@@ -313,6 +565,215 @@ struct SemiNCAInfo {
     return true;
   }
 
+  static void DeleteEdge(DomTreeT &DT, const NodePtr From, const NodePtr To) {
+    assert(From && To && "Cannot disconnect nullptrs");
+    DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> "
+                 << BlockNamePrinter(To) << "\n");
+
+#ifndef NDEBUG
+    // Ensure that the edge was in fact deleted from the CFG before informing
+    // the DomTree about it.
+    // The check is O(N), so run it only in debug configuration.
+    auto IsSuccessor = [](const NodePtr SuccCandidate, const NodePtr Of) {
+      auto Successors = ChildrenGetter<IsPostDom>::Get(Of);
+      return llvm::find(Successors, SuccCandidate) != Successors.end();
+    };
+    (void)IsSuccessor;
+    assert(!IsSuccessor(To, From) && "Deleted edge still exists in the CFG!");
+#endif
+
+    const TreeNodePtr FromTN = DT.getNode(From);
+    // Deletion in an unreachable subtree -- nothing to do.
+    if (!FromTN) return;
+
+    const TreeNodePtr ToTN = DT.getNode(To);
+    assert(ToTN && "To already unreachable -- there is no edge to delete");
+    const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To);
+    const TreeNodePtr NCD = DT.getNode(NCDBlock);
+
+    // To dominates From -- nothing to do.
+    if (ToTN == NCD) return;
+
+    const TreeNodePtr ToIDom = ToTN->getIDom();
+    DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
+                 << BlockNamePrinter(ToIDom) << "\n");
+
+    // To remains reachable after deletion.
+    // (Based on the caption under Figure 4. from the second paper.)
+    if (FromTN != ToIDom || HasProperSupport(DT, ToTN))
+      DeleteReachable(DT, FromTN, ToTN);
+    else
+      DeleteUnreachable(DT, ToTN);
+  }
+
+  // Handles deletions that leave destination nodes reachable.
+  static void DeleteReachable(DomTreeT &DT, const TreeNodePtr FromTN,
+                              const TreeNodePtr ToTN) {
+    DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> "
+                 << BlockNamePrinter(ToTN) << "\n");
+    DEBUG(dbgs() << "\tRebuilding subtree\n");
+
+    // Find the top of the subtree that needs to be rebuilt.
+    // (Based on the lemma 2.6 from the second paper.)
+    const NodePtr ToIDom =
+        DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock());
+    assert(ToIDom || DT.isPostDominator());
+    const TreeNodePtr ToIDomTN = DT.getNode(ToIDom);
+    assert(ToIDomTN);
+    const TreeNodePtr PrevIDomSubTree = ToIDomTN->getIDom();
+    // Top of the subtree to rebuild is the root node. Rebuild the tree from
+    // scratch.
+    if (!PrevIDomSubTree) {
+      DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
+      DT.recalculate(*DT.Parent);
+      return;
+    }
+
+    // Only visit nodes in the subtree starting at To.
+    const unsigned Level = ToIDomTN->getLevel();
+    auto DescendBelow = [Level, &DT](NodePtr, NodePtr To) {
+      return DT.getNode(To)->getLevel() > Level;
+    };
+
+    DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n");
+
+    SemiNCAInfo SNCA;
+    SNCA.runDFS(ToIDom, 0, DescendBelow, 0);
+    DEBUG(dbgs() << "\tRunning Semi-NCA\n");
+    SNCA.runSemiNCA(DT, Level);
+    SNCA.reattachExistingSubtree(DT, PrevIDomSubTree);
+  }
+
+  // Checks if a node has proper support, as defined on the page 3 and later
+  // explained on the page 7 of the second paper.
+  static bool HasProperSupport(DomTreeT &DT, const TreeNodePtr TN) {
+    DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n");
+    for (const NodePtr Pred :
+         ChildrenGetter<!IsPostDom>::Get(TN->getBlock())) {
+      DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n");
+      if (!DT.getNode(Pred)) continue;
+
+      const NodePtr Support =
+          DT.findNearestCommonDominator(TN->getBlock(), Pred);
+      DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n");
+      if (Support != TN->getBlock()) {
+        DEBUG(dbgs() << "\t" << BlockNamePrinter(TN)
+                     << " is reachable from support "
+                     << BlockNamePrinter(Support) << "\n");
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  // Handles deletions that make destination node unreachable.
+  // (Based on the lemma 2.7 from the second paper.)
+  static void DeleteUnreachable(DomTreeT &DT, const TreeNodePtr ToTN) {
+    DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN)
+                 << "\n");
+    assert(ToTN);
+    assert(ToTN->getBlock());
+
+    SmallVector<NodePtr, 8> AffectedQueue;
+    const unsigned Level = ToTN->getLevel();
+
+    // Traverse destination node's descendants with greater level in the tree
+    // and collect visited nodes.
+    auto DescendAndCollect = [Level, &AffectedQueue, &DT](NodePtr, NodePtr To) {
+      const TreeNodePtr TN = DT.getNode(To);
+      assert(TN);
+      if (TN->getLevel() > Level) return true;
+      if (llvm::find(AffectedQueue, To) == AffectedQueue.end())
+        AffectedQueue.push_back(To);
+
+      return false;
+    };
+
+    SemiNCAInfo SNCA;
+    unsigned LastDFSNum =
+        SNCA.runDFS(ToTN->getBlock(), 0, DescendAndCollect, 0);
+
+    TreeNodePtr MinNode = ToTN;
+
+    // Identify the top of the subtree to rebuild by finding the NCD of all
+    // the affected nodes.
+    for (const NodePtr N : AffectedQueue) {
+      const TreeNodePtr TN = DT.getNode(N);
+      const NodePtr NCDBlock =
+          DT.findNearestCommonDominator(TN->getBlock(), ToTN->getBlock());
+      assert(NCDBlock || DT.isPostDominator());
+      const TreeNodePtr NCD = DT.getNode(NCDBlock);
+      assert(NCD);
+
+      DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN)
+                   << " with NCD = " << BlockNamePrinter(NCD)
+                   << ", MinNode = " << BlockNamePrinter(MinNode) << "\n");
+      if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD;
+    }
+
+    // Root reached, rebuild the whole tree from scratch.
+    if (!MinNode->getIDom()) {
+      DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
+      DT.recalculate(*DT.Parent);
+      return;
+    }
+
+    // Erase the unreachable subtree in reverse preorder to process all
+    // children before deleting their parent.
+    for (unsigned i = LastDFSNum; i > 0; --i) {
+      const NodePtr N = SNCA.NumToNode[i];
+      const TreeNodePtr TN = DT.getNode(N);
+      DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");
+
+      EraseNode(DT, TN);
+    }
+
+    // The affected subtree starts at the To node -- there's no extra work
+    // to do.
+ if (MinNode == ToTN) return; + + DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = " + << BlockNamePrinter(MinNode) << "\n"); + const unsigned MinLevel = MinNode->getLevel(); + const TreeNodePtr PrevIDom = MinNode->getIDom(); + assert(PrevIDom); + SNCA.clear(); + + // Identify nodes that remain in the affected subtree. + auto DescendBelow = [MinLevel, &DT](NodePtr, NodePtr To) { + const TreeNodePtr ToTN = DT.getNode(To); + return ToTN && ToTN->getLevel() > MinLevel; + }; + SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0); + + DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom) + << "\nRunning Semi-NCA\n"); + + // Rebuild the remaining part of affected subtree. + SNCA.runSemiNCA(DT, MinLevel); + SNCA.reattachExistingSubtree(DT, PrevIDom); + } + + // Removes leaf tree nodes from the dominator tree. + static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) { + assert(TN); + assert(TN->getNumChildren() == 0 && "Not a tree leaf"); + + const TreeNodePtr IDom = TN->getIDom(); + assert(IDom); + + auto ChIt = llvm::find(IDom->Children, TN); + assert(ChIt != IDom->Children.end()); + std::swap(*ChIt, IDom->Children.back()); + IDom->Children.pop_back(); + + DT.DomTreeNodes.erase(TN->getBlock()); + } + + //~~ + //===--------------- DomTree correctness verification ---------------------=== + //~~ + // Check if for every parent with a level L in the tree all of its children // have level L + 1. static bool VerifyLevels(const DomTreeT &DT) { @@ -323,20 +784,18 @@ struct SemiNCAInfo { const TreeNodePtr IDom = TN->getIDom(); if (!IDom && TN->getLevel() != 0) { - errs() << "Node without an IDom "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has a nonzero level " << TN->getLevel() << "!\n"; + errs() << "Node without an IDom " << BlockNamePrinter(BB) + << " has a nonzero level " << TN->getLevel() << "!\n"; errs().flush(); return false; } if (IDom && TN->getLevel() != IDom->getLevel() + 1) { - errs() << "Node "; - PrintBlockOrNullptr(errs(), BB); - errs() << " has level " << TN->getLevel() << " while it's IDom "; - PrintBlockOrNullptr(errs(), IDom->getBlock()); - errs() << " has level " << IDom->getLevel() << "!\n"; + errs() << "Node " << BlockNamePrinter(BB) << " has level " + << TN->getLevel() << " while its IDom " + << BlockNamePrinter(IDom->getBlock()) << " has level " + << IDom->getLevel() << "!\n"; errs().flush(); return false; @@ -363,18 +822,14 @@ struct SemiNCAInfo { assert(ToTN); const NodePtr NCD = DT.findNearestCommonDominator(From, To); - const TreeNodePtr NCDTN = NCD ? DT.getNode(NCD) : nullptr; + const TreeNodePtr NCDTN = DT.getNode(NCD); const TreeNodePtr ToIDom = ToTN->getIDom(); if (NCDTN != ToTN && NCDTN != ToIDom) { - errs() << "NearestCommonDominator verification failed:\n\tNCD(From:"; - PrintBlockOrNullptr(errs(), From); - errs() << ", To:"; - PrintBlockOrNullptr(errs(), To); - errs() << ") = "; - PrintBlockOrNullptr(errs(), NCD); - errs() << ",\t (should be To or IDom[To]: "; - PrintBlockOrNullptr(errs(), ToIDom ? 
ToIDom->getBlock() : nullptr);
-      errs() << ")\n";
+      errs() << "NearestCommonDominator verification failed:\n\tNCD(From:"
+             << BlockNamePrinter(From) << ", To:" << BlockNamePrinter(To)
+             << ") = " << BlockNamePrinter(NCD)
+             << ",\t (should be To or IDom[To]: " << BlockNamePrinter(ToIDom)
+             << ")\n";
       errs().flush();
 
       return false;
@@ -440,11 +895,9 @@ struct SemiNCAInfo {
 
     for (TreeNodePtr Child : TN->getChildren())
       if (NodeToInfo.count(Child->getBlock()) != 0) {
-        errs() << "Child ";
-        PrintBlockOrNullptr(errs(), Child->getBlock());
-        errs() << " reachable after its parent ";
-        PrintBlockOrNullptr(errs(), BB);
-        errs() << " is removed!\n";
+        errs() << "Child " << BlockNamePrinter(Child)
+               << " reachable after its parent " << BlockNamePrinter(BB)
+               << " is removed!\n";
         errs().flush();
 
         return false;
@@ -477,11 +930,9 @@ struct SemiNCAInfo {
         if (S == N) continue;
 
         if (NodeToInfo.count(S->getBlock()) == 0) {
-          errs() << "Node ";
-          PrintBlockOrNullptr(errs(), S->getBlock());
-          errs() << " not reachable when its sibling ";
-          PrintBlockOrNullptr(errs(), N->getBlock());
-          errs() << " is removed!\n";
+          errs() << "Node " << BlockNamePrinter(S)
+                 << " not reachable when its sibling " << BlockNamePrinter(N)
+                 << " is removed!\n";
           errs().flush();
 
           return false;
@@ -494,23 +945,30 @@ struct SemiNCAInfo {
   }
 };
 
-template <class FuncT, class NodeT>
-void Calculate(DominatorTreeBaseByGraphTraits<GraphTraits<NodeT>> &DT,
-               FuncT &F) {
-  using NodePtr = typename GraphTraits<NodeT>::NodeRef;
-  static_assert(std::is_pointer<NodePtr>::value,
-                "NodePtr should be a pointer type");
-  SemiNCAInfo<typename std::remove_pointer<NodePtr>::type> SNCA;
-  SNCA.template runSemiNCA<NodeT>(DT, GraphTraits<FuncT *>::size(&F));
+
+template <class DomTreeT, class FuncT>
+void Calculate(DomTreeT &DT, FuncT &F) {
+  SemiNCAInfo<DomTreeT> SNCA;
+  SNCA.calculateFromScratch(DT, GraphTraits<FuncT *>::size(&F));
 }
 
-template <class NodeT>
-bool Verify(const DominatorTreeBaseByGraphTraits<GraphTraits<NodeT>> &DT) {
-  using NodePtr = typename GraphTraits<NodeT>::NodeRef;
-  static_assert(std::is_pointer<NodePtr>::value,
-                "NodePtr should be a pointer type");
-  SemiNCAInfo<typename std::remove_pointer<NodePtr>::type> SNCA;
+template <class DomTreeT>
+void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
+                typename DomTreeT::NodePtr To) {
+  if (DT.isPostDominator()) std::swap(From, To);
+  SemiNCAInfo<DomTreeT>::InsertEdge(DT, From, To);
+}
+template <class DomTreeT>
+void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
+                typename DomTreeT::NodePtr To) {
+  if (DT.isPostDominator()) std::swap(From, To);
+  SemiNCAInfo<DomTreeT>::DeleteEdge(DT, From, To);
+}
+
+template <class DomTreeT>
+bool Verify(const DomTreeT &DT) {
+  SemiNCAInfo<DomTreeT> SNCA;
   return SNCA.verifyReachability(DT) && SNCA.VerifyLevels(DT) &&
          SNCA.verifyNCD(DT) && SNCA.verifyParentProperty(DT) &&
          SNCA.verifySiblingProperty(DT);
@@ -519,4 +977,6 @@ bool Verify(const DominatorTreeBaseByGraphTraits<GraphTraits<NodeT>> &DT) {
 }  // namespace DomTreeBuilder
 }  // namespace llvm
 
+#undef DEBUG_TYPE
+
 #endif
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 72c28865ac57..e13582f6a6d3 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -85,6 +85,7 @@ enum ArchExtKind : unsigned {
   AEK_DSP = 0x400,
   AEK_FP16 = 0x800,
   AEK_RAS = 0x1000,
+  AEK_SVE = 0x2000,
 
   // Unsupported extensions.
  AEK_OS = 0x8000000,
   AEK_IWMMXT = 0x10000000,
@@ -166,7 +167,8 @@ enum ArchExtKind : unsigned {
   AEK_FP16 = 0x20,
   AEK_PROFILE = 0x40,
   AEK_RAS = 0x80,
-  AEK_LSE = 0x100
+  AEK_LSE = 0x100,
+  AEK_SVE = 0x200
 };
 
 StringRef getCanonicalArchName(StringRef Arch);
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 15b3b11db045..71fdf47f1979 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -1114,6 +1114,10 @@ class Input : public IO {
         void *Ctxt = nullptr,
         SourceMgr::DiagHandlerTy DiagHandler = nullptr,
         void *DiagHandlerCtxt = nullptr);
+  Input(MemoryBufferRef Input,
+        void *Ctxt = nullptr,
+        SourceMgr::DiagHandlerTy DiagHandler = nullptr,
+        void *DiagHandlerCtxt = nullptr);
   ~Input() override;
 
   // Check if there was a syntax or semantic error during parsing.
diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 178b08d7b8b7..50de41fd1320 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -58,6 +58,7 @@ def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 60a03bdc182d..23711d636c9a 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -2012,6 +2012,35 @@ class TargetLoweringBase {
     return isExtFreeImpl(I);
   }
 
+  /// Return true if \p Load and \p Ext can form an ExtLoad.
+  /// For example, in AArch64
+  ///   %L = load i8, i8* %ptr
+  ///   %E = zext i8 %L to i32
+  /// can be lowered into one load instruction
+  ///   ldrb w0, [x0]
+  bool isExtLoad(const LoadInst *Load, const Instruction *Ext,
+                 const DataLayout &DL) const {
+    EVT VT = getValueType(DL, Ext->getType());
+    EVT LoadVT = getValueType(DL, Load->getType());
+
+    // If the load has other users and the truncate is not free, the ext
+    // probably isn't free.
+    if (!Load->hasOneUse() && (isTypeLegal(LoadVT) || !isTypeLegal(VT)) &&
+        !isTruncateFree(Ext->getType(), Load->getType()))
+      return false;
+
+    // Check whether the target supports casts folded into loads.
+    unsigned LType;
+    if (isa<ZExtInst>(Ext))
+      LType = ISD::ZEXTLOAD;
+    else {
+      assert(isa<SExtInst>(Ext) && "Unexpected ext type!");
+      LType = ISD::SEXTLOAD;
+    }
+
+    return isLoadExtLegal(LType, VT, LoadVT);
+  }
+
   /// Return true if any actual instruction that defines a value of type FromTy
   /// implicitly zero-extends the value to ToTy in the result register.
   ///
diff --git a/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h
new file mode 100644
index 000000000000..964b0f7620a2
--- /dev/null
+++ b/include/llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h
@@ -0,0 +1,24 @@
+//===- DlltoolDriver.h - dlltool.exe-compatible driver ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines an interface to a dlltool.exe-compatible driver.
+// Used by llvm-dlltool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H
+#define LLVM_TOOLDRIVERS_LLVM_DLLTOOL_DLLTOOLDRIVER_H
+
+namespace llvm {
+template <typename T> class ArrayRef;
+
+int dlltoolDriverMain(ArrayRef<const char *> ArgsArr);
+} // namespace llvm
+
+#endif
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 3ddefc6520a7..74b5d79ebac5 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -433,7 +433,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     if (Visited.insert(C).second)
       Worklist.push_back(C);
 
-  LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
+  auto VisitRef = [&](Function &Referee) {
     Node &RefereeN = *G.lookup(Referee);
     Edge *E = N->lookup(RefereeN);
     // FIXME: Similarly to new calls, we also currently preclude
@@ -444,7 +444,12 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     RetainedEdges.insert(&RefereeN);
     if (E->isCall())
       DemotedCallTargets.insert(&RefereeN);
-  });
+  };
+  LazyCallGraph::visitReferences(Worklist, Visited, VisitRef);
+
+  // Include synthetic reference edges to known, defined lib functions.
+  for (auto *F : G.getLibFunctions())
+    VisitRef(*F);
 
   // First remove all of the edges that are no longer present in this function.
   // We have to build a list of dead targets first and then remove them as the
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 5b6e2d0476e4..c08c6cfe0c3b 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -14,7 +14,8 @@
 using namespace llvm;
 
 namespace llvm {
-template class DominanceFrontierBase<BasicBlock>;
+template class DominanceFrontierBase<BasicBlock, false>;
+template class DominanceFrontierBase<BasicBlock, true>;
 template class ForwardDominanceFrontierBase<BasicBlock>;
 }
diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp
index 27c6b580e7ac..95ab6ee3db5b 100644
--- a/lib/Analysis/InstCount.cpp
+++ b/lib/Analysis/InstCount.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
 STATISTIC(TotalInsts , "Number of instructions (of all types)");
 STATISTIC(TotalBlocks, "Number of basic blocks");
 STATISTIC(TotalFuncs , "Number of non-external functions");
-STATISTIC(TotalMemInst, "Number of memory instructions");
 
 #define HANDLE_INST(N, OPCODE, CLASS) \
   STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts");
@@ -75,13 +74,6 @@ FunctionPass *llvm::createInstCountPass() { return new InstCount(); }
 // function.
 //
 bool InstCount::runOnFunction(Function &F) {
-  unsigned StartMemInsts =
-      NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
-      NumInvokeInst + NumAllocaInst;
   visit(F);
-  unsigned EndMemInsts =
-      NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
-      NumInvokeInst + NumAllocaInst;
-  TotalMemInst += EndMemInsts-StartMemInsts;
   return false;
 }
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index f6632020b8fc..b4f3b87e1846 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1745,14 +1745,11 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
     return Constant::getNullValue(Op0->getType());
 
   // (A | ?) & A = A
-  Value *A = nullptr, *B = nullptr;
-  if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
-      (A == Op1 || B == Op1))
+  if (match(Op0, m_c_Or(m_Specific(Op1), m_Value())))
     return Op1;
 
   // A & (A | ?) = A
-  if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
-      (A == Op0 || B == Op0))
+  if (match(Op1, m_c_Or(m_Specific(Op0), m_Value())))
     return Op0;
 
   // A mask that only clears known zeros of a shifted value is a no-op.
@@ -1852,26 +1849,22 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
     return Constant::getAllOnesValue(Op0->getType());
 
   // (A & ?) | A = A
-  Value *A = nullptr, *B = nullptr;
-  if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
-      (A == Op1 || B == Op1))
+  if (match(Op0, m_c_And(m_Specific(Op1), m_Value())))
    return Op1;
 
   // A | (A & ?) = A
-  if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
-      (A == Op0 || B == Op0))
+  if (match(Op1, m_c_And(m_Specific(Op0), m_Value())))
     return Op0;
 
   // ~(A & ?) | A = -1
-  if (match(Op0, m_Not(m_And(m_Value(A), m_Value(B)))) &&
-      (A == Op1 || B == Op1))
+  if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value()))))
     return Constant::getAllOnesValue(Op1->getType());
 
   // A | ~(A & ?) = -1
-  if (match(Op1, m_Not(m_And(m_Value(A), m_Value(B)))) &&
-      (A == Op0 || B == Op0))
+  if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value()))))
     return Constant::getAllOnesValue(Op0->getType());
 
+  Value *A, *B;
   // (A & ~B) | (A ^ B) -> (A ^ B)
   // (~B & A) | (A ^ B) -> (A ^ B)
   // (A & ~B) | (B ^ A) -> (B ^ A)
diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp
index 0e02850df349..3992657417c5 100644
--- a/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -17,8 +17,8 @@
 #include <queue>
 
 namespace llvm {
-template <class NodeTy>
-void IDFCalculator<NodeTy>::calculate(
+template <class NodeTy, bool IsPostDom>
+void IDFCalculator<NodeTy, IsPostDom>::calculate(
     SmallVectorImpl<BasicBlock *> &PHIBlocks) {
   // Use a priority queue keyed on dominator tree level so that inserted nodes
   // are handled from the bottom of the dominator tree upwards.
@@ -88,6 +88,6 @@ void IDFCalculator<NodeTy>::calculate(
   }
 }
 
-template class IDFCalculator<BasicBlock *>;
-template class IDFCalculator<Inverse<BasicBlock *>>;
+template class IDFCalculator<BasicBlock *, false>;
+template class IDFCalculator<Inverse<BasicBlock *>, true>;
 }
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index a4c3e43b4b0c..d287f81985fd 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -106,6 +106,13 @@ LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
                 LazyCallGraph::Edge::Ref);
       });
 
+  // Add implicit reference edges to any defined libcall functions (if we
+  // haven't found an explicit edge).
+  for (auto *F : G->LibFunctions)
+    if (!Visited.count(F))
+      addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*F),
+              LazyCallGraph::Edge::Ref);
+
   return *Edges;
 }
 
@@ -120,15 +127,34 @@ LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const {
 }
 #endif
 
-LazyCallGraph::LazyCallGraph(Module &M) {
+static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
+  LibFunc LF;
+
+  // Either this is a normal library function or a "vectorizable" function.
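+  // (E.g., a module that defines its own "memset" with the standard
+  //  prototype is matched by TLI.getLibFunc, and names mapped by a vector
+  //  library are reported by TLI.isFunctionVectorizable; the concrete set
+  //  of functions recognized depends on the TargetLibraryInfo
+  //  configuration.)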
+ return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName()); +} + +LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) { DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier() << "\n"); - for (Function &F : M) - if (!F.isDeclaration() && !F.hasLocalLinkage()) { - DEBUG(dbgs() << " Adding '" << F.getName() - << "' to entry set of the graph.\n"); - addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); - } + for (Function &F : M) { + if (F.isDeclaration()) + continue; + // If this function is a known lib function to LLVM then we want to + // synthesize reference edges to it to model the fact that LLVM can turn + // arbitrary code into a library function call. + if (isKnownLibFunction(F, TLI)) + LibFunctions.insert(&F); + + if (F.hasLocalLinkage()) + continue; + + // External linkage defined functions have edges to them from other + // modules. + DEBUG(dbgs() << " Adding '" << F.getName() + << "' to entry set of the graph.\n"); + addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref); + } // Now add entry nodes for functions reachable via initializers to globals. SmallVector Worklist; @@ -149,7 +175,8 @@ LazyCallGraph::LazyCallGraph(Module &M) { LazyCallGraph::LazyCallGraph(LazyCallGraph &&G) : BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)), EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)), - SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)) { + SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)), + LibFunctions(std::move(G.LibFunctions)) { updateGraphPtrs(); } @@ -160,6 +187,7 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) { SCCBPA = std::move(G.SCCBPA); SCCMap = std::move(G.SCCMap); LeafRefSCCs = std::move(G.LeafRefSCCs); + LibFunctions = std::move(G.LibFunctions); updateGraphPtrs(); return *this; } @@ -1580,6 +1608,11 @@ void LazyCallGraph::removeDeadFunction(Function &F) { assert(F.use_empty() && "This routine should only be called on trivially dead functions!"); + // We shouldn't remove library functions as they are never really dead while + // the call graph is in use -- every function definition refers to them. + assert(!isLibFunction(F) && + "Must not remove lib functions from the call graph!"); + auto NI = NodeMap.find(&F); if (NI == NodeMap.end()) // Not in the graph at all! 
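A short usage sketch for the constructor change above; the two-argument constructor, the move constructor, and the synthetic-edge behavior are from this patch, while the helper function itself is an assumed illustration:

    #include "llvm/Analysis/LazyCallGraph.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"

    using namespace llvm;

    // Building the graph now requires TargetLibraryInfo, so that defined
    // library functions receive synthetic reference edges and are never
    // considered trivially dead while the graph is in use.
    static LazyCallGraph buildGraph(Module &M, TargetLibraryInfo &TLI) {
      LazyCallGraph CG(M, TLI);
      return CG; // LazyCallGraph is movable (see the move constructor above).
    }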
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index baf932432a0a..697b58622bb4 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -609,7 +609,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
   return NearLoop;
 }
 
-LoopInfo::LoopInfo(const DominatorTreeBase<BasicBlock> &DomTree) {
+LoopInfo::LoopInfo(const DomTreeBase<BasicBlock> &DomTree) {
   analyze(DomTree);
 }
 
diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
index 86d0d92799f2..86de474c7aa9 100644
--- a/lib/Analysis/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -39,7 +39,6 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Scalar.h"
 #include <algorithm>
 
 #define DEBUG_TYPE "memoryssa"
diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp
index 1caf151546d9..811373ac850b 100644
--- a/lib/Analysis/PostDominators.cpp
+++ b/lib/Analysis/PostDominators.cpp
@@ -23,6 +23,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "postdomtree"
 
+template class llvm::DominatorTreeBase<BasicBlock, true>; // PostDomTreeBase
+
 //===----------------------------------------------------------------------===//
 //  PostDominatorTree Implementation
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 3fb1ab980add..b973203a89b6 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -4173,6 +4173,319 @@ static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
   return None;
 }
 
+/// Helper function to createAddRecFromPHIWithCasts. We have a phi
+/// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via
+/// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the
+/// way. This function checks if \p Op, an operand of this SCEVAddExpr,
+/// follows one of the following patterns:
+/// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
+/// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
+/// If the SCEV expression of \p Op conforms with one of the expected patterns
+/// we return the type of the truncation operation, and indicate whether the
+/// truncated type should be treated as signed/unsigned by setting
+/// \p Signed to true/false, respectively.
+static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
+                               bool &Signed, ScalarEvolution &SE) {
+
+  // The case where Op == SymbolicPHI (that is, with no type conversions on
+  // the way) is handled by the regular add recurrence creating logic and
+  // would have already been triggered in createAddRecForPHI. Reaching it here
+  // means that createAddRecFromPHI had failed for this PHI before (e.g.,
+  // because one of the other operands of the SCEVAddExpr updating this PHI is
+  // not invariant).
+  //
+  // Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in
+  // this case predicates that allow us to prove that Op == SymbolicPHI will
+  // be added.
+  if (Op == SymbolicPHI)
+    return nullptr;
+
+  unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType());
+  unsigned NewBits = SE.getTypeSizeInBits(Op->getType());
+  if (SourceBits != NewBits)
+    return nullptr;
+
+  const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
+  const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
+  if (!SExt && !ZExt)
+    return nullptr;
+  const SCEVTruncateExpr *Trunc =
+      SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
+           : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
+  if (!Trunc)
+    return nullptr;
+  const SCEV *X = Trunc->getOperand();
+  if (X != SymbolicPHI)
+    return nullptr;
+  Signed = SExt ? true : false;
+  return Trunc->getType();
+}
+
+static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
+  if (!PN->getType()->isIntegerTy())
+    return nullptr;
+  const Loop *L = LI.getLoopFor(PN->getParent());
+  if (!L || L->getHeader() != PN->getParent())
+    return nullptr;
+  return L;
+}
+
+// Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the
+// computation that updates the phi follows the following pattern:
+//   (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum
+// which correspond to a phi->trunc->sext/zext->add->phi update chain.
+// If so, try to see if it can be rewritten as an AddRecExpr under some
+// Predicates. If successful, return them as a pair. Also cache the results
+// of the analysis.
+//
+// Example usage scenario:
+//    Say the Rewriter is called for the following SCEV:
+//         8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
+//    where:
+//         %X = phi i64 (%Start, %BEValue)
+//    It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X),
+//    and call this function with %SymbolicPHI = %X.
+//
+//    The analysis will find that the value coming around the backedge has
+//    the following SCEV:
+//         BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
+//    Upon concluding that this matches the desired pattern, the function
+//    will return the pair {NewAddRec, SmallPredsVec} where:
+//         NewAddRec = {%Start,+,%Step}
+//         SmallPredsVec = {P1, P2, P3} as follows:
+//           P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)}<nssw> Flags: <nssw>
+//           P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64)
+//           P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64)
+//    The returned pair means that SymbolicPHI can be rewritten into NewAddRec
+//    under the predicates {P1,P2,P3}.
+//    This predicated rewrite will be cached in PredicatedSCEVRewrites:
+//         PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3}}
+//
+// TODO's:
+//
+// 1) Extend the Induction descriptor to also support inductions that involve
+//    casts: When needed (namely, when we are called in the context of the
+//    vectorizer induction analysis), a Set of cast instructions will be
+//    populated by this method, and provided back to isInductionPHI. This is
+//    needed to allow the vectorizer to properly record them to be ignored by
+//    the cost model and to avoid vectorizing them (otherwise these casts,
+//    which are redundant under the runtime overflow checks, will be
+//    vectorized, which can be costly).
+//
+// 2) Support additional induction/PHISCEV patterns: We also want to support
+//    inductions where the sext-trunc / zext-trunc operations (partly) occur
+//    after the induction update operation (the induction increment):
+//
+//      (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix)
+//    which correspond to a phi->add->trunc->sext/zext->phi update chain.
+//
+//      (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix)
+//    which correspond to a phi->trunc->add->sext/zext->phi update chain.
+//
+// 3) Outline common code with createAddRecFromPHI to avoid duplication.
+//
+Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) {
+  SmallVector<const SCEVPredicate *, 3> Predicates;
+
+  // *** Part1: Analyze if we have a phi-with-cast pattern for which we can
+  // return an AddRec expression under some predicate.
+
+  auto *PN = cast<PHINode>(SymbolicPHI->getValue());
+  const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
+  assert(L && "Expecting an integer loop header phi");
+
+  // The loop may have multiple entrances or multiple exits; we can analyze
+  // this phi as an addrec if it has a unique entry value and a unique
+  // backedge value.
+  Value *BEValueV = nullptr, *StartValueV = nullptr;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *V = PN->getIncomingValue(i);
+    if (L->contains(PN->getIncomingBlock(i))) {
+      if (!BEValueV) {
+        BEValueV = V;
+      } else if (BEValueV != V) {
+        BEValueV = nullptr;
+        break;
+      }
+    } else if (!StartValueV) {
+      StartValueV = V;
+    } else if (StartValueV != V) {
+      StartValueV = nullptr;
+      break;
+    }
+  }
+  if (!BEValueV || !StartValueV)
+    return None;
+
+  const SCEV *BEValue = getSCEV(BEValueV);
+
+  // If the value coming around the backedge is an add with the symbolic
+  // value we just inserted, possibly with casts that we can ignore under
+  // an appropriate runtime guard, then we found a simple induction variable!
+  const auto *Add = dyn_cast<SCEVAddExpr>(BEValue);
+  if (!Add)
+    return None;
+
+  // If there is a single occurrence of the symbolic value, possibly
+  // casted, replace it with a recurrence.
+  unsigned FoundIndex = Add->getNumOperands();
+  Type *TruncTy = nullptr;
+  bool Signed;
+  for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+    if ((TruncTy =
+             isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this)))
+      if (FoundIndex == e) {
+        FoundIndex = i;
+        break;
+      }
+
+  if (FoundIndex == Add->getNumOperands())
+    return None;
+
+  // Create an add with everything but the specified operand.
+  SmallVector<const SCEV *, 8> Ops;
+  for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+    if (i != FoundIndex)
+      Ops.push_back(Add->getOperand(i));
+  const SCEV *Accum = getAddExpr(Ops);
+
+  // The runtime checks will not be valid if the step amount is
+  // varying inside the loop.
+  if (!isLoopInvariant(Accum, L))
+    return None;
+
+
+  // *** Part2: Create the predicates
+
+  // Analysis was successful: we have a phi-with-cast pattern for which we
+  // can return an AddRec expression under the following predicates:
+  //
+  // P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum)
+  //     fits within the truncated type (does not overflow) for i = 0 to n-1.
+  // P2: An Equal predicate that guarantees that
+  //     Start = (Ext ix (Trunc iy (Start) to ix) to iy)
+  // P3: An Equal predicate that guarantees that
+  //     Accum = (Ext ix (Trunc iy (Accum) to ix) to iy)
+  //
+  // As we next prove, the above predicates guarantee that:
+  //     Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy)
+  //
+  //
+  // More formally, we want to prove that:
+  //     Expr(i+1) = Start + (i+1) * Accum
+  //               = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
+  //
+  // Given that:
+  // 1) Expr(0) = Start
+  // 2) Expr(1) = Start + Accum
+  //            = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2
+  // 3) Induction hypothesis (step i):
+  //    Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum
+  //
+  // Proof:
+  //  Expr(i+1) =
+  //   = Start + (i+1)*Accum
+  //   = (Start + i*Accum) + Accum
+  //   = Expr(i) + Accum
+  //   = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum
+  //                                                             :: from step i
+  //
+  //   = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum
+  //
+  //   = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy)
+  //     + (Ext ix (Trunc iy (Accum) to ix) to iy)
+  //     + Accum                                                 :: from P3
+  //
+  //   = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy)
+  //     + Accum                            :: from P1: Ext(x)+Ext(y)=>Ext(x+y)
+  //
+  //   = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum
+  //   = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
+  //
+  // By induction, the same applies to all iterations 1<=i<n.
+  //
+
+  // Create a truncated addrec for which we will add a no overflow check (P1).
+  const SCEV *StartVal = getSCEV(StartValueV);
+  const SCEV *PHISCEV =
+      getAddRecExpr(getTruncateExpr(StartVal, TruncTy),
+                    getTruncateExpr(Accum, TruncTy), L, SCEV::FlagAnyWrap);
+  const auto *AR = cast<SCEVAddRecExpr>(PHISCEV);
+
+  SCEVWrapPredicate::IncrementWrapFlags AddedFlags =
+      Signed ? SCEVWrapPredicate::IncrementNSSW
+             : SCEVWrapPredicate::IncrementNUSW;
+  const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags);
+  Predicates.push_back(AddRecPred);
+
+  // Create the Equal Predicates P2,P3:
+  auto AppendPredicate = [&](const SCEV *Expr) -> void {
+    assert(isLoopInvariant(Expr, L) && "Expr is expected to be invariant");
+    const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy);
+    const SCEV *ExtendedExpr =
+        Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType())
+               : getZeroExtendExpr(TruncatedExpr, Expr->getType());
+    if (Expr != ExtendedExpr &&
+        !isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) {
+      const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr);
+      DEBUG(dbgs() << "Added Predicate: " << *Pred);
+      Predicates.push_back(Pred);
+    }
+  };
+
+  AppendPredicate(StartVal);
+  AppendPredicate(Accum);
+
+  // *** Part3: Predicates are ready. Now go ahead and create the new addrec in
+  // which the casts had been folded away. The caller can rewrite SymbolicPHI
+  // into NewAR if it will also add the runtime overflow checks specified in
+  // Predicates.
+  auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap);
+
+  std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> PredRewrite =
+      std::make_pair(NewAR, Predicates);
+  // Remember the result of the analysis for this SCEV at this location.
+  PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite;
+  return PredRewrite;
+}
+
+Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) {
+
+  auto *PN = cast<PHINode>(SymbolicPHI->getValue());
+  const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
+  if (!L)
+    return None;
+
+  // Check to see if we already analyzed this PHI.
+  auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L});
+  if (I != PredicatedSCEVRewrites.end()) {
+    std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> Rewrite =
+        I->second;
+    // Analysis was done before and failed to create an AddRec:
+    if (Rewrite.first == SymbolicPHI)
+      return None;
+    // Analysis was done before and succeeded to create an AddRec under
+    // a predicate:
+    assert(isa<SCEVAddRecExpr>(Rewrite.first) && "Expected an AddRec");
+    assert(!(Rewrite.second).empty() && "Expected to find Predicates");
+    return Rewrite;
+  }
+
+  Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+      Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI);
+
+  // Record in the cache that the analysis failed
+  if (!Rewrite) {
+    SmallVector<const SCEVPredicate *, 3> Predicates;
+    PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates};
+    return None;
+  }
+
+  return Rewrite;
+}
+
 /// A helper function for createAddRecFromPHI to handle simple cases.
 ///
 /// This function tries to find an AddRec expression for the simplest (yet most
@@ -5904,6 +6217,16 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
   RemoveLoopFromBackedgeMap(BackedgeTakenCounts);
   RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts);
 
+  // Drop information about predicated SCEV rewrites for this loop.
+  for (auto I = PredicatedSCEVRewrites.begin();
+       I != PredicatedSCEVRewrites.end();) {
+    std::pair<const SCEV *, const Loop *> Entry = I->first;
+    if (Entry.second == L)
+      PredicatedSCEVRewrites.erase(I++);
+    else
+      ++I;
+  }
+
   // Drop information about expressions based on loop-header PHIs.
   SmallVector<Instruction *, 16> Worklist;
   PushLoopPHIs(L, Worklist);
@@ -10062,6 +10385,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
       UniqueSCEVs(std::move(Arg.UniqueSCEVs)),
       UniquePreds(std::move(Arg.UniquePreds)),
       SCEVAllocator(std::move(Arg.SCEVAllocator)),
+      PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)),
      FirstUnknown(Arg.FirstUnknown) {
   Arg.FirstUnknown = nullptr;
 }
@@ -10462,6 +10786,15 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
   HasRecMap.erase(S);
   MinTrailingZerosCache.erase(S);
 
+  for (auto I = PredicatedSCEVRewrites.begin();
+       I != PredicatedSCEVRewrites.end();) {
+    std::pair<const SCEV *, const Loop *> Entry = I->first;
+    if (Entry.first == S)
+      PredicatedSCEVRewrites.erase(I++);
+    else
+      ++I;
+  }
+
   auto RemoveSCEVFromBackedgeMap =
       [S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
         for (auto I = Map.begin(), E = Map.end(); I != E;) {
@@ -10621,10 +10954,11 @@ void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>();
 }
 
-const SCEVPredicate *
-ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS,
-                                   const SCEVConstant *RHS) {
+const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS,
+                                                        const SCEV *RHS) {
   FoldingSetNodeID ID;
+  assert(LHS->getType() == RHS->getType() &&
+         "Type mismatch between LHS and RHS");
   // Unique this node based on the arguments
   ID.AddInteger(SCEVPredicate::P_Equal);
   ID.AddPointer(LHS);
@@ -10687,8 +11021,7 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
       if (IPred->getLHS() == Expr)
         return IPred->getRHS();
     }
-
-    return Expr;
+    return convertToAddRecWithPreds(Expr);
   }
 
   const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
@@ -10724,17 +11057,41 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
   }
 
 private:
-  bool addOverflowAssumption(const SCEVAddRecExpr *AR,
-                             SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
-    auto *A = SE.getWrapPredicate(AR, AddedFlags);
+  bool addOverflowAssumption(const SCEVPredicate *P) {
     if (!NewPreds) {
       // Check if we've already made this assumption.
-      return Pred && Pred->implies(A);
+      return Pred && Pred->implies(P);
     }
-    NewPreds->insert(A);
+    NewPreds->insert(P);
     return true;
   }
 
+  bool addOverflowAssumption(const SCEVAddRecExpr *AR,
+                             SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
+    auto *A = SE.getWrapPredicate(AR, AddedFlags);
+    return addOverflowAssumption(A);
+  }
+
+  // If \p Expr represents a PHINode, we try to see if it can be represented
+  // as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible
+  // to add this predicate as a runtime overflow check, we return the AddRec.
+  // If \p Expr does not meet these conditions (is not a PHI node, or we
+  // couldn't create an AddRec for it, or couldn't add the predicate), we just
+  // return \p Expr.
+  const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) {
+    if (!isa<PHINode>(Expr->getValue()))
+      return Expr;
+    Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+        PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr);
+    if (!PredicatedRewrite)
+      return Expr;
+    for (auto *P : PredicatedRewrite->second) {
+      if (!addOverflowAssumption(P))
+        return Expr;
+    }
+    return PredicatedRewrite->first;
+  }
+
   SmallPtrSetImpl<const SCEVPredicate *> *NewPreds;
   SCEVUnionPredicate *Pred;
   const Loop *L;
@@ -10771,9 +11128,11 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
     : FastID(ID), Kind(Kind) {}
 
 SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
-                                       const SCEVUnknown *LHS,
-                                       const SCEVConstant *RHS)
-    : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {}
+                                       const SCEV *LHS, const SCEV *RHS)
+    : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {
+  assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match");
+  assert(LHS != RHS && "LHS and RHS are the same SCEV");
+}
 
 bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
   const auto *Op = dyn_cast<SCEVEqualPredicate>(N);
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 94bbc58541a7..25813c65037f 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -82,6 +82,11 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
   return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
 }
 
+int TargetTransformInfo::getExtCost(const Instruction *I,
+                                    const Value *Src) const {
+  return TTIImpl->getExtCost(I, Src);
+}
+
 int TargetTransformInfo::getIntrinsicCost(
     Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
   int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 428bb21fbf51..90e0d6a216ee 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -588,7 +588,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(spir_func);
   KEYWORD(intel_ocl_bicc);
   KEYWORD(x86_64_sysvcc);
-  KEYWORD(x86_64_win64cc);
+  KEYWORD(win64cc);
   KEYWORD(x86_regcallcc);
   KEYWORD(webkit_jscc);
   KEYWORD(swiftcc);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 717eb0e00f4f..13679ce1d25c 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -1670,7 +1670,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///   ::= 'spir_func'
 ///   ::= 'spir_kernel'
 ///   ::= 'x86_64_sysvcc'
-///   ::= 'x86_64_win64cc'
+///   ::= 'win64cc'
 ///   ::= 'webkit_jscc'
 ///   ::= 'anyregcc'
 ///   ::= 'preserve_mostcc'
@@ -1712,7 +1712,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
   case lltok::kw_x86_64_sysvcc:  CC = 
CallingConv::X86_64_SysV; break; - case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break; + case lltok::kw_win64cc: CC = CallingConv::Win64; break; case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break; case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break; @@ -4411,13 +4411,15 @@ bool LLParser::ParseDIImportedEntity(MDNode *&Result, bool IsDistinct) { REQUIRED(tag, DwarfTagField, ); \ REQUIRED(scope, MDField, ); \ OPTIONAL(entity, MDField, ); \ + OPTIONAL(file, MDField, ); \ OPTIONAL(line, LineField, ); \ OPTIONAL(name, MDStringField, ); PARSE_MD_FIELDS(); #undef VISIT_MD_FIELDS - Result = GET_OR_DISTINCT(DIImportedEntity, (Context, tag.Val, scope.Val, - entity.Val, line.Val, name.Val)); + Result = GET_OR_DISTINCT( + DIImportedEntity, + (Context, tag.Val, scope.Val, entity.Val, file.Val, line.Val, name.Val)); return false; } diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 9c7a06de81b4..0f3707ba0d1e 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -141,7 +141,7 @@ enum Kind { kw_spir_kernel, kw_spir_func, kw_x86_64_sysvcc, - kw_x86_64_win64cc, + kw_win64cc, kw_webkit_jscc, kw_anyregcc, kw_swiftcc, diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index b1504a8034e0..10fbcdea784f 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1671,15 +1671,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( break; } case bitc::METADATA_IMPORTED_ENTITY: { - if (Record.size() != 6) + if (Record.size() != 6 && Record.size() != 7) return error("Invalid record"); IsDistinct = Record[0]; + bool HasFile = (Record.size() == 7); MetadataList.assignValue( GET_OR_DISTINCT(DIImportedEntity, (Context, Record[1], getMDOrNull(Record[2]), - getDITypeRefOrNull(Record[3]), Record[4], - getMDString(Record[5]))), + getDITypeRefOrNull(Record[3]), + HasFile ? getMDOrNull(Record[6]) : nullptr, + HasFile ? Record[4] : 0, getMDString(Record[5]))), NextMetadataNo); NextMetadataNo++; break; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 0e518d2bbc8f..dcffde1742cd 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1718,6 +1718,7 @@ void ModuleBitcodeWriter::writeDIImportedEntity( Record.push_back(VE.getMetadataOrNullID(N->getEntity())); Record.push_back(N->getLine()); Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(VE.getMetadataOrNullID(N->getRawFile())); Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev); Record.clear(); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index d4a90eeabe15..676c48fe5c67 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -664,8 +664,9 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( else EntityDie = getDIE(Entity); assert(EntityDie); - addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(), - Module->getScope()->getDirectory()); + auto *File = Module->getFile(); + addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "", + File ? 
File->getDirectory() : "");
   addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
   StringRef Name = Module->getName();
   if (!Name.empty())
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index b7155ac2480a..45dc13d58de7 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -4267,9 +4267,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   // Use a worklist to iteratively look through PHI nodes, and ensure that
   // the addressing mode obtained from the non-PHI roots of the graph
   // are equivalent.
-  Value *Consensus = nullptr;
-  unsigned NumUsesConsensus = 0;
-  bool IsNumUsesConsensusValid = false;
+  bool AddrModeFound = false;
   bool PhiSeen = false;
   SmallVector<Instruction*, 16> AddrModeInsts;
   ExtAddrMode AddrMode;
@@ -4280,11 +4278,17 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     Value *V = worklist.back();
     worklist.pop_back();
 
-    // Break use-def graph loops.
-    if (!Visited.insert(V).second) {
-      Consensus = nullptr;
-      break;
-    }
+    // We allow traversing cyclic Phi nodes.
+    // In case of success after this loop we ensure that traversing through
+    // Phi nodes ends up with all cases to compute address of the form
+    //    BaseGV + Base + Scale * Index + Offset
+    // where Scale and Offset are constants and BaseGV, Base and Index
+    // are exactly the same Values in all cases.
+    // It means that BaseGV, Scale and Offset dominate our memory instruction
+    // and have the same value as they had in address computation represented
+    // as Phi. So we can safely sink address computation to memory instruction.
+    if (!Visited.insert(V).second)
+      continue;
 
     // For a PHI node, push all of its incoming values.
     if (PHINode *P = dyn_cast<PHINode>(V)) {
@@ -4297,47 +4301,26 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     // For non-PHIs, determine the addressing mode being computed. Note that
     // the result may differ depending on what other uses our candidate
     // addressing instructions might have.
-    SmallVector<Instruction*, 16> NewAddrModeInsts;
+    AddrModeInsts.clear();
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-        V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI,
-        InsertedInsts, PromotedInsts, TPT);
+        V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
+        InsertedInsts, PromotedInsts, TPT);
 
-    // This check is broken into two cases with very similar code to avoid using
-    // getNumUses() as much as possible. Some values have a lot of uses, so
-    // calling getNumUses() unconditionally caused a significant compile-time
-    // regression.
-    if (!Consensus) {
-      Consensus = V;
+    if (!AddrModeFound) {
+      AddrModeFound = true;
       AddrMode = NewAddrMode;
-      AddrModeInsts = NewAddrModeInsts;
-      continue;
-    } else if (NewAddrMode == AddrMode) {
-      if (!IsNumUsesConsensusValid) {
-        NumUsesConsensus = Consensus->getNumUses();
-        IsNumUsesConsensusValid = true;
-      }
-
-      // Ensure that the obtained addressing mode is equivalent to that obtained
-      // for all other roots of the PHI traversal. Also, when choosing one
-      // such root as representative, select the one with the most uses in order
-      // to keep the cost modeling heuristics in AddressingModeMatcher
-      // applicable.
- unsigned NumUses = V->getNumUses(); - if (NumUses > NumUsesConsensus) { - Consensus = V; - NumUsesConsensus = NumUses; - AddrModeInsts = NewAddrModeInsts; - } continue; } + if (NewAddrMode == AddrMode) + continue; - Consensus = nullptr; + AddrModeFound = false; break; } // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. - if (!Consensus) { + if (!AddrModeFound) { TPT.rollback(LastKnownGood); return false; } @@ -4847,25 +4830,7 @@ bool CodeGenPrepare::canFormExtLd( if (!HasPromoted && LI->getParent() == Inst->getParent()) return false; - EVT VT = TLI->getValueType(*DL, Inst->getType()); - EVT LoadVT = TLI->getValueType(*DL, LI->getType()); - - // If the load has other users and the truncate is not free, this probably - // isn't worthwhile. - if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) && - !TLI->isTruncateFree(Inst->getType(), LI->getType())) - return false; - - // Check whether the target supports casts folded into loads. - unsigned LType; - if (isa(Inst)) - LType = ISD::ZEXTLOAD; - else { - assert(isa(Inst) && "Unexpected ext type!"); - LType = ISD::SEXTLOAD; - } - - return TLI->isLoadExtLegal(LType, VT, LoadVT); + return TLI->isExtLoad(LI, Inst, *DL); } /// Move a zext or sext fed by a load into the same basic block as the load, diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 49fb5e8f075b..5258370e6680 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -433,9 +433,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { } case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: + case TargetOpcode::G_SREM: + case TargetOpcode::G_UREM: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: { unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_SREM || MI.getOpcode() == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index c176de16b593..e6f80dbb8630 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -11,8 +11,6 @@ // instructions do not lengthen the critical path or the resource depth. 
//===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "machine-combiner"
-
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -32,6 +30,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "machine-combiner"
+
 STATISTIC(NumInstCombined, "Number of machineinst combined");
 
 namespace {
diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp
index 28ecc8f96805..b559e4e513a6 100644
--- a/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -15,7 +15,8 @@
 using namespace llvm;
 
 namespace llvm {
-template class DominanceFrontierBase<MachineBasicBlock>;
+template class DominanceFrontierBase<MachineBasicBlock, false>;
+template class DominanceFrontierBase<MachineBasicBlock, true>;
 template class ForwardDominanceFrontierBase<MachineBasicBlock>;
 }
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 65e9e5d195a4..845e8232477c 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -31,7 +31,7 @@ static cl::opt<bool, true> VerifyMachineDomInfoX(
 namespace llvm {
 template class DomTreeNodeBase<MachineBasicBlock>;
-template class DominatorTreeBase<MachineBasicBlock>;
+template class DominatorTreeBase<MachineBasicBlock, false>; // DomTreeBase
 }
 
 char MachineDominatorTree::ID = 0;
@@ -49,7 +49,7 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
 bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
   CriticalEdgesToSplit.clear();
   NewBBs.clear();
-  DT.reset(new DominatorTreeBase<MachineBasicBlock>(false));
+  DT.reset(new DomTreeBase<MachineBasicBlock>());
   DT->recalculate(F);
   return false;
 }
@@ -144,7 +144,7 @@ void MachineDominatorTree::verifyDomTree() const {
     return;
 
   MachineFunction &F = *getRoot()->getParent();
-  DominatorTreeBase<MachineBasicBlock> OtherDT(false);
+  DomTreeBase<MachineBasicBlock> OtherDT;
   OtherDT.recalculate(F);
   if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
       DT->compare(OtherDT)) {
diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp
index c3f6e9249e7d..488377998cb3 100644
--- a/lib/CodeGen/MachinePostDominators.cpp
+++ b/lib/CodeGen/MachinePostDominators.cpp
@@ -16,6 +16,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase
+}
+
 char MachinePostDominatorTree::ID = 0;
 
 //declare initializeMachinePostDominatorTreePass
@@ -24,8 +28,7 @@ INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
 
 MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
   initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
-  DT = new DominatorTreeBase<MachineBasicBlock>(true); //true indicate
-                                                       // postdominator
+  DT = new PostDomTreeBase<MachineBasicBlock>();
 }
 
 FunctionPass *
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 71382c18fdf9..d5d3f7a61a9f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3889,9 +3889,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   // Note: the SimplifyDemandedBits fold below can make an information-losing
   // transform, and then we have no way to find this better fold.
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { - ConstantSDNode *SubLHS = isConstOrConstSplat(N0.getOperand(0)); - SDValue SubRHS = N0.getOperand(1); - if (SubLHS && SubLHS->isNullValue()) { + if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { + SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; @@ -4586,6 +4585,20 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, return nullptr; } +// if Left + Right == Sum (constant or constant splat vector) +static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum, + SelectionDAG &DAG, const SDLoc &DL) { + EVT ShiftVT = Left.getValueType(); + if (ShiftVT != Right.getValueType()) return false; + + SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT, + Left.getNode(), Right.getNode()); + if (!ShiftSum) return false; + + ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum); + return CSum && CSum->getZExtValue() == Sum; +} + // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. @@ -4631,30 +4644,24 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) { - uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue(); - uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue(); - if ((LShVal + RShVal) != EltSizeInBits) - return nullptr; - + if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { - SDValue Mask = DAG.getAllOnesConstant(DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue Mask = AllOnes; if (LHSMask.getNode()) { - APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal); + SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, LHSMask, - DAG.getConstant(RHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { - APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal); + SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, - DAG.getNode(ISD::OR, DL, VT, RHSMask, - DAG.getConstant(LHSBits, DL, VT))); + DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); @@ -5272,11 +5279,21 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x if (isNullConstantOrNullSplatConstant(N1)) return N0; + // fold (rot x, c) -> (rot x, c % BitSize) + if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { + if (Cst->getAPIntValue().uge(Bitsize)) { + uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); + return DAG.getNode(N->getOpcode(), dl, VT, N0, + DAG.getConstant(RotAmt, dl, N1.getValueType())); + } + } + // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). 
if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { @@ -5286,22 +5303,24 @@ SDValue DAGCombiner::visitRotate(SDNode *N) { unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) - if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) - if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) - if (SDNode *C2 = - DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { - bool SameSide = (N->getOpcode() == NextOp); - unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; - if (SDValue CombinedShift = - DAG.FoldConstantArithmetic(CombineOp, dl, VT, C1, C2)) { - unsigned Bitsize = VT.getScalarSizeInBits(); - SDValue BitsizeC = DAG.getConstant(Bitsize, dl, VT); - SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( - ISD::SREM, dl, VT, CombinedShift.getNode(), BitsizeC.getNode()); - return DAG.getNode( - N->getOpcode(), dl, VT, N0->getOperand(0), CombinedShiftNorm); - } + if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { + SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); + SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); + if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { + EVT ShiftVT = C1->getValueType(0); + bool SameSide = (N->getOpcode() == NextOp); + unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; + if (SDValue CombinedShift = + DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { + SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); + SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( + ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), + BitsizeC.getNode()); + return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), + CombinedShiftNorm); } + } + } return SDValue(); } @@ -7152,8 +7171,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); } } @@ -7210,8 +7235,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } @@ -7451,8 +7481,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, ExtLoad); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. 
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -7503,8 +7539,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDLoc(N0.getOperand(0)), N0.getOperand(0).getValueType(), ExtLoad); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND); - CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1)); - return CombineTo(N, And); // Return N so it doesn't get rechecked! + bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); + CombineTo(N, And); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N,0); // Return N so it doesn't get rechecked! } } } @@ -7676,13 +7717,18 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); - CombineTo(N, ExtLoad); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); - CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1)); ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ANY_EXTEND); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceTrunc = N0.hasOneUse(); + CombineTo(N, ExtLoad); + if (NoReplaceTrunc) + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); + else + CombineTo(LN0, Trunc, ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! } } @@ -11373,12 +11419,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); - // Make sure the new and old chains are cleaned up. - AddToWorklist(Token.getNode()); - - // Replace uses with load result and token factor. Don't add users - // to work list. - return CombineTo(N, ReplLoad.getValue(0), Token, false); + // Replace uses with load result and token factor + return CombineTo(N, ReplLoad.getValue(0), Token); } } @@ -12744,7 +12786,12 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); + unsigned Elts = i + 1; + if (MemVT.isVector()) { + // When merging vector stores, get the total number of elements. + Elts *= MemVT.getVectorNumElements(); + } + EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, @@ -13003,7 +13050,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); - MachineMemOperand::Flags MMOFlags = isDereferenceable ? + MachineMemOperand::Flags MMOFlags = isDereferenceable ? 
MachineMemOperand::MODereferenceable: MachineMemOperand::MONone; @@ -16703,6 +16750,20 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); + // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be + // able to calculate their relative offset if at least one arises + // from an alloca. However, these allocas cannot overlap and we + // can infer there is no alias. + if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase())) + if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + // If the bases are the same frame index but we couldn't find a + // constant offset, (indices are different) be conservative. + if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || + !MFI.isFixedObjectIndex(B->getIndex()))) + return false; + } + // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis // modified to use BaseIndexOffset. diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index ac3247948169..75fec7bd1d48 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1827,10 +1827,11 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); + if (hasOVF) { EVT OvfVT = getSetCCResultType(NVT); SDVTList VTList = DAG.getVTList(NVT, OvfVT); - TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; @@ -1863,6 +1864,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); + + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) { + SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT); + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + return; + } + SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); @@ -1877,9 +1885,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); - SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, - DAG.getConstant(1, dl, NVT), - DAG.getConstant(0, dl, NVT)); + + SDValue Borrow; + if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) + Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT); + else + Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), + DAG.getConstant(0, dl, NVT)); + Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp index 1a8d5a4f45da..0b4c6e551667 100644 --- a/lib/CodeGen/XRayInstrumentation.cpp +++ b/lib/CodeGen/XRayInstrumentation.cpp @@ -142,9 +142,9 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return false; // Invalid value for threshold. // Count the number of MachineInstr`s in MachineFunction - int64_t MICount = 0; - for (const auto& MBB : MF) - MICount += MBB.size(); + int64_t MICount = 0; + for (const auto& MBB : MF) + MICount += MBB.size(); // Check if we have a loop.
// FIXME: Maybe make this smarter, and see whether the loops are dependent diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 22f166a2335d..79b9fdefd40e 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/BinaryStreamReader.h" @@ -42,13 +41,6 @@ static Error visitKnownMember(CVMemberRecord &Record, return Error::success(); } -static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) { - TypeServer2Record R(TypeRecordKind::TypeServer2); - if (auto EC = TypeDeserializer::deserializeAs(Record, R)) - return std::move(EC); - return R; -} - static Error visitMemberRecord(CVMemberRecord &Record, TypeVisitorCallbacks &Callbacks) { if (auto EC = Callbacks.visitMemberBegin(Record)) @@ -84,8 +76,6 @@ class CVTypeVisitor { public: explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks); - void addTypeServerHandler(TypeServerHandler &Handler); - Error visitTypeRecord(CVType &Record, TypeIndex Index); Error visitTypeRecord(CVType &Record); @@ -98,45 +88,15 @@ class CVTypeVisitor { Error visitFieldListMemberStream(BinaryStreamReader &Stream); private: - Expected<bool> handleTypeServer(CVType &Record); Error finishVisitation(CVType &Record); /// The interface to the class that gets notified of each visitation. TypeVisitorCallbacks &Callbacks; - - TinyPtrVector<TypeServerHandler *> Handlers; }; CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks) : Callbacks(Callbacks) {} -void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) { - Handlers.push_back(&Handler); -} - -Expected<bool> CVTypeVisitor::handleTypeServer(CVType &Record) { - if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) { - auto TS = deserializeTypeServerRecord(Record); - if (!TS) - return TS.takeError(); - - for (auto Handler : Handlers) { - auto ExpectedResult = Handler->handle(*TS, Callbacks); - // If there was an error, return the error. - if (!ExpectedResult) - return ExpectedResult.takeError(); - - // If the handler processed the record, return success. - if (*ExpectedResult) - return true; - - // Otherwise keep searching for a handler, eventually falling out and - // using the default record handler.
- } - } - return false; -} - Error CVTypeVisitor::finishVisitation(CVType &Record) { switch (Record.Type) { default: @@ -163,12 +123,6 @@ Error CVTypeVisitor::finishVisitation(CVType &Record) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record, Index)) return EC; @@ -176,12 +130,6 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { } Error CVTypeVisitor::visitTypeRecord(CVType &Record) { - auto ExpectedResult = handleTypeServer(Record); - if (!ExpectedResult) - return ExpectedResult.takeError(); - if (*ExpectedResult) - return Error::success(); - if (auto EC = Callbacks.visitTypeBegin(Record)) return EC; @@ -271,52 +219,37 @@ struct VisitHelper { Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record, Index); } Error llvm::codeview::visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeRecord(Record); } Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, - VisitorDataSource Source, - TypeServerHandler *TS) { + VisitorDataSource Source) { VisitHelper V(Callbacks, Source); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(CVTypeRange Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } Error llvm::codeview::visitTypeStream(TypeCollection &Types, - TypeVisitorCallbacks &Callbacks, - TypeServerHandler *TS) { + TypeVisitorCallbacks &Callbacks) { // When the internal visitor calls Types.getType(Index) the interface is // required to return a CVType with the bytes filled out. So we can assume // that the bytes will be present when individual records are visited. 
VisitHelper V(Callbacks, VDS_BytesPresent); - if (TS) - V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); } diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp index 711144fc2faa..4fc14480578e 100644 --- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp +++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp @@ -168,18 +168,19 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) { return Error::success(); } -Error CodeViewRecordIO::mapGuid(StringRef &Guid) { +Error CodeViewRecordIO::mapGuid(GUID &Guid) { constexpr uint32_t GuidSize = 16; if (maxFieldLength() < GuidSize) return make_error<CodeViewError>(cv_error_code::insufficient_buffer); if (isWriting()) { - assert(Guid.size() == 16 && "Invalid Guid Size!"); - if (auto EC = Writer->writeFixedString(Guid)) + if (auto EC = Writer->writeBytes(Guid.Guid)) return EC; } else { - if (auto EC = Reader->readFixedString(Guid, 16)) + ArrayRef<uint8_t> GuidBytes; + if (auto EC = Reader->readBytes(GuidBytes, GuidSize)) return EC; + memcpy(Guid.Guid, GuidBytes.data(), GuidSize); } return Error::success(); } diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp index 1fa8d219d6ac..b8d89c76da3b 100644 --- a/lib/DebugInfo/CodeView/Formatters.cpp +++ b/lib/DebugInfo/CodeView/Formatters.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/Formatters.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/DebugInfo/CodeView/GUID.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> @@ -39,3 +40,9 @@ void GuidAdapter::format(raw_ostream &Stream, StringRef Style) { } Stream << "}"; } + +raw_ostream &llvm::codeview::operator<<(raw_ostream &OS, const GUID &Guid) { + codeview::detail::GuidAdapter A(Guid.Guid); + A.format(OS, ""); + return OS; +} diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index c2c02f8de03f..62e73acc72d6 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -186,7 +186,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BuildInfoSym &BuildInfo) { - W.printNumber("BuildId", BuildInfo.BuildId); + printTypeIndex("BuildId", BuildInfo.BuildId); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 589966705015..e18a35ca1f38 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -354,7 +354,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { } Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { - W->printString("Guid", formatv("{0}", fmt_guid(TS.getGuid())).str()); + W->printString("Guid", formatv("{0}", TS.getGuid()).str()); W->printNumber("Age", TS.getAge()); W->printString("Name", TS.getName()); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 71a0966df036..bff3516203a0 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -10,13 +10,11 @@ #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include
"llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" @@ -57,56 +55,35 @@ namespace { /// streams: an item (or IPI) stream and a type stream, as this is what is /// actually stored in the final PDB. We choose which records go where by /// looking at the record kind. -class TypeStreamMerger : public TypeVisitorCallbacks { +class TypeStreamMerger { public: - explicit TypeStreamMerger(SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler) - : Handler(Handler), IndexMap(SourceToDest) { + explicit TypeStreamMerger(SmallVectorImpl &SourceToDest) + : IndexMap(SourceToDest) { SourceToDest.clear(); } static const TypeIndex Untranslated; - Error visitTypeBegin(CVType &Record) override; - Error visitTypeEnd(CVType &Record) override; - Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes); + const CVTypeArray &IdsAndTypes); Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef TypeSourceToDest, - const CVTypeArray &Ids); + const CVTypeArray &Ids); Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types); private: Error doit(const CVTypeArray &Types); + Error remapAllTypes(const CVTypeArray &Types); + + Error remapType(const CVType &Type); + void addMapping(TypeIndex Idx); bool remapTypeIndex(TypeIndex &Idx); bool remapItemIndex(TypeIndex &Idx); - bool remapIndices(RemappedType &Record, ArrayRef Refs) { - auto OriginalData = Record.OriginalRecord.content(); - bool Success = true; - for (auto &Ref : Refs) { - uint32_t Offset = Ref.Offset; - ArrayRef Bytes = - OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); - ArrayRef TIs(reinterpret_cast(Bytes.data()), - Ref.Count); - for (auto TI : TIs) { - TypeIndex NewTI = TI; - bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) - ? remapItemIndex(NewTI) - : remapTypeIndex(NewTI); - if (ThisSuccess && NewTI != TI) - Record.Mappings.emplace_back(Offset, NewTI); - Offset += sizeof(TypeIndex); - Success &= ThisSuccess; - } - } - return Success; - } + bool remapIndices(RemappedType &Record, ArrayRef Refs); bool remapIndex(TypeIndex &Idx, ArrayRef Map); @@ -128,21 +105,6 @@ class TypeStreamMerger : public TypeVisitorCallbacks { return Error::success(); } - Error writeTypeRecord(const CVType &Record) { - TypeIndex DestIdx = - DestTypeStream->writeSerializedRecord(Record.RecordData); - addMapping(DestIdx); - return Error::success(); - } - - Error writeTypeRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestTypeStream, Record, RemapSuccess); - } - - Error writeIdRecord(const RemappedType &Record, bool RemapSuccess) { - return writeRecord(*DestIdStream, Record, RemapSuccess); - } - Optional LastError; bool IsSecondPass = false; @@ -153,7 +115,6 @@ class TypeStreamMerger : public TypeVisitorCallbacks { TypeTableBuilder *DestIdStream = nullptr; TypeTableBuilder *DestTypeStream = nullptr; - TypeServerHandler *Handler = nullptr; // If we're only mapping id records, this array contains the mapping for // type records. 
@@ -168,12 +129,8 @@ class TypeStreamMerger : public TypeVisitorCallbacks { const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated); -Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { - RemappedType R(Rec); - SmallVector<TiReference, 4> Refs; - discoverTypeIndices(Rec.RecordData, Refs); - bool Success = remapIndices(R, Refs); - switch (Rec.kind()) { +static bool isIdRecord(TypeLeafKind K) { + switch (K) { case TypeLeafKind::LF_FUNC_ID: case TypeLeafKind::LF_MFUNC_ID: case TypeLeafKind::LF_STRING_ID: @@ -181,19 +138,10 @@ Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { case TypeLeafKind::LF_BUILDINFO: case TypeLeafKind::LF_UDT_SRC_LINE: case TypeLeafKind::LF_UDT_MOD_SRC_LINE: - return writeIdRecord(R, Success); + return true; default: - return writeTypeRecord(R, Success); + return false; } - return Error::success(); -} - -Error TypeStreamMerger::visitTypeEnd(CVType &Rec) { - ++CurIndex; - if (!IsSecondPass) - assert(IndexMap.size() == slotForIndex(CurIndex) && - "visitKnownRecord should add one index map entry"); - return Error::success(); } void TypeStreamMerger::addMapping(TypeIndex Idx) { @@ -256,7 +204,7 @@ bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) { } Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, - const CVTypeArray &Types) { + const CVTypeArray &Types) { DestTypeStream = &Dest; return doit(Types); @@ -264,7 +212,7 @@ Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef<TypeIndex> TypeSourceToDest, - const CVTypeArray &Ids) { + const CVTypeArray &Ids) { DestIdStream = &Dest; TypeLookup = TypeSourceToDest; @@ -273,25 +221,14 @@ Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - const CVTypeArray &IdsAndTypes) { + const CVTypeArray &IdsAndTypes) { DestIdStream = &DestIds; DestTypeStream = &DestTypes; - return doit(IdsAndTypes); } Error TypeStreamMerger::doit(const CVTypeArray &Types) { - LastError = Error::success(); - - // We don't want to deserialize records. I guess this flag is poorly named, - // but it really means "Don't deserialize records before switching on the - // concrete type. - // FIXME: We can probably get even more speed here if we don't use the visitor - // pipeline here, but instead write the switch ourselves. I don't think it - // would buy us much since it's already pretty fast, but it's probably worth - // a few cycles. - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; // If we found bad indices but no other errors, try doing another pass and see // if we can resolve the indices that weren't resolvable in the first pass. // This may require multiple passes, but we should always make progress. MASM // is the only known CodeView producer that makes type streams that aren't // topologically sorted. The standard library contains MASM-produced objects, // so this is important to handle correctly, but we don't have to be too // efficient. MASM type streams are usually very small.
- while (!*LastError && NumBadIndices > 0) { + while (!LastError && NumBadIndices > 0) { unsigned BadIndicesRemaining = NumBadIndices; IsSecondPass = true; NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = - codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) + if (auto EC = remapAllTypes(Types)) return EC; assert(NumBadIndices <= BadIndicesRemaining && "second pass found more bad indices"); - if (!*LastError && NumBadIndices == BadIndicesRemaining) { + if (!LastError && NumBadIndices == BadIndicesRemaining) { return llvm::make_error<CodeViewError>( cv_error_code::corrupt_record, "input type graph contains cycles"); } } - Error Ret = std::move(*LastError); - LastError.reset(); - return Ret; + if (LastError) + return std::move(*LastError); + return Error::success(); +} + +Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) { + for (const CVType &Type : Types) + if (auto EC = remapType(Type)) + return EC; + return Error::success(); +} + +Error TypeStreamMerger::remapType(const CVType &Type) { + RemappedType R(Type); + SmallVector<TiReference, 4> Refs; + discoverTypeIndices(Type.RecordData, Refs); + bool MappedAllIndices = remapIndices(R, Refs); + TypeTableBuilder &Dest = + isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream; + if (auto EC = writeRecord(Dest, R, MappedAllIndices)) + return EC; + + ++CurIndex; + assert((IsSecondPass || IndexMap.size() == slotForIndex(CurIndex)) && + "visitKnownRecord should add one index map entry"); + return Error::success(); +} + +bool TypeStreamMerger::remapIndices(RemappedType &Record, + ArrayRef<TiReference> Refs) { + ArrayRef<uint8_t> OriginalData = Record.OriginalRecord.content(); + bool Success = true; + for (auto &Ref : Refs) { + uint32_t Offset = Ref.Offset; + ArrayRef<uint8_t> Bytes = OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); + ArrayRef<TypeIndex> TIs(reinterpret_cast<const TypeIndex *>(Bytes.data()), + Ref.Count); + for (auto TI : TIs) { + TypeIndex NewTI = TI; + bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) + ?
remapItemIndex(NewTI) + : remapTypeIndex(NewTI); + if (ThisSuccess && NewTI != TI) + Record.Mappings.emplace_back(Offset, NewTI); + Offset += sizeof(TypeIndex); + Success &= ThisSuccess; + } + } + return Success; } Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl<TypeIndex> &SourceToDest, - TypeServerHandler *Handler, - const CVTypeArray &Types) { - TypeStreamMerger M(SourceToDest, Handler); + const CVTypeArray &Types) { + TypeStreamMerger M(SourceToDest); return M.mergeTypeRecords(Dest, Types); } Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest, ArrayRef<TypeIndex> TypeSourceToDest, SmallVectorImpl<TypeIndex> &SourceToDest, - const CVTypeArray &Ids) { - TypeStreamMerger M(SourceToDest, nullptr); + const CVTypeArray &Ids) { + TypeStreamMerger M(SourceToDest); return M.mergeIdRecords(Dest, TypeSourceToDest, Ids); } Error llvm::codeview::mergeTypeAndIdRecords( TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, - SmallVectorImpl<TypeIndex> &SourceToDest, TypeServerHandler *Handler, - const CVTypeArray &IdsAndTypes) { - - TypeStreamMerger M(SourceToDest, Handler); + SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes) { + TypeStreamMerger M(SourceToDest); return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes); } diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 0a10e6b78911..6cf44ffa3796 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -24,22 +24,166 @@ using namespace llvm; using namespace dwarf; using namespace object; -void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, + uint32_t *Offset, unsigned UnitIndex, + uint8_t &UnitType, bool &isUnitDWARF64) { + uint32_t AbbrOffset, Length; + uint8_t AddrSize = 0; + uint16_t Version; + bool Success = true; + + bool ValidLength = false; + bool ValidVersion = false; + bool ValidAddrSize = false; + bool ValidType = true; + bool ValidAbbrevOffset = true; + + uint32_t OffsetStart = *Offset; + Length = DebugInfoData.getU32(Offset); + if (Length == UINT32_MAX) { + isUnitDWARF64 = true; + OS << format( + "Unit[%d] is in 64-bit DWARF format; cannot verify from this point.\n", + UnitIndex); + return false; + } + Version = DebugInfoData.getU16(Offset); + + if (Version >= 5) { + UnitType = DebugInfoData.getU8(Offset); + AddrSize = DebugInfoData.getU8(Offset); + AbbrOffset = DebugInfoData.getU32(Offset); + ValidType = DWARFUnit::isValidUnitType(UnitType); + } else { + UnitType = 0; + AbbrOffset = DebugInfoData.getU32(Offset); + AddrSize = DebugInfoData.getU8(Offset); + } + + if (!DCtx.getDebugAbbrev()->getAbbreviationDeclarationSet(AbbrOffset)) + ValidAbbrevOffset = false; + + ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3); + ValidVersion = DWARFContext::isSupportedVersion(Version); + ValidAddrSize = AddrSize == 4 || AddrSize == 8; + if (!ValidLength || !ValidVersion || !ValidAddrSize || !ValidAbbrevOffset || + !ValidType) { + Success = false; + OS << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, OffsetStart); + if (!ValidLength) + OS << "\tError: The length for this unit is too " + "large for the .debug_info provided.\n"; + if (!ValidVersion) + OS << "\tError: The 16 bit unit header version is not valid.\n"; + if (!ValidType) + OS << "\tError: The unit type encoding is not valid.\n"; + if (!ValidAbbrevOffset) + OS << "\tError: The offset into the .debug_abbrev section is " + "not valid.\n"; + if
(!ValidAddrSize) + OS << "\tError: The address size is unsupported.\n"; + } + *Offset = OffsetStart + Length + 4; + return Success; +} + +bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit) { + uint32_t NumUnitErrors = 0; + unsigned NumDies = Unit.getNumDIEs(); + for (unsigned I = 0; I < NumDies; ++I) { + auto Die = Unit.getDIEAtIndex(I); + if (Die.getTag() == DW_TAG_null) + continue; + for (auto AttrValue : Die.attributes()) { + NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue); + NumUnitErrors += verifyDebugInfoForm(Die, AttrValue); + } + } + return NumUnitErrors == 0; +} + +bool DWARFVerifier::handleDebugInfo() { + OS << "Verifying .debug_info Unit Header Chain...\n"; + + DWARFDataExtractor DebugInfoData(DCtx.getInfoSection(), DCtx.isLittleEndian(), + 0); + uint32_t NumDebugInfoErrors = 0; + uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0; + uint8_t UnitType = 0; + bool isUnitDWARF64 = false; + bool isHeaderChainValid = true; + bool hasDIE = DebugInfoData.isValidOffset(Offset); + while (hasDIE) { + OffsetStart = Offset; + if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, + isUnitDWARF64)) { + isHeaderChainValid = false; + if (isUnitDWARF64) + break; + } else { + std::unique_ptr<DWARFUnit> Unit; + switch (UnitType) { + case dwarf::DW_UT_type: + case dwarf::DW_UT_split_type: { + DWARFUnitSection<DWARFTypeUnit> TUSection{}; + Unit.reset(new DWARFTypeUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + &DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, TUSection, + nullptr)); + break; + } + case dwarf::DW_UT_skeleton: + case dwarf::DW_UT_split_compile: + case dwarf::DW_UT_compile: + case dwarf::DW_UT_partial: + // UnitType = 0 means that we are + // verifying a compile unit in DWARF v4. + case 0: { + DWARFUnitSection<DWARFCompileUnit> CUSection{}; + Unit.reset(new DWARFCompileUnit( + DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(), + &DCtx.getRangeSection(), DCtx.getStringSection(), + DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(), + DCtx.getLineSection(), DCtx.isLittleEndian(), false, CUSection, + nullptr)); + break; + } + default: { llvm_unreachable("Invalid UnitType."); } + } + Unit->extract(DebugInfoData, &OffsetStart); + if (!verifyUnitContents(*Unit)) + ++NumDebugInfoErrors; + } + hasDIE = DebugInfoData.isValidOffset(Offset); + ++UnitIdx; + } + if (UnitIdx == 0 && !hasDIE) { + OS << "Warning: .debug_info is empty.\n"; + isHeaderChainValid = true; + } + NumDebugInfoErrors += verifyDebugInfoReferences(); + return (isHeaderChainValid && NumDebugInfoErrors == 0); +} + +unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Attr = AttrValue.Attr; switch (Attr) { case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getRangeSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_ranges offset is beyond .debug_ranges " "bounds:\n"; Die.dump(OS, 0); OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_ranges encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -49,7 +193,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, // Make sure the offset in the DW_AT_stmt_list attribute is valid.
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { if (*SectionOffset >= DCtx.getLineSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_AT_stmt_list offset is beyond .debug_line " "bounds: " << format("0x%08" PRIx32, *SectionOffset) << "\n"; @@ -57,7 +201,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, OS << "\n"; } } else { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DIE has invalid DW_AT_stmt_list encoding:\n"; Die.dump(OS, 0); OS << "\n"; @@ -67,10 +211,12 @@ void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, - DWARFAttribute &AttrValue) { +unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, + DWARFAttribute &AttrValue) { + unsigned NumErrors = 0; const auto Form = AttrValue.Value.getForm(); switch (Form) { case DW_FORM_ref1: @@ -86,7 +232,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset(); auto CUOffset = AttrValue.Value.getRawUValue(); if (CUOffset >= CUSize) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: " << FormEncodingString(Form) << " CU offset " << format("0x%08" PRIx32, CUOffset) << " is invalid (must be less than CU size of " @@ -108,7 +254,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, assert(RefVal); if (RefVal) { if (*RefVal >= DCtx.getInfoSection().Data.size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_ref_addr offset beyond .debug_info " "bounds:\n"; Die.dump(OS, 0); @@ -125,7 +271,7 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, auto SecOffset = AttrValue.Value.getAsSectionOffset(); assert(SecOffset); // DW_FORM_strp is a section offset. if (SecOffset && *SecOffset >= DCtx.getStringSection().size()) { - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: DW_FORM_strp offset beyond .debug_str bounds:\n"; Die.dump(OS, 0); OS << "\n"; @@ -135,17 +281,19 @@ void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, default: break; } + return NumErrors; } -void DWARFVerifier::verifyDebugInfoReferences() { +unsigned DWARFVerifier::verifyDebugInfoReferences() { // Take all references and make sure they point to an actual DIE by // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; + unsigned NumErrors = 0; for (auto Pair : ReferenceToDIEOffsets) { auto Die = DCtx.getDIEForOffset(Pair.first); if (Die) continue; - ++NumDebugInfoErrors; + ++NumErrors; OS << "error: invalid DIE reference " << format("0x%08" PRIx64, Pair.first) << ". 
Offset is in between DIEs:\n"; for (auto Offset : Pair.second) { @@ -155,26 +303,7 @@ void DWARFVerifier::verifyDebugInfoReferences() { } OS << "\n"; } -} - -bool DWARFVerifier::handleDebugInfo() { - NumDebugInfoErrors = 0; - OS << "Verifying .debug_info...\n"; - for (const auto &CU : DCtx.compile_units()) { - unsigned NumDies = CU->getNumDIEs(); - for (unsigned I = 0; I < NumDies; ++I) { - auto Die = CU->getDIEAtIndex(I); - const auto Tag = Die.getTag(); - if (Tag == DW_TAG_null) - continue; - for (auto AttrValue : Die.attributes()) { - verifyDebugInfoAttribute(Die, AttrValue); - verifyDebugInfoForm(Die, AttrValue); - } - } - } - verifyDebugInfoReferences(); - return NumDebugInfoErrors == 0; + return NumErrors; } void DWARFVerifier::verifyDebugLineStmtOffsets() { diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt index ff01c948e099..9b1f37943e67 100644 --- a/lib/DebugInfo/PDB/CMakeLists.txt +++ b/lib/DebugInfo/PDB/CMakeLists.txt @@ -52,7 +52,6 @@ add_pdb_impl_folder(Native Native/PDBFileBuilder.cpp Native/PDBStringTable.cpp Native/PDBStringTableBuilder.cpp - Native/PDBTypeServerHandler.cpp Native/PublicsStream.cpp Native/PublicsStreamBuilder.cpp Native/RawError.cpp diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp index 0b48a366bd24..4c59d2f2a9d9 100644 --- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp +++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp @@ -125,16 +125,16 @@ PrivateGetDIAValue(IDiaSymbol *Symbol, return Result8; } -PDB_UniqueId +codeview::GUID PrivateGetDIAValue(IDiaSymbol *Symbol, HRESULT (__stdcall IDiaSymbol::*Method)(GUID *)) { GUID Result; if (S_OK != (Symbol->*Method)(&Result)) - return PDB_UniqueId(); + return codeview::GUID(); - static_assert(sizeof(PDB_UniqueId) == sizeof(GUID), - "PDB_UniqueId is the wrong size!"); - PDB_UniqueId IdResult; + static_assert(sizeof(codeview::GUID) == sizeof(GUID), + "GUID is the wrong size!"); + codeview::GUID IdResult; ::memcpy(&IdResult, &Result, sizeof(GUID)); return IdResult; } @@ -746,7 +746,7 @@ PDB_SymType DIARawSymbol::getSymTag() const { &IDiaSymbol::get_symTag); } -PDB_UniqueId DIARawSymbol::getGuid() const { +codeview::GUID DIARawSymbol::getGuid() const { return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_guid); } diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp index 789f3b813170..4fcecb92fd15 100644 --- a/lib/DebugInfo/PDB/GenericError.cpp +++ b/lib/DebugInfo/PDB/GenericError.cpp @@ -26,6 +26,8 @@ class GenericErrorCategory : public std::error_category { switch (static_cast(Condition)) { case generic_error_code::unspecified: return "An unknown error has occurred."; + case generic_error_code::type_server_not_found: + return "Type server PDB was not found."; case generic_error_code::dia_sdk_not_present: return "LLVM was not compiled with support for DIA. 
This usually means " "that you are not using MSVC, or your Visual Studio " diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp index 21b66b3e7bcf..829879060c33 100644 --- a/lib/DebugInfo/PDB/Native/InfoStream.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp @@ -118,7 +118,7 @@ uint32_t InfoStream::getSignature() const { return Signature; } uint32_t InfoStream::getAge() const { return Age; } -PDB_UniqueId InfoStream::getGuid() const { return Guid; } +GUID InfoStream::getGuid() const { return Guid; } uint32_t InfoStream::getNamedStreamMapByteSize() const { return NamedStreamMapByteSize; diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp index 707128f7efd4..6450ae752f96 100644 --- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp @@ -34,7 +34,7 @@ void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; } void InfoStreamBuilder::setAge(uint32_t A) { Age = A; } -void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; } +void InfoStreamBuilder::setGuid(GUID G) { Guid = G; } void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) { Features.push_back(Sig); diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp index cb0830f453c8..3241000b06db 100644 --- a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp @@ -56,12 +56,12 @@ std::string NativeExeSymbol::getSymbolsFileName() const { return File.getFilePath(); } -PDB_UniqueId NativeExeSymbol::getGuid() const { +codeview::GUID NativeExeSymbol::getGuid() const { auto IS = File.getPDBInfoStream(); if (IS) return IS->getGuid(); consumeError(IS.takeError()); - return PDB_UniqueId{{0}}; + return codeview::GUID{{0}}; } bool NativeExeSymbol::hasCTypes() const { diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp index 92612bcea4ac..df3f418052a9 100644 --- a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp +++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp @@ -323,9 +323,7 @@ PDB_SymType NativeRawSymbol::getSymTag() const { return PDB_SymType::None; } -PDB_UniqueId NativeRawSymbol::getGuid() const { - return PDB_UniqueId{{0}}; -} +codeview::GUID NativeRawSymbol::getGuid() const { return codeview::GUID{{0}}; } int32_t NativeRawSymbol::getOffset() const { return 0; diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp deleted file mode 100644 index 9fd90102f72c..000000000000 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ /dev/null @@ -1,126 +0,0 @@ -//===- PDBTypeServerHandler.cpp ---------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Handles CodeView LF_TYPESERVER2 records by attempting to locate a matching -// PDB file, then loading the PDB file and visiting all types from the -// referenced PDB using the original supplied visitor. -// -// The net effect of this is that when visiting a PDB containing a TypeServer -// record, the TypeServer record is "replaced" with all of the records in -// the referenced PDB file.
If a single instance of PDBTypeServerHandler -// encounters the same TypeServer multiple times (for example reusing one -// PDBTypeServerHandler across multiple visitations of distinct object files or -// PDB files), PDBTypeServerHandler will optionally revisit all the records -// again, or simply consume the record and do nothing. -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" - -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/CodeViewError.h" -#include "llvm/DebugInfo/PDB/GenericError.h" -#include "llvm/DebugInfo/PDB/Native/InfoStream.h" -#include "llvm/DebugInfo/PDB/Native/NativeSession.h" -#include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/TpiStream.h" -#include "llvm/DebugInfo/PDB/PDB.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -static void ignoreErrors(Error EC) { - llvm::handleAllErrors(std::move(EC), [&](ErrorInfoBase &EIB) {}); -} - -PDBTypeServerHandler::PDBTypeServerHandler(bool RevisitAlways) - : RevisitAlways(RevisitAlways) {} - -void PDBTypeServerHandler::addSearchPath(StringRef Path) { - if (Path.empty() || !sys::fs::is_directory(Path)) - return; - - SearchPaths.insert(Path); -} - -Expected<bool> -PDBTypeServerHandler::handleInternal(PDBFile &File, - TypeVisitorCallbacks &Callbacks) { - auto ExpectedTpi = File.getPDBTpiStream(); - if (!ExpectedTpi) - return ExpectedTpi.takeError(); - - // For handling a type server, we should be using whatever the callback array - // was - // that is being used for the original file. We shouldn't allow the visitor - // to - // arbitrarily stick a deserializer in there. - if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks, - VDS_BytesExternal)) - return std::move(EC); - - return true; -} - -Expected<bool> PDBTypeServerHandler::handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) { - if (Session) { - // If we've already handled this TypeServer and we only want to handle each - // TypeServer once, consume the record without doing anything. - if (!RevisitAlways) - return true; - - return handleInternal(Session->getPDBFile(), Callbacks); - } - - StringRef File = sys::path::filename(TS.Name); - if (File.empty()) - return make_error<CodeViewError>( - cv_error_code::corrupt_record, - "TypeServer2Record does not contain filename!"); - - for (auto &Path : SearchPaths) { - SmallString<64> PathStr = Path.getKey(); - sys::path::append(PathStr, File); - if (!sys::fs::exists(PathStr)) - continue; - - std::unique_ptr<IPDBSession> ThisSession; - if (auto EC = loadDataForPDB(PDB_ReaderType::Native, PathStr, ThisSession)) { - // It is not an error if this PDB fails to load, it just means that it - // doesn't match and we should continue searching. - ignoreErrors(std::move(EC)); - continue; - } - - std::unique_ptr<NativeSession> NS( - static_cast<NativeSession *>(ThisSession.release())); - PDBFile &File = NS->getPDBFile(); - auto ExpectedInfo = File.getPDBInfoStream(); - // All PDB Files should have an Info stream. - if (!ExpectedInfo) - return ExpectedInfo.takeError(); - - // Just because a file with a matching name was found and it was an actual - // PDB file doesn't mean it matches. For it to match the InfoStream's GUID - // must match the GUID specified in the TypeServer2 record.
- ArrayRef<uint8_t> GuidBytes(ExpectedInfo->getGuid().Guid); - StringRef GuidStr(reinterpret_cast<const char *>(GuidBytes.begin()), - GuidBytes.size()); - if (GuidStr != TS.Guid) - continue; - - Session = std::move(NS); - return handleInternal(File, Callbacks); - } - - // We couldn't find a matching PDB, so let it be handled by someone else. - return false; -} diff --git a/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/lib/DebugInfo/PDB/Native/TpiHashing.cpp index 91b8d648fcf9..77a2d57a8369 100644 --- a/lib/DebugInfo/PDB/Native/TpiHashing.cpp +++ b/lib/DebugInfo/PDB/Native/TpiHashing.cpp @@ -11,101 +11,79 @@ #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/PDB/Native/Hash.h" -#include "llvm/DebugInfo/PDB/Native/RawError.h" +#include "llvm/Support/JamCRC.h" using namespace llvm; using namespace llvm::codeview; using namespace llvm::pdb; // Corresponds to `fUDTAnon`. -template <typename T> static bool isAnonymous(T &Rec) { - StringRef Name = Rec.getName(); +static bool isAnonymous(StringRef Name) { return Name == "" || Name == "__unnamed" || Name.endswith("::") || Name.endswith("::__unnamed"); } -// Computes a hash for a given TPI record. -template <typename T> -static uint32_t getTpiHash(T &Rec, ArrayRef<uint8_t> FullRecord) { - auto Opts = static_cast<uint16_t>(Rec.getOptions()); - - bool ForwardRef = - Opts & static_cast<uint16_t>(ClassOptions::ForwardReference); - bool Scoped = Opts & static_cast<uint16_t>(ClassOptions::Scoped); - bool UniqueName = Opts & static_cast<uint16_t>(ClassOptions::HasUniqueName); - bool IsAnon = UniqueName && isAnonymous(Rec); +// Computes the hash for a user-defined type record. This could be a struct, +// class, union, or enum. +static uint32_t getHashForUdt(const TagRecord &Rec, + ArrayRef<uint8_t> FullRecord) { + ClassOptions Opts = Rec.getOptions(); + bool ForwardRef = bool(Opts & ClassOptions::ForwardReference); + bool Scoped = bool(Opts & ClassOptions::Scoped); + bool HasUniqueName = bool(Opts & ClassOptions::HasUniqueName); + bool IsAnon = HasUniqueName && isAnonymous(Rec.getName()); if (!ForwardRef && !Scoped && !IsAnon) return hashStringV1(Rec.getName()); - if (!ForwardRef && UniqueName && !IsAnon) + if (!ForwardRef && HasUniqueName && !IsAnon) return hashStringV1(Rec.getUniqueName()); return hashBufferV8(FullRecord); } -template <typename T> static uint32_t getSourceLineHash(T &Rec) { +template <typename T> +static Expected<uint32_t> getHashForUdt(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast<CVType &>(Rec), + Deserialized)) + return std::move(E); + return getHashForUdt(Deserialized, Rec.data()); +} + +template <typename T> +static Expected<uint32_t> getSourceLineHash(const CVType &Rec) { + T Deserialized; + if (auto E = TypeDeserializer::deserializeAs(const_cast<CVType &>(Rec), + Deserialized)) + return std::move(E); char Buf[4]; - support::endian::write32le(Buf, Rec.getUDT().getIndex()); + support::endian::write32le(Buf, Deserialized.getUDT().getIndex()); return hashStringV1(StringRef(Buf, 4)); } -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} +Expected<uint32_t> llvm::pdb::hashTypeRecord(const CVType &Rec) { + switch (Rec.kind()) { + case LF_CLASS: + case LF_STRUCTURE: + case LF_INTERFACE: + return getHashForUdt<ClassRecord>(Rec); + case LF_UNION: + return getHashForUdt<UnionRecord>(Rec); + case LF_ENUM: + return getHashForUdt<EnumRecord>(Rec); -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, - UdtModSourceLineRecord &Rec) { - CVR.Hash = getSourceLineHash(Rec); -} + case LF_UDT_SRC_LINE: + return getSourceLineHash<UdtSourceLineRecord>(Rec); + case LF_UDT_MOD_SRC_LINE: + return getSourceLineHash<UdtModSourceLineRecord>(Rec); -void
TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, ClassRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} + default: + break; + } -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, EnumRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, UnionRecord &Rec) { - CVR.Hash = getTpiHash(Rec, CVR.data()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UdtSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, - UdtModSourceLineRecord &Rec) { - return verifySourceLine(Rec.getUDT()); -} - -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, ClassRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, EnumRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} -Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UnionRecord &Rec) { - if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} - -Error TpiHashVerifier::verifySourceLine(codeview::TypeIndex TI) { - char Buf[4]; - support::endian::write32le(Buf, TI.getIndex()); - uint32_t Hash = hashStringV1(StringRef(Buf, 4)); - if (Hash % NumHashBuckets != HashValues[Index]) - return errorInvalidHash(); - return Error::success(); -} - -Error TpiHashVerifier::visitTypeBegin(CVType &Rec) { - ++Index; - RawRecord = Rec; - return Error::success(); + // Run CRC32 over the bytes. This corresponds to `hashBufv8`. + JamCRC JC(/*Init=*/0U); + ArrayRef<char> Bytes(reinterpret_cast<const char *>(Rec.data().data()), + Rec.data().size()); + JC.update(Bytes); + return JC.getCRC(); } diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index f917ef91f639..d3ef87d9009d 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -14,7 +14,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" -#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h" #include "llvm/DebugInfo/PDB/Native/RawConstants.h" #include "llvm/DebugInfo/PDB/Native/RawError.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp index faf1142ddf17..c291185bc67a 100644 --- a/lib/DebugInfo/PDB/PDBExtras.cpp +++ b/lib/DebugInfo/PDB/PDBExtras.cpp @@ -260,12 +260,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, return OS; } -raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Guid) { - codeview::detail::GuidAdapter A(Guid.Guid); - A.format(OS, ""); - return OS; -} - raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) { switch (Type) { CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS) diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h index adca0eeb08b4..43461de4c491 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h @@ -288,7 +288,6 @@ class RuntimeDyldMachOARM HalfDiffKindBits); addRelocationForSection(R, SectionAID); - addRelocationForSection(R, SectionBID); return ++RelI; }
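Note on the libFuzzer change below: an input is now allowed to replace an
existing corpus element II only when it produces no new global features,
re-triggers every feature for which II is the smallest known witness (II's
sorted UniqFeatureSet), and is strictly smaller. A minimal sketch of that
predicate, with hypothetical names (the real check is written inline in
Fuzzer::RunOne, shown further down):

  // Illustrative only: mirrors the reduction test added in FuzzerLoop.cpp.
  static bool CanReduce(const InputInfo &II, size_t FoundUniqFeaturesOfII,
                        size_t NumNewFeatures, size_t NewSize) {
    return NumNewFeatures == 0 &&                              // nothing globally new
           FoundUniqFeaturesOfII == II.UniqFeatureSet.size() && // full re-cover
           NewSize < II.U.size();                              // strictly smaller
  }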
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt index fa743c280e86..bc744890b997 100644 --- a/lib/Fuzzer/CMakeLists.txt +++ b/lib/Fuzzer/CMakeLists.txt @@ -46,7 +46,6 @@ if ( LLVM_USE_SANITIZE_COVERAGE OR CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux" ) FuzzerShmemPosix.cpp FuzzerShmemWindows.cpp FuzzerTracePC.cpp - FuzzerTraceState.cpp FuzzerUtil.cpp FuzzerUtilDarwin.cpp FuzzerUtilLinux.cpp diff --git a/lib/Fuzzer/FuzzerCorpus.h b/lib/Fuzzer/FuzzerCorpus.h index 218ae5b6ac4d..bae0aea78f13 100644 --- a/lib/Fuzzer/FuzzerCorpus.h +++ b/lib/Fuzzer/FuzzerCorpus.h @@ -34,7 +34,8 @@ struct InputInfo { size_t NumExecutedMutations = 0; size_t NumSuccessfullMutations = 0; bool MayDeleteFile = false; - std::vector<uint32_t> FeatureSet; + bool Reduced = false; + std::vector<uint32_t> UniqFeatureSet; }; class InputCorpus { @@ -79,7 +80,8 @@ class InputCorpus { II.U = U; II.NumFeatures = NumFeatures; II.MayDeleteFile = MayDeleteFile; - II.FeatureSet = FeatureSet; + II.UniqFeatureSet = FeatureSet; + std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end()); ComputeSHA1(U.data(), U.size(), II.Sha1); Hashes.insert(Sha1ToString(II.Sha1)); UpdateCorpusDistribution(); @@ -117,34 +119,21 @@ class InputCorpus { Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size()); PrintUnit(II->U); Printf(" "); - PrintFeatureSet(II->FeatureSet); + PrintFeatureSet(II->UniqFeatureSet); Printf("\n"); } i++; } } - // If FeatureSet is that same as in II, replace II->U with {Data,Size}. - bool TryToReplace(InputInfo *II, const uint8_t *Data, size_t Size, - const std::vector<uint32_t> &FeatureSet) { - if (II->U.size() > Size && II->FeatureSet.size() && - II->FeatureSet == FeatureSet) { - if (FeatureDebug) - Printf("Replace: %zd => %zd\n", II->U.size(), Size); - Replace(II, {Data, Data + Size}); - PrintCorpus(); - return true; - } - return false; - } - void Replace(InputInfo *II, const Unit &U) { - assert(II->U.size()); + assert(II->U.size() > U.size()); Hashes.erase(Sha1ToString(II->Sha1)); DeleteFile(*II); ComputeSHA1(U.data(), U.size(), II->Sha1); Hashes.insert(Sha1ToString(II->Sha1)); II->U = U; + II->Reduced = true; } bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); } @@ -198,7 +187,7 @@ class InputCorpus { Printf("EVICTED %zd\n", Idx); } - void AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { + bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) { assert(NewSize); Idx = Idx % kFeatureSetSize; uint32_t OldSize = GetFeature(Idx); @@ -218,8 +207,9 @@ class InputCorpus { Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize); SmallestElementPerFeature[Idx] = Inputs.size(); InputSizesPerFeature[Idx] = NewSize; - CountingFeatures = true; + return true; } + return false; } size_t NumFeatures() const { return NumAddedFeatures; } @@ -238,7 +228,6 @@ class InputCorpus { size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; } void ValidateFeatureSet() { - if (!CountingFeatures) return; if (FeatureDebug) PrintFeatureSet(); for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++) @@ -256,14 +245,12 @@ class InputCorpus { // Must be called whenever the corpus or unit weights are changed.
void UpdateCorpusDistribution() { size_t N = Inputs.size(); + assert(N); Intervals.resize(N + 1); Weights.resize(N); std::iota(Intervals.begin(), Intervals.end(), 0); - if (CountingFeatures) - for (size_t i = 0; i < N; i++) - Weights[i] = Inputs[i]->NumFeatures * (i + 1); - else - std::iota(Weights.begin(), Weights.end(), 1); + for (size_t i = 0; i < N; i++) + Weights[i] = Inputs[i]->NumFeatures * (i + 1); CorpusDistribution = std::piecewise_constant_distribution<double>( Intervals.begin(), Intervals.end(), Weights.begin()); } @@ -275,7 +262,6 @@ class InputCorpus { std::unordered_set<std::string> Hashes; std::vector<InputInfo*> Inputs; - bool CountingFeatures = false; size_t NumAddedFeatures = 0; size_t NumUpdatedFeatures = 0; uint32_t InputSizesPerFeature[kFeatureSetSize]; diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index 87968893853e..fd8cab38a7bb 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -186,7 +186,11 @@ static void ParseFlags(const std::vector<std::string> &Args) { } Inputs = new std::vector<std::string>; for (size_t A = 1; A < Args.size(); A++) { - if (ParseOneFlag(Args[A].c_str())) continue; + if (ParseOneFlag(Args[A].c_str())) { + if (Flags.ignore_remaining_args) + break; + continue; + } Inputs->push_back(Args[A]); } } @@ -356,16 +360,17 @@ int MinimizeCrashInput(const std::vector<std::string> &Args, exit(1); } std::string InputFilePath = Inputs->at(0); - std::string BaseCmd = - CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path"); - auto InputPos = BaseCmd.find(" " + InputFilePath + " "); + auto BaseCmd = SplitBefore( + "-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "minimize_crash", "exact_artifact_path")); + auto InputPos = BaseCmd.first.find(" " + InputFilePath + " "); assert(InputPos != std::string::npos); - BaseCmd.erase(InputPos, InputFilePath.size() + 1); + BaseCmd.first.erase(InputPos, InputFilePath.size() + 1); if (Flags.runs <= 0 && Flags.max_total_time == 0) { Printf("INFO: you need to specify -runs=N or " "-max_total_time=N with -minimize_crash=1\n" "INFO: defaulting to -max_total_time=600\n"); - BaseCmd += " -max_total_time=600"; + BaseCmd.first += " -max_total_time=600"; } auto LogFilePath = DirPlusFile( @@ -378,7 +383,8 @@ int MinimizeCrashInput(const std::vector<std::string> &Args, Printf("CRASH_MIN: minimizing crash input: '%s' (%zd bytes)\n", CurrentFilePath.c_str(), U.size()); - auto Cmd = BaseCmd + " " + CurrentFilePath + LogFileRedirect; + auto Cmd = BaseCmd.first + " " + CurrentFilePath + LogFileRedirect + " " + + BaseCmd.second; Printf("CRASH_MIN: executing: %s\n", Cmd.c_str()); int ExitCode = ExecuteCommand(Cmd); diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 5e70cbad3cf1..526805705b20 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -121,6 +121,9 @@ FUZZER_FLAG_STRING(exit_on_src_pos, "Exit if a newly found PC originates" FUZZER_FLAG_STRING(exit_on_item, "Exit if an item with a given sha1 sum" " was added to the corpus. " "Used primarily for testing libFuzzer itself.") +FUZZER_FLAG_INT(ignore_remaining_args, 0, "If 1, ignore all arguments passed " +"after this one.
Useful for fuzzers that need to do their own " + "argument parsing.") FUZZER_FLAG_STRING(run_equivalence_server, "Experimental") FUZZER_FLAG_STRING(use_equivalence_server, "Experimental") diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index a732f895375e..3fc3fe004cef 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -38,7 +38,6 @@ class Fuzzer { void Loop(); void MinimizeCrashLoop(const Unit &U); void ShuffleAndMinimize(UnitVector *V); - void InitializeTraceState(); void RereadOutputCorpus(size_t MaxSize); size_t secondsSinceProcessStartUp() { @@ -100,19 +99,10 @@ class Fuzzer { void WriteToOutputCorpus(const Unit &U); void WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix); void PrintStats(const char *Where, const char *End = "\n", size_t Units = 0); - void PrintStatusForNewUnit(const Unit &U); + void PrintStatusForNewUnit(const Unit &U, const char *Text); void ShuffleCorpus(UnitVector *V); void CheckExitOnSrcPosOrItem(); - // Trace-based fuzzing: we run a unit with some kind of tracing - // enabled and record potentially useful mutations. Then - // We apply these mutations one by one to the unit and run it again. - - // Start tracing; forget all previously proposed mutations. - void StartTraceRecording(); - // Stop tracing. - void StopTraceRecording(); - static void StaticDeathCallback(); void DumpCurrentUnit(const char *Prefix); void DeathCallback(); @@ -142,7 +132,7 @@ class Fuzzer { size_t MaxInputLen = 0; size_t MaxMutationLen = 0; - std::vector FeatureSetTmp; + std::vector UniqFeatureSetTmp; // Need to know our own thread. static thread_local bool IsMyThread; diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index 6816f3af8a6f..8ac7a847aef7 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -114,7 +114,6 @@ Fuzzer::Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD, : CB(CB), Corpus(Corpus), MD(MD), Options(Options) { if (EF->__sanitizer_set_death_callback) EF->__sanitizer_set_death_callback(StaticDeathCallback); - InitializeTraceState(); assert(!F); F = this; TPC.ResetMaps(); @@ -403,22 +402,29 @@ bool Fuzzer::RunOne(const uint8_t *Data, size_t Size, bool MayDeleteFile, ExecuteCallback(Data, Size); - FeatureSetTmp.clear(); + UniqFeatureSetTmp.clear(); + size_t FoundUniqFeaturesOfII = 0; size_t NumUpdatesBefore = Corpus.NumFeatureUpdates(); TPC.CollectFeatures([&](size_t Feature) { - Corpus.AddFeature(Feature, Size, Options.Shrink); - if (Options.ReduceInputs) - FeatureSetTmp.push_back(Feature); + if (Corpus.AddFeature(Feature, Size, Options.Shrink)) + UniqFeatureSetTmp.push_back(Feature); + if (Options.ReduceInputs && II) + if (std::binary_search(II->UniqFeatureSet.begin(), + II->UniqFeatureSet.end(), Feature)) + FoundUniqFeaturesOfII++; }); PrintPulseAndReportSlowInput(Data, Size); size_t NumNewFeatures = Corpus.NumFeatureUpdates() - NumUpdatesBefore; if (NumNewFeatures) { Corpus.AddToCorpus({Data, Data + Size}, NumNewFeatures, MayDeleteFile, - FeatureSetTmp); + UniqFeatureSetTmp); CheckExitOnSrcPosOrItem(); return true; } - if (II && Corpus.TryToReplace(II, Data, Size, FeatureSetTmp)) { + if (II && FoundUniqFeaturesOfII && + FoundUniqFeaturesOfII == II->UniqFeatureSet.size() && + II->U.size() > Size) { + Corpus.Replace(II, {Data, Data + Size}); CheckExitOnSrcPosOrItem(); return true; } @@ -501,10 +507,10 @@ void Fuzzer::WriteUnitToFileWithPrefix(const Unit &U, const char *Prefix) { Printf("Base64: %s\n", Base64(U).c_str()); } -void 
Fuzzer::PrintStatusForNewUnit(const Unit &U) { +void Fuzzer::PrintStatusForNewUnit(const Unit &U, const char *Text) { if (!Options.PrintNEW) return; - PrintStats("NEW ", ""); + PrintStats(Text, ""); if (Options.Verbosity) { Printf(" L: %zd ", U.size()); MD.PrintMutationSequence(); @@ -515,7 +521,8 @@ void Fuzzer::PrintStatusForNewUnit(const Unit &U) { void Fuzzer::ReportNewCoverage(InputInfo *II, const Unit &U) { II->NumSuccessfullMutations++; MD.RecordSuccessfulMutationSequence(); - PrintStatusForNewUnit(U); + PrintStatusForNewUnit(U, II->Reduced ? "REDUCE" : + "NEW "); WriteToOutputCorpus(U); NumberOfNewUnitsAdded++; TPC.PrintNewPCs(); @@ -600,13 +607,10 @@ void Fuzzer::MutateAndTestOne() { assert(NewSize > 0 && "Mutator returned empty unit"); assert(NewSize <= CurrentMaxMutationLen && "Mutator return overisized unit"); Size = NewSize; - if (i == 0) - StartTraceRecording(); II.NumExecutedMutations++; if (RunOne(CurrentUnitData, Size, /*MayDeleteFile=*/true, &II)) ReportNewCoverage(&II, {CurrentUnitData, CurrentUnitData + Size}); - StopTraceRecording(); TryDetectingAMemoryLeak(CurrentUnitData, Size, /*DuringInitialCorpusExecution*/ false); } diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp index 612f4bbb28f2..616c0999aa39 100644 --- a/lib/Fuzzer/FuzzerMerge.cpp +++ b/lib/Fuzzer/FuzzerMerge.cpp @@ -241,7 +241,6 @@ void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) { return true; }); // Show stats. - TotalNumberOfRuns++; if (!(TotalNumberOfRuns & (TotalNumberOfRuns - 1))) PrintStats("pulse "); // Write the post-run marker and the coverage. @@ -286,12 +285,13 @@ void Fuzzer::CrashResistantMerge(const std::vector &Args, // Execute the inner process untill it passes. // Every inner process should execute at least one input. 
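The hunk that follows stops appending injected flags to the end of the cloned command line; instead it splits the line at -ignore_remaining_args=1 so that flags like -merge_control_file land before the user's pass-through arguments. A standalone sketch of that reconstruction, using a copy of the SplitBefore helper added to FuzzerUtil.h further down (its return type, std::pair<std::string, std::string>, is an assumption based on that hunk):

#include <cstdio>
#include <string>
#include <utility>

static std::pair<std::string, std::string> SplitBefore(std::string X,
                                                       std::string S) {
  auto Pos = S.find(X);
  if (Pos == std::string::npos)
    return std::make_pair(S, "");
  return std::make_pair(S.substr(0, Pos), S.substr(Pos)); // tail keeps the marker
}

int main() {
  std::string Cmd = "./fuzzer -runs=10 -ignore_remaining_args=1 --target-flag";
  auto BaseCmd = SplitBefore("-ignore_remaining_args=1", Cmd);
  // Injected flags go between the two halves, as CrashResistantMerge does below.
  std::string Full = BaseCmd.first + "-merge_control_file=MCF " + BaseCmd.second;
  printf("%s\n", Full.c_str());
  return 0;
}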
- std::string BaseCmd = CloneArgsWithoutX(Args, "keep-all-flags"); + auto BaseCmd = SplitBefore("-ignore_remaining_args=1", + CloneArgsWithoutX(Args, "keep-all-flags")); bool Success = false; for (size_t i = 1; i <= AllFiles.size(); i++) { Printf("MERGE-OUTER: attempt %zd\n", i); - auto ExitCode = - ExecuteCommand(BaseCmd + " -merge_control_file=" + CFPath); + auto ExitCode = ExecuteCommand(BaseCmd.first + " -merge_control_file=" + + CFPath + " " + BaseCmd.second); if (!ExitCode) { Printf("MERGE-OUTER: succesfull in %zd attempt(s)\n", i); Success = true; diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp index 53cb9027e455..5998ef9d3193 100644 --- a/lib/Fuzzer/FuzzerMutate.cpp +++ b/lib/Fuzzer/FuzzerMutate.cpp @@ -43,8 +43,6 @@ MutationDispatcher::MutationDispatcher(Random &Rand, {&MutationDispatcher::Mutate_CrossOver, "CrossOver"}, {&MutationDispatcher::Mutate_AddWordFromManualDictionary, "ManualDict"}, - {&MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, - "TempAutoDict"}, {&MutationDispatcher::Mutate_AddWordFromPersistentAutoDictionary, "PersAutoDict"}, }); @@ -165,11 +163,6 @@ size_t MutationDispatcher::Mutate_AddWordFromManualDictionary(uint8_t *Data, return AddWordFromDictionary(ManualDictionary, Data, Size, MaxSize); } -size_t MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary( - uint8_t *Data, size_t Size, size_t MaxSize) { - return AddWordFromDictionary(TempAutoDictionary, Data, Size, MaxSize); -} - size_t MutationDispatcher::ApplyDictionaryEntry(uint8_t *Data, size_t Size, size_t MaxSize, DictionaryEntry &DE) { @@ -251,7 +244,7 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( uint8_t *Data, size_t Size, size_t MaxSize) { Word W; DictionaryEntry DE; - switch (Rand(3)) { + switch (Rand(4)) { case 0: { auto X = TPC.TORC8.Get(Rand.Rand()); DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); @@ -267,6 +260,10 @@ size_t MutationDispatcher::Mutate_AddWordFromTORC( auto X = TPC.TORCW.Get(Rand.Rand()); DE = MakeDictionaryEntryFromCMP(X.A, X.B, Data, Size); } break; + case 3: if (Options.UseMemmem) { + auto X = TPC.MMT.Get(Rand.Rand()); + DE = DictionaryEntry(X); + } break; default: assert(0); } @@ -533,14 +530,4 @@ void MutationDispatcher::AddWordToManualDictionary(const Word &W) { {W, std::numeric_limits::max()}); } -void MutationDispatcher::AddWordToAutoDictionary(DictionaryEntry DE) { - static const size_t kMaxAutoDictSize = 1 << 14; - if (TempAutoDictionary.size() >= kMaxAutoDictSize) return; - TempAutoDictionary.push_back(DE); -} - -void MutationDispatcher::ClearAutoDictionary() { - TempAutoDictionary.clear(); -} - } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerMutate.h b/lib/Fuzzer/FuzzerMutate.h index 8c8fb3fd74c7..84b04c0dbf3e 100644 --- a/lib/Fuzzer/FuzzerMutate.h +++ b/lib/Fuzzer/FuzzerMutate.h @@ -52,10 +52,6 @@ class MutationDispatcher { size_t Mutate_AddWordFromManualDictionary(uint8_t *Data, size_t Size, size_t MaxSize); - /// Mutates data by adding a word from the temporary automatic dictionary. - size_t Mutate_AddWordFromTemporaryAutoDictionary(uint8_t *Data, size_t Size, - size_t MaxSize); - /// Mutates data by adding a word from the TORC. 
size_t Mutate_AddWordFromTORC(uint8_t *Data, size_t Size, size_t MaxSize); @@ -84,8 +80,6 @@ class MutationDispatcher { void AddWordToManualDictionary(const Word &W); - void AddWordToAutoDictionary(DictionaryEntry DE); - void ClearAutoDictionary(); void PrintRecommendedDictionary(); void SetCorpus(const InputCorpus *Corpus) { this->Corpus = Corpus; } diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp index 6f5c7be41062..ced0a2133340 100644 --- a/lib/Fuzzer/FuzzerTracePC.cpp +++ b/lib/Fuzzer/FuzzerTracePC.cpp @@ -37,6 +37,8 @@ namespace fuzzer { TracePC TPC; +int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; + uint8_t *TracePC::Counters() const { return __sancov_trace_pc_guard_8bit_counters; } @@ -293,6 +295,20 @@ void TracePC::HandleCmp(uintptr_t PC, T Arg1, T Arg2) { ValueProfileMap.AddValue(Idx); } +static size_t InternalStrnlen(const char *S, size_t MaxLen) { + size_t Len = 0; + for (; Len < MaxLen && S[Len]; Len++) {} + return Len; +} + +// Finds min of (strlen(S1), strlen(S2)). +// Needed because one of these strings may actually be non-zero terminated. +static size_t InternalStrnlen2(const char *S1, const char *S2) { + size_t Len = 0; + for (; S1[Len] && S2[Len]; Len++) {} + return Len; +} + } // namespace fuzzer extern "C" { @@ -415,4 +431,71 @@ void __sanitizer_cov_trace_gep(uintptr_t Idx) { uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0)); fuzzer::TPC.HandleCmp(PC, Idx, (uintptr_t)0); } + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, + const void *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + if (n <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t Len1 = fuzzer::InternalStrnlen(s1, n); + size_t Len2 = fuzzer::InternalStrnlen(s2, n); + n = std::min(n, Len1); + n = std::min(n, Len2); + if (n <= 1) return; // Not interesting. + fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + if (result == 0) return; // No reason to mutate. + size_t N = fuzzer::InternalStrnlen2(s1, s2); + if (N <= 1) return; // Not interesting.
+ fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, + const char *s2, size_t n, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, + const char *s2, int result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, + const char *s2, char *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), strlen(s2)); +} + +ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY +void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1, + const void *s2, size_t len2, void *result) { + if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; + fuzzer::TPC.MMT.Add(reinterpret_cast(s2), len2); +} } // extern "C" diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h index 5ec8c590b4df..b36c4f54306c 100644 --- a/lib/Fuzzer/FuzzerTracePC.h +++ b/lib/Fuzzer/FuzzerTracePC.h @@ -45,6 +45,28 @@ struct TableOfRecentCompares { Pair Table[kSize]; }; +template +struct MemMemTable { + static const size_t kSize = kSizeT; + Word MemMemWords[kSize]; + Word EmptyWord; + + void Add(const uint8_t *Data, size_t Size) { + if (Size <= 2) return; + Size = std::min(Size, Word::GetMaxSize()); + size_t Idx = SimpleFastHash(Data, Size) % kSize; + MemMemWords[Idx].Set(Data, Size); + } + const Word &Get(size_t Idx) { + for (size_t i = 0; i < kSize; i++) { + const Word &W = MemMemWords[(Idx + i) % kSize]; + if (W.size()) return W; + } + EmptyWord.Set(nullptr, 0); + return EmptyWord; + } +}; + class TracePC { public: static const size_t kNumPCs = 1 << 21; @@ -81,6 +103,7 @@ class TracePC { TableOfRecentCompares TORC4; TableOfRecentCompares TORC8; TableOfRecentCompares TORCW; + MemMemTable<1024> MMT; void PrintNewPCs(); void InitializePrintNewPCs(); diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp deleted file mode 100644 index 8670e2ad6727..000000000000 --- a/lib/Fuzzer/FuzzerTraceState.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===- FuzzerTraceState.cpp - Trace-based fuzzer mutator ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Data tracing. -//===----------------------------------------------------------------------===// - -#include "FuzzerDictionary.h" -#include "FuzzerIO.h" -#include "FuzzerInternal.h" -#include "FuzzerMutate.h" -#include "FuzzerTracePC.h" -#include -#include -#include -#include -#include - -namespace fuzzer { - -// Declared as static globals for faster checks inside the hooks. 
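The strstr/strcasestr/memmem hooks above now simply drop the needle into TPC.MMT, the fixed-size MemMemTable added to FuzzerTracePC.h, replacing the TraceState recording machinery whose deletion begins here. A simplified standalone sketch of that table (std::string stands in for fuzzer::Word, 8 slots instead of 1024; the hash follows the SimpleFastHash helper added to FuzzerUtil.cpp below):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>

struct MiniMemMemTable {
  static const size_t kSize = 8; // the real table is MemMemTable<1024>
  std::string Words[kSize];

  static size_t Hash(const uint8_t *Data, size_t Size) {
    size_t Res = 0;
    for (size_t i = 0; i < Size; i++)
      Res = Res * 11 + Data[i]; // same scheme as SimpleFastHash
    return Res;
  }
  void Add(const uint8_t *Data, size_t Size) {
    if (Size <= 2) return; // very short needles carry little signal
    Words[Hash(Data, Size) % kSize].assign(
        reinterpret_cast<const char *>(Data), Size);
  }
  const std::string &Get(size_t Idx) const {
    static const std::string Empty;
    for (size_t i = 0; i < kSize; i++) { // probe for any non-empty slot
      const std::string &W = Words[(Idx + i) % kSize];
      if (!W.empty()) return W;
    }
    return Empty;
  }
};

int main() {
  MiniMemMemTable T;
  const char *Needle = "secret"; // e.g. the s2 argument of a strstr call
  T.Add(reinterpret_cast<const uint8_t *>(Needle), 6);
  printf("recovered needle: %s\n", T.Get(3).c_str());
  return 0;
}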
-static bool RecordingMemmem = false; - -int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr; - -class TraceState { -public: - TraceState(MutationDispatcher &MD, const FuzzingOptions &Options, - const Fuzzer *F) - : MD(MD), Options(Options), F(F) {} - - void StartTraceRecording() { - if (!Options.UseMemmem) - return; - RecordingMemmem = true; - InterestingWords.clear(); - MD.ClearAutoDictionary(); - } - - void StopTraceRecording() { - if (!RecordingMemmem) - return; - for (auto &W : InterestingWords) - MD.AddWordToAutoDictionary({W}); - } - - void AddInterestingWord(const uint8_t *Data, size_t Size) { - if (!RecordingMemmem || !F->InFuzzingThread()) return; - if (Size <= 1) return; - Size = std::min(Size, Word::GetMaxSize()); - Word W(Data, Size); - InterestingWords.insert(W); - } - - private: - - // TODO: std::set is too inefficient, need to have a custom DS here. - std::set InterestingWords; - MutationDispatcher &MD; - const FuzzingOptions Options; - const Fuzzer *F; -}; - -static TraceState *TS; - -void Fuzzer::StartTraceRecording() { - if (!TS) return; - TS->StartTraceRecording(); -} - -void Fuzzer::StopTraceRecording() { - if (!TS) return; - TS->StopTraceRecording(); -} - -void Fuzzer::InitializeTraceState() { - if (!Options.UseMemmem) return; - TS = new TraceState(MD, Options, this); -} - -static size_t InternalStrnlen(const char *S, size_t MaxLen) { - size_t Len = 0; - for (; Len < MaxLen && S[Len]; Len++) {} - return Len; -} - -// Finds min of (strlen(S1), strlen(S2)). -// Needed bacause one of these strings may actually be non-zero terminated. -static size_t InternalStrnlen2(const char *S1, const char *S2) { - size_t Len = 0; - for (; S1[Len] && S2[Len]; Len++) {} - return Len; -} - -} // namespace fuzzer - -using fuzzer::TS; - -extern "C" { - -// We may need to avoid defining weak hooks to stay compatible with older clang. -#ifndef LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -# define LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS 1 -#endif - -#if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1, - const void *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - size_t Len1 = fuzzer::InternalStrnlen(s1, n); - size_t Len2 = fuzzer::InternalStrnlen(s2, n); - n = std::min(n, Len1); - n = std::min(n, Len2); - if (n <= 1) return; // Not interesting. - fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/true); -} - - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - if (result == 0) return; // No reason to mutate. - size_t N = fuzzer::InternalStrnlen2(s1, s2); - if (N <= 1) return; // Not interesting. 
- fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1, - const char *s2, size_t n, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1, - const char *s2, int result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1, - const char *s2, char *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), strlen(s2)); -} - -ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY -void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1, - const void *s2, size_t len2, void *result) { - if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return; - TS->AddInterestingWord(reinterpret_cast(s2), len2); -} - -#endif // LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS -} // extern "C" diff --git a/lib/Fuzzer/FuzzerUtil.cpp b/lib/Fuzzer/FuzzerUtil.cpp index f5fd3a85187c..2d95f40e46a1 100644 --- a/lib/Fuzzer/FuzzerUtil.cpp +++ b/lib/Fuzzer/FuzzerUtil.cpp @@ -215,4 +215,11 @@ bool ExecuteCommandAndReadOutput(const std::string &Command, std::string *Out) { return true; } +size_t SimpleFastHash(const uint8_t *Data, size_t Size) { + size_t Res = 0; + for (size_t i = 0; i < Size; i++) + Res = Res * 11 + Data[i]; + return Res; +} + } // namespace fuzzer diff --git a/lib/Fuzzer/FuzzerUtil.h b/lib/Fuzzer/FuzzerUtil.h index f84fd9ef0fce..62d6e61dcf17 100644 --- a/lib/Fuzzer/FuzzerUtil.h +++ b/lib/Fuzzer/FuzzerUtil.h @@ -67,10 +67,20 @@ inline std::string CloneArgsWithoutX(const std::vector &Args, return CloneArgsWithoutX(Args, X, X); } +inline std::pair SplitBefore(std::string X, + std::string S) { + auto Pos = S.find(X); + if (Pos == std::string::npos) + return std::make_pair(S, ""); + return std::make_pair(S.substr(0, Pos), S.substr(Pos)); +} + std::string DisassembleCmd(const std::string &FileName); std::string SearchRegexCmd(const std::string &Regex); +size_t SimpleFastHash(const uint8_t *Data, size_t Size); + } // namespace fuzzer #endif // LLVM_FUZZER_UTIL_H diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp index d0521bdfdd67..15bceb896e17 100644 --- a/lib/Fuzzer/afl/afl_driver.cpp +++ b/lib/Fuzzer/afl/afl_driver.cpp @@ -22,8 +22,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { return 0; } EOF -# Build your target with -fsanitize-coverage=trace-pc using fresh clang. -clang -g -fsanitize-coverage=trace-pc test_fuzzer.cc -c +# Build your target with -fsanitize-coverage=trace-pc-guard using fresh clang. +clang -g -fsanitize-coverage=trace-pc-guard test_fuzzer.cc -c # Build afl-llvm-rt.o.c from the AFL distribution. 
clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c # Build this file, link it with afl-llvm-rt.o.o and the target code. diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index 30566bdc87ae..43aea2b7a186 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -90,6 +90,7 @@ set(Tests EmptyTest EquivalenceATest EquivalenceBTest + FlagsTest FourIndependentBranchesTest FullCoverageSetTest InitializeTest @@ -272,5 +273,5 @@ add_lit_testsuite(check-fuzzer "Running Fuzzer tests" # Don't add dependencies on Windows. The linker step would fail on Windows, # since cmake will use link.exe for linking and won't include compiler-rt libs. if(NOT MSVC) - add_dependencies(check-fuzzer FileCheck sancov not llvm-symbolizer) + add_dependencies(check-fuzzer FileCheck sancov not) endif() diff --git a/lib/Fuzzer/test/FlagsTest.cpp b/lib/Fuzzer/test/FlagsTest.cpp new file mode 100644 index 000000000000..ac64b9d48df5 --- /dev/null +++ b/lib/Fuzzer/test/FlagsTest.cpp @@ -0,0 +1,32 @@ +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. + +// Parse some flags +#include +#include + +static std::vector Flags; + +extern "C" int LLVMFuzzerInitialize(int *Argc, char ***Argv) { + // Parse --flags and anything after -ignore_remaining_args=1 is passed. + int I = 1; + while (I < *Argc) { + std::string S((*Argv)[I++]); + if (S == "-ignore_remaining_args=1") + break; + if (S.substr(0, 2) == "--") + Flags.push_back(S); + } + while (I < *Argc) + Flags.push_back(std::string((*Argv)[I++])); + + return 0; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + fprintf(stderr, "BINGO "); + for (auto Flag : Flags) + fprintf(stderr, "%s ", Flag.c_str()); + fprintf(stderr, "\n"); + exit(0); +} diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp index 1053c28527bf..eba2663029b2 100644 --- a/lib/Fuzzer/test/FuzzerUnittest.cpp +++ b/lib/Fuzzer/test/FuzzerUnittest.cpp @@ -424,35 +424,6 @@ TEST(FuzzerMutate, AddWordFromDictionary2) { TestAddWordFromDictionary(&MutationDispatcher::Mutate, 1 << 15); } -void TestAddWordFromDictionaryWithHint(Mutator M, int NumIter) { - std::unique_ptr t(new ExternalFunctions()); - fuzzer::EF = t.get(); - Random Rand(0); - MutationDispatcher MD(Rand, {}); - uint8_t W[] = {0xAA, 0xBB, 0xCC, 0xDD, 0xFF, 0xEE, 0xEF}; - size_t PosHint = 7777; - MD.AddWordToAutoDictionary({Word(W, sizeof(W)), PosHint}); - int FoundMask = 0; - for (int i = 0; i < NumIter; i++) { - uint8_t T[10000]; - memset(T, 0, sizeof(T)); - size_t NewSize = (MD.*M)(T, 9000, 10000); - if (NewSize >= PosHint + sizeof(W) && - !memcmp(W, T + PosHint, sizeof(W))) - FoundMask = 1; - } - EXPECT_EQ(FoundMask, 1); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint1) { - TestAddWordFromDictionaryWithHint( - &MutationDispatcher::Mutate_AddWordFromTemporaryAutoDictionary, 1 << 5); -} - -TEST(FuzzerMutate, AddWordFromDictionaryWithHint2) { - TestAddWordFromDictionaryWithHint(&MutationDispatcher::Mutate, 1 << 10); -} - void TestChangeASCIIInteger(Mutator M, int NumIter) { std::unique_ptr t(new ExternalFunctions()); fuzzer::EF = t.get(); @@ -593,7 +564,7 @@ TEST(Corpus, Distribution) { size_t N = 10; size_t TriesPerUnit = 1<<16; for (size_t i = 0; i < N; i++) - C->AddToCorpus(Unit{ static_cast(i) }, 0, false, {}); + C->AddToCorpus(Unit{ static_cast(i) }, 1, false, {}); std::vector Hist(N); for (size_t i = 0; i < N * TriesPerUnit; i++) { diff --git a/lib/Fuzzer/test/fuzzer-flags.test 
b/lib/Fuzzer/test/fuzzer-flags.test index 76ea27705750..976da2906d7c 100644 --- a/lib/Fuzzer/test/fuzzer-flags.test +++ b/lib/Fuzzer/test/fuzzer-flags.test @@ -1,10 +1,21 @@ -RUN: LLVMFuzzer-SimpleTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR +# Does not work on windows for unknown reason. +UNSUPPORTED: windows + +RUN: LLVMFuzzer-FlagsTest -foo_bar=1 2>&1 | FileCheck %s --check-prefix=FOO_BAR FOO_BAR: WARNING: unrecognized flag '-foo_bar=1'; use -help=1 to list all flags FOO_BAR: BINGO -RUN: LLVMFuzzer-SimpleTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH +RUN: LLVMFuzzer-FlagsTest -runs=10 --max_len=100 2>&1 | FileCheck %s --check-prefix=DASH_DASH DASH_DASH: WARNING: did you mean '-max_len=100' (single dash)? DASH_DASH: INFO: A corpus is not provided, starting from an empty corpus -RUN: LLVMFuzzer-SimpleTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL +RUN: LLVMFuzzer-FlagsTest -help=1 2>&1 | FileCheck %s --check-prefix=NO_INTERNAL NO_INTERNAL-NOT: internal flag + +RUN: LLVMFuzzer-FlagsTest --foo-bar -runs=10 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU +PASSTHRU: BINGO --foo-bar --baz -help=1 test + +RUN: mkdir -p %t/T0 %t/T1 +RUN: touch %t/T1/empty +RUN: LLVMFuzzer-FlagsTest --foo-bar -merge=1 %t/T0 %t/T1 -ignore_remaining_args=1 --baz -help=1 test 2>&1 | FileCheck %s --check-prefix=PASSTHRU-MERGE +PASSTHRU-MERGE: BINGO --foo-bar --baz -help=1 test diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test index f93a8b7199e2..77ca4b47bd01 100644 --- a/lib/Fuzzer/test/fuzzer-traces-hooks.test +++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test @@ -10,7 +10,7 @@ RUN: not LLVMFuzzer-StrstrTest -seed=1 -runs=2000000 2>&1 | File RUN: not LLVMFuzzer-Memcmp64BytesTest -seed=1 -runs=1000000 2>&1 | FileCheck %s -RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT +RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 -max_len=20 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT RECOMMENDED_DICT:###### Recommended dictionary. 
###### RECOMMENDED_DICT-DAG: "foo" RECOMMENDED_DICT-DAG: "bar" diff --git a/lib/Fuzzer/test/reduce_inputs.test b/lib/Fuzzer/test/reduce_inputs.test index a4a5c57123d3..5ce4440788f4 100644 --- a/lib/Fuzzer/test/reduce_inputs.test +++ b/lib/Fuzzer/test/reduce_inputs.test @@ -9,5 +9,6 @@ CHECK: INFO: found item with checksum '0eb8e4ed029b774d80f2b66408203801cb982a60' RUN: LLVMFuzzer-ShrinkControlFlowSimpleTest -runs=0 %t/C 2>&1 | FileCheck %s --check-prefix=COUNT COUNT: READ units: 3 - +# a bit longer test +RUN: LLVMFuzzer-ShrinkControlFlowTest -exit_on_item=0eb8e4ed029b774d80f2b66408203801cb982a60 -seed=1 -reduce_inputs=1 -runs=1000000 2>&1 | FileCheck %s diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 80371780fb6d..170bc544d53f 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -365,7 +365,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break; case CallingConv::PTX_Device: Out << "ptx_device"; break; case CallingConv::X86_64_SysV: Out << "x86_64_sysvcc"; break; - case CallingConv::X86_64_Win64: Out << "x86_64_win64cc"; break; + case CallingConv::Win64: Out << "win64cc"; break; case CallingConv::SPIR_FUNC: Out << "spir_func"; break; case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break; case CallingConv::Swift: Out << "swiftcc"; break; @@ -1964,6 +1964,7 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N, Printer.printString("name", N->getName()); Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false); Printer.printMetadata("entity", N->getRawEntity()); + Printer.printMetadata("file", N->getRawFile()); Printer.printInt("line", N->getLine()); Out << ")"; } diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index e31779c83e3a..f56fe7089807 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -44,8 +44,8 @@ bool Constant::isNegativeZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isNegZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -70,8 +70,8 @@ bool Constant::isZeroValue() const { // Equivalent for a vector of -0.0's. if (const ConstantDataVector *CV = dyn_cast(this)) - if (ConstantFP *SplatCFP = dyn_cast_or_null(CV->getSplatValue())) - if (SplatCFP && SplatCFP->isZero()) + if (CV->getElementType()->isFloatingPointTy() && CV->isSplat()) + if (CV->getElementAsAPFloat(0).isZero()) return true; if (const ConstantVector *CV = dyn_cast(this)) @@ -113,9 +113,13 @@ bool Constant::isAllOnesValue() const { return Splat->isAllOnesValue(); // Check for constant vectors which are splats of -1 values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isAllOnesValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isAllOnesValue(); + return CV->getElementAsAPInt(0).isAllOnesValue(); + } + } return false; } @@ -135,9 +139,13 @@ bool Constant::isOneValue() const { return Splat->isOneValue(); // Check for constant vectors which are splats of 1 values. 
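These Constant::is*Value hunks stop materializing a splat Constant just to test it: they ask isSplat() and then read element 0 directly via getElementAsAPInt or getElementAsAPFloat. The isSplat() definition further down compares each element's raw bytes against element 0's. A standalone sketch of that byte-wise splat test (plain arrays stand in for ConstantDataVector storage):

#include <cstdint>
#include <cstdio>
#include <cstring>

static bool IsSplat(const char *Base, unsigned NumElements, unsigned EltSize) {
  // Compare elements 1+ to the 0'th element, as ConstantDataVector::isSplat does.
  for (unsigned i = 1; i != NumElements; ++i)
    if (memcmp(Base, Base + i * EltSize, EltSize))
      return false;
  return true;
}

int main() {
  uint32_t Splat[4] = {7, 7, 7, 7};
  uint32_t Mixed[4] = {7, 7, 8, 7};
  printf("splat: %d, mixed: %d\n",
         IsSplat(reinterpret_cast<const char *>(Splat), 4, sizeof(uint32_t)),
         IsSplat(reinterpret_cast<const char *>(Mixed), 4, sizeof(uint32_t)));
  return 0;
}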
- if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isOneValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isOneValue(); + return CV->getElementAsAPInt(0).isOneValue(); + } + } return false; } @@ -157,9 +165,13 @@ bool Constant::isMinSignedValue() const { return Splat->isMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return CV->getElementAsAPInt(0).isMinSignedValue(); + } + } return false; } @@ -179,9 +191,13 @@ bool Constant::isNotMinSignedValue() const { return Splat->isNotMinSignedValue(); // Check for constant vectors which are splats of INT_MIN values. - if (const ConstantDataVector *CV = dyn_cast(this)) - if (Constant *Splat = CV->getSplatValue()) - return Splat->isNotMinSignedValue(); + if (const ConstantDataVector *CV = dyn_cast(this)) { + if (CV->isSplat()) { + if (CV->getElementType()->isFloatingPointTy()) + return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue(); + return !CV->getElementAsAPInt(0).isMinSignedValue(); + } + } // It *may* contain INT_MIN, we can't tell. return false; @@ -2565,6 +2581,34 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const { } } +APInt ConstantDataSequential::getElementAsAPInt(unsigned Elt) const { + assert(isa(getElementType()) && + "Accessor can only be used when element is an integer"); + const char *EltPtr = getElementPointer(Elt); + + // The data is stored in host byte order, make sure to cast back to the right + // type to load with the right endianness. + switch (getElementType()->getIntegerBitWidth()) { + default: llvm_unreachable("Invalid bitwidth for CDS"); + case 8: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(8, EltVal); + } + case 16: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(16, EltVal); + } + case 32: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(32, EltVal); + } + case 64: { + auto EltVal = *reinterpret_cast(EltPtr); + return APInt(64, EltVal); + } + } +} + APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const { const char *EltPtr = getElementPointer(Elt); @@ -2623,17 +2667,21 @@ bool ConstantDataSequential::isCString() const { return Str.drop_back().find(0) == StringRef::npos; } -Constant *ConstantDataVector::getSplatValue() const { +bool ConstantDataVector::isSplat() const { const char *Base = getRawDataValues().data(); // Compare elements 1+ to the 0'th element. unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) - return nullptr; + return false; + return true; +} + +Constant *ConstantDataVector::getSplatValue() const { // If they're all the same, return the 0th one as a representative. - return getElementAsConstant(0); + return isSplat() ? 
getElementAsConstant(0) : nullptr; } //===----------------------------------------------------------------------===// diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 2165ae5a9470..aba770457e2f 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -14,7 +14,6 @@ #include "llvm-c/Core.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 7e598b43ac16..bce28ba3b950 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -148,10 +148,13 @@ DICompileUnit *DIBuilder::createCompileUnit( static DIImportedEntity * createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, - Metadata *NS, unsigned Line, StringRef Name, + Metadata *NS, DIFile *File, unsigned Line, StringRef Name, SmallVectorImpl &AllImportedModules) { + if (Line) + assert(File && "Source location has line number but no file"); unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size(); - auto *M = DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), Line, Name); + auto *M = + DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), File, Line, Name); if (EntitiesCount < C.pImpl->DIImportedEntitys.size()) // A new Imported Entity was just added to the context. // Add it to the Imported Modules list. @@ -160,33 +163,38 @@ createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context, } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, - DINamespace *NS, + DINamespace *NS, DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIImportedEntity *NS, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, NS, Line, StringRef(), AllImportedModules); + Context, NS, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIModule *M, - unsigned Line) { + DIFile *File, unsigned Line) { return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, - Context, M, Line, StringRef(), AllImportedModules); + Context, M, File, Line, StringRef(), + AllImportedModules); } DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context, DINode *Decl, + DIFile *File, unsigned Line, StringRef Name) { // Make sure to use the unique identifier based metadata reference for // types that have one. 
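The DIBuilder hunks above (and the createImportedDeclaration variant completed just below) thread a DIFile through every createImported* overload, and the shared helper now asserts that a non-zero line number comes with a file. A hedged sketch of the new call shape; the setup (builder, scope, namespace, file) is assumed to exist already, and the helper name is hypothetical:

#include "llvm/IR/DIBuilder.h"

// Hypothetical helper: emits debug info for a C++ using-directive.
void emitUsingDirective(llvm::DIBuilder &DIB, llvm::DIScope *Scope,
                        llvm::DINamespace *NS, llvm::DIFile *File,
                        unsigned Line) {
  // Before this patch the call was createImportedModule(Scope, NS, Line);
  // now the file of the using-directive is recorded as well, and Line != 0
  // without a File would trip the new assert in createImportedModule.
  DIB.createImportedModule(Scope, NS, File, Line);
}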
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration, - Context, Decl, Line, Name, AllImportedModules); + Context, Decl, File, Line, Name, + AllImportedModules); } DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory, diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index 0bf68b4c53bb..c14940bad45d 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -760,12 +760,13 @@ DIObjCProperty *DIObjCProperty::getImpl( DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag, Metadata *Scope, Metadata *Entity, - unsigned Line, MDString *Name, - StorageType Storage, + Metadata *File, unsigned Line, + MDString *Name, StorageType Storage, bool ShouldCreate) { assert(isCanonical(Name) && "Expected canonical MDString"); - DEFINE_GETIMPL_LOOKUP(DIImportedEntity, (Tag, Scope, Entity, Line, Name)); - Metadata *Ops[] = {Scope, Entity, Name}; + DEFINE_GETIMPL_LOOKUP(DIImportedEntity, + (Tag, Scope, Entity, File, Line, Name)); + Metadata *Ops[] = {Scope, Entity, Name, File}; DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops); } diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index 9bd0e297f4ef..4d7e3040ecd7 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -61,24 +61,30 @@ bool BasicBlockEdge::isSingleEdge() const { //===----------------------------------------------------------------------===// template class llvm::DomTreeNodeBase; -template class llvm::DominatorTreeBase; +template class llvm::DominatorTreeBase; // DomTreeBase +template class llvm::DominatorTreeBase; // PostDomTreeBase -template void llvm::DomTreeBuilder::Calculate( - DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT, - Function &F); -template void llvm::DomTreeBuilder::Calculate>( - DominatorTreeBase>::NodeRef>::type> &DT, - Function &F); -template bool llvm::DomTreeBuilder::Verify( - const DominatorTreeBase< - typename std::remove_pointer::NodeRef>::type> - &DT); -template bool llvm::DomTreeBuilder::Verify>( - const DominatorTreeBase>::NodeRef>::type> &DT); +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBDomTree &DT, Function &F); +template void +llvm::DomTreeBuilder::Calculate( + DomTreeBuilder::BBPostDomTree &DT, Function &F); + +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::InsertEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To); +template void llvm::DomTreeBuilder::DeleteEdge( + DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To); + +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBDomTree &DT); +template bool llvm::DomTreeBuilder::Verify( + const DomTreeBuilder::BBPostDomTree &DT); bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &) { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index e413a4f34432..bea2c7ae8ff2 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -990,24 +990,26 @@ template <> struct MDNodeKeyImpl { unsigned Tag; Metadata *Scope; Metadata *Entity; + Metadata *File; unsigned Line; MDString *Name; - MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line, - MDString *Name) - : Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), 
Name(Name) {} + MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, Metadata *File, + unsigned Line, MDString *Name) + : Tag(Tag), Scope(Scope), Entity(Entity), File(File), Line(Line), + Name(Name) {} MDNodeKeyImpl(const DIImportedEntity *N) : Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()), - Line(N->getLine()), Name(N->getRawName()) {} + File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()) {} bool isKeyOf(const DIImportedEntity *RHS) const { return Tag == RHS->getTag() && Scope == RHS->getRawScope() && - Entity == RHS->getRawEntity() && Line == RHS->getLine() && - Name == RHS->getRawName(); + Entity == RHS->getRawEntity() && File == RHS->getFile() && + Line == RHS->getLine() && Name == RHS->getRawName(); } unsigned getHashValue() const { - return hash_combine(Tag, Scope, Entity, Line, Name); + return hash_combine(Tag, Scope, Entity, File, Line, Name); } }; diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 29e2f42d3e05..995e1e570340 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -625,21 +625,21 @@ void PMTopLevelManager::schedulePass(Pass *P) { checkAnalysis = false; const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet(); - for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(), - E = RequiredSet.end(); I != E; ++I) { + for (const AnalysisID ID : RequiredSet) { - Pass *AnalysisPass = findAnalysisPass(*I); + Pass *AnalysisPass = findAnalysisPass(ID); if (!AnalysisPass) { - const PassInfo *PI = findAnalysisPassInfo(*I); + const PassInfo *PI = findAnalysisPassInfo(ID); if (!PI) { // Pass P is not in the global PassRegistry dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n"; dbgs() << "Verify if there is a pass dependency cycle." << "\n"; dbgs() << "Required Passes:" << "\n"; - for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(), - E = RequiredSet.end(); I2 != E && I2 != I; ++I2) { - Pass *AnalysisPass2 = findAnalysisPass(*I2); + for (const AnalysisID ID2 : RequiredSet) { + if (ID == ID2) + break; + Pass *AnalysisPass2 = findAnalysisPass(ID2); if (AnalysisPass2) { dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; } else { @@ -1070,17 +1070,15 @@ void PMDataManager::collectRequiredAndUsedAnalyses( void PMDataManager::initializeAnalysisImpl(Pass *P) { AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P); - for (AnalysisUsage::VectorType::const_iterator - I = AnUsage->getRequiredSet().begin(), - E = AnUsage->getRequiredSet().end(); I != E; ++I) { - Pass *Impl = findAnalysisPass(*I, true); + for (const AnalysisID ID : AnUsage->getRequiredSet()) { + Pass *Impl = findAnalysisPass(ID, true); if (!Impl) // This may be analysis pass that is initialized on the fly. // If that is not the case then it will raise an assert when it is used. 
continue; AnalysisResolver *AR = P->getResolver(); assert(AR && "Analysis Resolver is not set"); - AR->addAnalysisImplsPair(*I, Impl); + AR->addAnalysisImplsPair(ID, Impl); } } @@ -1112,21 +1110,19 @@ void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{ TPM->collectLastUses(LUses, P); - for (SmallVectorImpl::iterator I = LUses.begin(), - E = LUses.end(); I != E; ++I) { + for (Pass *P : LUses) { dbgs() << "--" << std::string(Offset*2, ' '); - (*I)->dumpPassStructure(0); + P->dumpPassStructure(0); } } void PMDataManager::dumpPassArguments() const { - for (SmallVectorImpl::const_iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) { - if (PMDataManager *PMD = (*I)->getAsPMDataManager()) + for (Pass *P : PassVector) { + if (PMDataManager *PMD = P->getAsPMDataManager()) PMD->dumpPassArguments(); else if (const PassInfo *PI = - TPM->findAnalysisPassInfo((*I)->getPassID())) + TPM->findAnalysisPassInfo(P->getPassID())) if (!PI->isAnalysisGroup()) dbgs() << " -" << PI->getPassArgument(); } @@ -1255,9 +1251,8 @@ Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) { // Destructor PMDataManager::~PMDataManager() { - for (SmallVectorImpl::iterator I = PassVector.begin(), - E = PassVector.end(); I != E; ++I) - delete *I; + for (Pass *P : PassVector) + delete P; } //===----------------------------------------------------------------------===// @@ -1284,35 +1279,35 @@ bool BBPassManager::runOnFunction(Function &F) { bool Changed = doInitialization(F); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &BB : F) for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { BasicBlockPass *BP = getContainedPass(Index); bool LocalChanged = false; - dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName()); + dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, BB.getName()); dumpRequiredSet(BP); initializeAnalysisImpl(BP); { // If the pass crashes, remember this. - PassManagerPrettyStackEntry X(BP, *I); + PassManagerPrettyStackEntry X(BP, BB); TimeRegion PassTimer(getPassTimer(BP)); - LocalChanged |= BP->runOnBasicBlock(*I); + LocalChanged |= BP->runOnBasicBlock(BB); } Changed |= LocalChanged; if (LocalChanged) dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG, - I->getName()); + BB.getName()); dumpPreservedSet(BP); dumpUsedSet(BP); verifyPreservedAnalysis(BP); removeNotPreservedAnalysis(BP); recordAvailableAnalysis(BP); - removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG); + removeDeadPasses(BP, BB.getName(), ON_BASICBLOCK_MSG); } return doFinalization(F) || Changed; diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index fdc7de6eaa34..c230a50044c7 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -103,7 +103,7 @@ std::unique_ptr Module::createRNG(const Pass* P) const { // store salt metadata from the Module constructor. 
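createRNG, resumed just below, seeds a RandomNumberGenerator with a string salt that folds in the module's file name (and presumably the requesting pass, given the Pass* parameter), so a given pass sees a reproducible stream for a given module. A library-free sketch of the same idea, with std::mt19937 and std::hash standing in for RandomNumberGenerator:

#include <cstdio>
#include <functional>
#include <random>
#include <string>

static std::mt19937 createSaltedRNG(const std::string &PassName,
                                    const std::string &ModuleFileName) {
  std::string Salt = PassName + ModuleFileName; // deterministic per (pass, module)
  return std::mt19937(static_cast<std::mt19937::result_type>(
      std::hash<std::string>{}(Salt)));
}

int main() {
  auto A = createSaltedRNG("inline", "a.ll");
  auto B = createSaltedRNG("inline", "a.ll");
  printf("same salt, same stream: %d\n", A() == B()); // prints 1
  return 0;
}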
Salt += sys::path::filename(getModuleIdentifier()); - return std::unique_ptr{new RandomNumberGenerator(Salt)}; + return std::unique_ptr(new RandomNumberGenerator(Salt)); } /// getNamedValue - Return the first global value in the module with diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp index 4034f9039dda..b052c76d1fed 100644 --- a/lib/Object/ArchiveWriter.cpp +++ b/lib/Object/ArchiveWriter.cpp @@ -318,7 +318,8 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind, continue; if (!(Symflags & object::SymbolRef::SF_Global)) continue; - if (Symflags & object::SymbolRef::SF_Undefined) + if (Symflags & object::SymbolRef::SF_Undefined && + !(Symflags & object::SymbolRef::SF_Indirect)) continue; unsigned NameOffset = NameOS.tell(); diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp index 740bf94d40e0..d1f46fdfa292 100644 --- a/lib/Object/COFFImportFile.cpp +++ b/lib/Object/COFFImportFile.cpp @@ -131,14 +131,14 @@ class ObjectFactory { using u32 = support::ulittle32_t; MachineTypes Machine; BumpPtrAllocator Alloc; - StringRef DLLName; + StringRef ImportName; StringRef Library; std::string ImportDescriptorSymbolName; std::string NullThunkSymbolName; public: ObjectFactory(StringRef S, MachineTypes M) - : Machine(M), DLLName(S), Library(S.drop_back(4)), + : Machine(M), ImportName(S), Library(S.drop_back(4)), ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} @@ -162,14 +162,17 @@ class ObjectFactory { // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, ImportType Type, ImportNameType NameType); + + // Create a weak external file which is described in PE/COFF Aux Format 3. + NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp); }; } // namespace NewArchiveMember ObjectFactory::createImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 7; - static const uint32_t NumberOfRelocations = 3; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 7; + const uint32_t NumberOfRelocations = 3; // COFF Header coff_file_header Header{ @@ -181,7 +184,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation) + // .idata$4 - (DLLName.size() + 1)), + (ImportName.size() + 1)), u32(NumberOfSymbols), u16(0), u16(is32bit(Machine) ? 
IMAGE_FILE_32BIT_MACHINE : 0), @@ -189,7 +192,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}, u32(0), u32(0), @@ -205,7 +208,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}, u32(0), u32(0), - u32(DLLName.size() + 1), + u32(ImportName.size() + 1), u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) + sizeof(coff_import_directory_table_entry) + NumberOfRelocations * sizeof(coff_relocation)), @@ -219,12 +222,12 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$2 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); - static const coff_relocation RelocationTable[NumberOfRelocations] = { + const coff_relocation RelocationTable[NumberOfRelocations] = { {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2), u16(getImgRelRelocation(Machine))}, {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)), @@ -236,9 +239,9 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { // .idata$6 auto S = Buffer.size(); - Buffer.resize(S + DLLName.size() + 1); - memcpy(&Buffer[S], DLLName.data(), DLLName.size()); - Buffer[S + DLLName.size()] = '\0'; + Buffer.resize(S + ImportName.size() + 1); + memcpy(&Buffer[S], ImportName.data(), ImportName.size()); + Buffer[S + ImportName.size()] = '\0'; // Symbol Table coff_symbol16 SymbolTable[NumberOfSymbols] = { @@ -302,13 +305,13 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { - static const uint32_t NumberOfSections = 1; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 1; // COFF Header coff_file_header Header{ @@ -325,7 +328,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'}, u32(0), u32(0), @@ -342,7 +345,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { append(Buffer, SectionTable); // .idata$3 - static const coff_import_directory_table_entry ImportDescriptor{ + const coff_import_directory_table_entry ImportDescriptor{ u32(0), u32(0), u32(0), u32(0), u32(0), }; append(Buffer, ImportDescriptor); @@ -363,12 +366,12 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { writeStringTable(Buffer, {NullImportDescriptorSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef(F, DLLName)}; + return {MemoryBufferRef(F, ImportName)}; } NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { - static const uint32_t NumberOfSections = 2; - static const uint32_t NumberOfSymbols = 1; + const uint32_t NumberOfSections = 2; + const uint32_t NumberOfSymbols = 1; uint32_t VASize = is32bit(Machine) ? 
4 : 8; // COFF Header @@ -388,7 +391,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { append(Buffer, Header); // Section Header Table - static const coff_section SectionTable[NumberOfSections] = { + const coff_section SectionTable[NumberOfSections] = { {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}, u32(0), u32(0), @@ -445,14 +448,14 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { writeStringTable(Buffer, {NullThunkSymbolName}); StringRef F{reinterpret_cast(Buffer.data()), Buffer.size()}; - return {MemoryBufferRef{F, DLLName}}; + return {MemoryBufferRef{F, ImportName}}; } NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, ImportType ImportType, ImportNameType NameType) { - size_t ImpSize = DLLName.size() + Sym.size() + 2; // +2 for NULs + size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); memset(Buf, 0, Size); @@ -471,17 +474,96 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, // Write symbol name and DLL name. memcpy(P, Sym.data(), Sym.size()); P += Sym.size() + 1; - memcpy(P, DLLName.data(), DLLName.size()); + memcpy(P, ImportName.data(), ImportName.size()); - return {MemoryBufferRef(StringRef(Buf, Size), DLLName)}; + return {MemoryBufferRef(StringRef(Buf, Size), ImportName)}; } -std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, +NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym, + StringRef Weak, bool Imp) { + std::vector Buffer; + const uint32_t NumberOfSections = 1; + const uint32_t NumberOfSymbols = 5; + + // COFF Header + coff_file_header Header{ + u16(0), + u16(NumberOfSections), + u32(0), + u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))), + u32(NumberOfSymbols), + u16(0), + u16(0), + }; + append(Buffer, Header); + + // Section Header Table + const coff_section SectionTable[NumberOfSections] = { + {{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'}, + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u32(0), + u16(0), + u16(0), + u32(IMAGE_SCN_LNK_INFO | IMAGE_SCN_LNK_REMOVE)}}; + append(Buffer, SectionTable); + + // Symbol Table + coff_symbol16 SymbolTable[NumberOfSymbols] = { + {{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}}, + u32(0), + u16(0xFFFF), + u16(0), + IMAGE_SYM_CLASS_STATIC, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_EXTERNAL, + 0}, + {{{0, 0, 0, 0, 0, 0, 0, 0}}, + u32(0), + u16(0), + u16(0), + IMAGE_SYM_CLASS_WEAK_EXTERNAL, + 1}, + {{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0}, + }; + SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t); + + //__imp_ String Table + if (Imp) { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 7; + writeStringTable(Buffer, {std::string("__imp_").append(Sym), + std::string("__imp_").append(Weak)}); + } else { + SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 1; + writeStringTable(Buffer, {Sym, Weak}); + } + append(Buffer, SymbolTable); + + // Copied here so we can still use writeStringTable + char *Buf = Alloc.Allocate(Buffer.size()); + memcpy(Buf, Buffer.data(), Buffer.size()); + return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)}; +} + +std::error_code writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, MachineTypes Machine) { std::vector Members; - 
ObjectFactory OF(llvm::sys::path::filename(DLLName), Machine); + ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine); std::vector ImportDescriptor; Members.push_back(OF.createImportDescriptor(ImportDescriptor)); @@ -496,6 +578,12 @@ std::error_code writeImportLibrary(StringRef DLLName, StringRef Path, if (E.Private) continue; + if (E.isWeak()) { + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false)); + Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true)); + continue; + } + ImportType ImportType = IMPORT_CODE; if (E.Data) ImportType = IMPORT_DATA; diff --git a/lib/Object/COFFModuleDefinition.cpp b/lib/Object/COFFModuleDefinition.cpp index 0d69cb6b709c..ed9140d1fe08 100644 --- a/lib/Object/COFFModuleDefinition.cpp +++ b/lib/Object/COFFModuleDefinition.cpp @@ -22,6 +22,7 @@ #include "llvm/Object/COFFImportFile.h" #include "llvm/Object/Error.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm::COFF; @@ -55,8 +56,10 @@ struct Token { StringRef Value; }; -static bool isDecorated(StringRef Sym) { - return Sym.startswith("_") || Sym.startswith("@") || Sym.startswith("?"); +static bool isDecorated(StringRef Sym, bool MingwDef) { + // mingw does not prepend "_". + return (!MingwDef && Sym.startswith("_")) || Sym.startswith("@") || + Sym.startswith("?"); } static Error createError(const Twine &Err) { @@ -83,6 +86,9 @@ class Lexer { } case '=': Buf = Buf.drop_front(); + // GNU dlltool accepts both = and ==. + if (Buf.startswith("=")) + Buf = Buf.drop_front(); return Token(Equal, "="); case ',': Buf = Buf.drop_front(); @@ -120,7 +126,8 @@ class Lexer { class Parser { public: - explicit Parser(StringRef S, MachineTypes M) : Lex(S), Machine(M) {} + explicit Parser(StringRef S, MachineTypes M, bool B) + : Lex(S), Machine(M), MingwDef(B) {} Expected parse() { do { @@ -181,14 +188,17 @@ class Parser { std::string Name; if (Error Err = parseName(&Name, &Info.ImageBase)) return Err; - // Append the appropriate file extension if not already present. - StringRef Ext = IsDll ? ".dll" : ".exe"; - if (!StringRef(Name).endswith_lower(Ext)) - Name += Ext; + + Info.ImportName = Name; // Set the output file, but don't override /out if it was already passed. - if (Info.OutputFile.empty()) + if (Info.OutputFile.empty()) { Info.OutputFile = Name; + // Append the appropriate file extension if not already present. + if (!sys::path::has_extension(Name)) + Info.OutputFile += IsDll ? 
".dll" : ".exe"; + } + return Error::success(); } case KwVersion: @@ -213,9 +223,9 @@ class Parser { } if (Machine == IMAGE_FILE_MACHINE_I386) { - if (!isDecorated(E.Name)) + if (!isDecorated(E.Name, MingwDef)) E.Name = (std::string("_").append(E.Name)); - if (!E.ExtName.empty() && !isDecorated(E.ExtName)) + if (!E.ExtName.empty() && !isDecorated(E.ExtName, MingwDef)) E.ExtName = (std::string("_").append(E.ExtName)); } @@ -308,11 +318,13 @@ class Parser { std::vector Stack; MachineTypes Machine; COFFModuleDefinition Info; + bool MingwDef; }; Expected parseCOFFModuleDefinition(MemoryBufferRef MB, - MachineTypes Machine) { - return Parser(MB.getBuffer(), Machine).parse(); + MachineTypes Machine, + bool MingwDef) { + return Parser(MB.getBuffer(), Machine, MingwDef).parse(); } } // namespace object diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 1e9b0c5b0454..0a2053477caf 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -227,8 +227,11 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const { if (Symb.isExternal() || Symb.isWeakExternal()) Result |= SymbolRef::SF_Global; - if (Symb.isWeakExternal()) + if (Symb.isWeakExternal()) { Result |= SymbolRef::SF_Weak; + // We use indirect to allow the archiver to write weak externs + Result |= SymbolRef::SF_Indirect; + } if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE) Result |= SymbolRef::SF_Absolute; diff --git a/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/lib/ObjectYAML/CodeViewYAMLTypes.cpp index 0b2ea61c5fe0..81046b217862 100644 --- a/lib/ObjectYAML/CodeViewYAMLTypes.cpp +++ b/lib/ObjectYAML/CodeViewYAMLTypes.cpp @@ -141,6 +141,33 @@ template struct MemberRecordImpl : public MemberRecordBase { } // end namespace CodeViewYAML } // end namespace llvm +void ScalarTraits::output(const GUID &G, void *, llvm::raw_ostream &OS) { + OS << G; +} + +StringRef ScalarTraits::input(StringRef Scalar, void *Ctx, GUID &S) { + if (Scalar.size() != 38) + return "GUID strings are 38 characters long"; + if (Scalar[0] != '{' || Scalar[37] != '}') + return "GUID is not enclosed in {}"; + if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' || + Scalar[24] != '-') + return "GUID sections are not properly delineated with dashes"; + + uint8_t *OutBuffer = S.Guid; + for (auto Iter = Scalar.begin(); Iter != Scalar.end();) { + if (*Iter == '-' || *Iter == '{' || *Iter == '}') { + ++Iter; + continue; + } + uint8_t Value = (llvm::hexDigitValue(*Iter++) << 4); + Value |= llvm::hexDigitValue(*Iter++); + *OutBuffer++ = Value; + } + + return ""; +} + void ScalarTraits::output(const TypeIndex &S, void *, raw_ostream &OS) { OS << S.getIndex(); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index bcd365236e46..f3b438e829d6 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -390,27 +390,29 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) { return Name; } +namespace { +struct OptionInfo { + std::string Name; + StringRef HelpText; +}; +} // namespace + static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, - std::vector> &OptionHelp) { + std::vector &OptionHelp) { OS << Title << ":\n"; // Find the maximum option length. unsigned OptionFieldWidth = 0; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - // Skip titles. - if (!OptionHelp[i].second) - continue; - // Limit the amount of padding we are willing to give up for alignment. 
- unsigned Length = OptionHelp[i].first.size(); + unsigned Length = OptionHelp[i].Name.size(); if (Length <= 23) OptionFieldWidth = std::max(OptionFieldWidth, Length); } const unsigned InitialPad = 2; for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - const std::string &Option = OptionHelp[i].first; + const std::string &Option = OptionHelp[i].Name; int Pad = OptionFieldWidth - int(Option.size()); OS.indent(InitialPad) << Option; @@ -419,7 +421,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, OS << "\n"; Pad = OptionFieldWidth + InitialPad; } - OS.indent(Pad + 1) << OptionHelp[i].second << '\n'; + OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n'; } } @@ -458,8 +460,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, // Render help text into a map of group-name to a list of (option, help) // pairs. - using helpmap_ty = - std::map>>; + using helpmap_ty = std::map>; helpmap_ty GroupedOptionHelp; for (unsigned i = 0, e = getNumOptions(); i != e; ++i) { @@ -478,7 +479,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title, if (const char *Text = getOptionHelpText(Id)) { const char *HelpGroup = getOptionHelpGroup(*this, Id); const std::string &OptName = getOptionHelpName(*this, Id); - GroupedOptionHelp[HelpGroup].push_back(std::make_pair(OptName, Text)); + GroupedOptionHelp[HelpGroup].push_back({OptName, Text}); } } diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index fe69151665c6..2fd4f3ea0d45 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -45,22 +45,36 @@ static void *ErrorHandlerUserData = nullptr; static fatal_error_handler_t BadAllocErrorHandler = nullptr; static void *BadAllocErrorHandlerUserData = nullptr; +#if LLVM_ENABLE_THREADS == 1 // Mutexes to synchronize installing error handlers and calling error handlers. // Do not use ManagedStatic, or that may allocate memory while attempting to // report an OOM. +// +// This usage of std::mutex has to be conditionalized behind ifdefs because +// of this script: +// compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh +// That script attempts to statically link the LLVM symbolizer library with the +// STL and hide all of its symbols with 'opt -internalize'. To reduce size, it +// cuts out the threading portions of the hermetic copy of libc++ that it +// builds. We can remove these ifdefs if that script goes away. static std::mutex ErrorHandlerMutex; static std::mutex BadAllocErrorHandlerMutex; +#endif void llvm::install_fatal_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif assert(!ErrorHandler && "Error handler already registered!\n"); ErrorHandler = handler; ErrorHandlerUserData = user_data; } void llvm::remove_fatal_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard Lock(ErrorHandlerMutex); +#endif ErrorHandler = nullptr; ErrorHandlerUserData = nullptr; } @@ -83,7 +97,9 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. 
+#if LLVM_ENABLE_THREADS == 1 std::lock_guard<std::mutex> Lock(ErrorHandlerMutex); +#endif handler = ErrorHandler; handlerData = ErrorHandlerUserData; } @@ -112,14 +128,18 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) { void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler, void *user_data) { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex); +#endif assert(!BadAllocErrorHandler && "Bad alloc error handler already registered!\n"); BadAllocErrorHandler = handler; BadAllocErrorHandlerUserData = user_data; } void llvm::remove_bad_alloc_error_handler() { +#if LLVM_ENABLE_THREADS == 1 std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex); +#endif BadAllocErrorHandler = nullptr; BadAllocErrorHandlerUserData = nullptr; } @@ -130,7 +150,9 @@ void llvm::report_bad_alloc_error(const char *Reason, bool GenCrashDiag) { { // Only acquire the mutex while reading the handler, so as not to invoke a // user-supplied callback under a lock. +#if LLVM_ENABLE_THREADS == 1 std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex); +#endif Handler = BadAllocErrorHandler; HandlerData = BadAllocErrorHandlerUserData; } diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 9f22f89b3c9e..5cf0316d4d71 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -250,6 +250,8 @@ StringRef sys::detail::getHostCPUNameForS390x( Pos += sizeof("machine = ") - 1; unsigned int Id; if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) { + if (Id >= 3906 && HaveVectorSupport) + return "z14"; if (Id >= 2964 && HaveVectorSupport) return "z13"; if (Id >= 2827) @@ -460,8 +462,8 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_X64) #if defined(__GNUC__) || defined(__clang__) +#if defined(__x86_64__) // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. // FIXME: should we save this for Clang? __asm__("movq\t%%rbx, %%rsi\n\t" @@ -470,6 +472,16 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) : "a"(value), "c"(subleaf)); return false; +#elif defined(__i386__) + __asm__("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) + : "a"(value), "c"(subleaf)); + return false; +#else + return true; +#endif #elif defined(_MSC_VER) int registers[4]; __cpuidex(registers, value, subleaf); @@ -481,35 +493,6 @@ static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, #else return true; #endif -#elif defined(__i386__) || defined(_M_IX86) -#if defined(__GNUC__) || defined(__clang__) - __asm__("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value), "c"(subleaf)); - return false; -#elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; -#else - return true; -#endif -#else - return true; -#endif }  // Read control register 0 (XCR0). Used to detect features such as AVX.
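The ErrorHandling.cpp hunks above repeat one idiom at every lock site: the mutex and each std::lock_guard exist only when LLVM_ENABLE_THREADS is 1, so the symbolizer's hermetic libc++ (built without its threading parts) never references them. A minimal self-contained sketch of that idiom, assuming LLVM_ENABLE_THREADS is supplied by the build configuration as in LLVM (the names HandlerMutex and installHandler are invented here for illustration and are not part of this patch):

#include <cassert>
#include <mutex>

#if LLVM_ENABLE_THREADS == 1
// Compiled out entirely in single-threaded builds; even constructing the
// mutex would pull threading symbols into the hermetic symbolizer binary.
static std::mutex HandlerMutex;
#endif

static void (*Handler)(const char *Reason) = nullptr;

void installHandler(void (*NewHandler)(const char *)) {
#if LLVM_ENABLE_THREADS == 1
  // Serialize installation only when threading is compiled in.
  std::lock_guard<std::mutex> Lock(HandlerMutex);
#endif
  assert(!Handler && "Handler already registered!");
  Handler = NewHandler;
}

The same guard wraps the read side in report_fatal_error above, so in both configurations the user-supplied callback is still invoked outside the lock.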
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index e58f856ca244..ea59ba62d7bd 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -13,8 +13,6 @@ #include "llvm/Support/Path.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/BinaryFormat/COFF.h" -#include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Errc.h" #include "llvm/Support/ErrorHandling.h" diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index 13bb6f23bc83..e8ef1d2fd8b9 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -452,6 +452,8 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions, Features.push_back("+ras"); if (Extensions & AArch64::AEK_LSE) Features.push_back("+lse"); + if (Extensions & AArch64::AEK_SVE) + Features.push_back("+sve"); return true; } diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 601084f9eae3..65eda246a7fe 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -60,6 +60,14 @@ Input::Input(StringRef InputContent, void *Ctxt, DocIterator = Strm->begin(); } +Input::Input(MemoryBufferRef Input, void *Ctxt, + SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt) + : IO(Ctxt), Strm(new Stream(Input, SrcMgr, false, &EC)) { + if (DiagHandler) + SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt); + DocIterator = Strm->begin(); +} + Input::~Input() = default; std::error_code Input::error() { return EC; } diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 9480cd46d28f..dd58eccee957 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -326,13 +326,30 @@ raw_ostream &raw_ostream::operator<<(const formatv_object_base &Obj) { } raw_ostream &raw_ostream::operator<<(const FormattedString &FS) { - unsigned Len = FS.Str.size(); - int PadAmount = FS.Width - Len; - if (FS.RightJustify && (PadAmount > 0)) - this->indent(PadAmount); - this->operator<<(FS.Str); - if (!FS.RightJustify && (PadAmount > 0)) + if (FS.Str.size() >= FS.Width || FS.Justify == FormattedString::JustifyNone) { + this->operator<<(FS.Str); + return *this; + } + const size_t Difference = FS.Width - FS.Str.size(); + switch (FS.Justify) { + case FormattedString::JustifyLeft: + this->operator<<(FS.Str); + this->indent(Difference); + break; + case FormattedString::JustifyRight: + this->indent(Difference); + this->operator<<(FS.Str); + break; + case FormattedString::JustifyCenter: { + int PadAmount = Difference / 2; this->indent(PadAmount); + this->operator<<(FS.Str); + this->indent(Difference - PadAmount); + break; + } + default: + llvm_unreachable("Bad Justification"); + } return *this; } diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 37b9690d0434..1dda746a6be1 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -44,6 +44,8 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -66,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void 
initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 53eef79c4df3..436bf1193304 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -50,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -269,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 938779d23690..291bc5ea858e 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on Windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f64], CCBitConvertToType<i64>>, + CCDelegateTo<CC_AArch64_AAPCS> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index ee54550c9900..b72f23b109d9 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -102,6 +102,10 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( case AArch64::LDADDALh: case AArch64::LDADDALs: case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: case AArch64::LDEORALb: case AArch64::LDEORALh: case AArch64::LDEORALs: diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 000000000000..c0e22355a9ff --- /dev/null +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and rewrite loads to prevent unintentional tag collisions.
+//===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of registers"); + +namespace { + +class FalkorMarkStridedAccesses { +public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing).
+ // AU.addPreserved(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis(); + const AArch64Subtarget *ST = + TPC.getTM().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis().getLoopInfo(); + ScalarEvolution &SE = getAnalysis().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction &I : *BB) { + LoadInst *LoadI = dyn_cast(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: + case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case 
AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: + case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + 
case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = 
true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. + TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. + if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. 
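          // Illustrative expansion of the pre/post-increment case (schematic
          // registers, extending the "Rewrite" comment above; not literal
          // output of the pass):
          //   Xb, Xd = LDRXpost Xb, #8   ; Xb is the tied write-back def
          // becomes
          //   Xc = ORRXrs XZR, Xb, #0    ; MOV Xc, Xb (inserted before the load)
          //   Xc, Xd = LDRXpost Xc, #8   ; base and tied def now use the scratch reg
          //   Xb = ORRXrs XZR, Xc, #0    ; MOV Xb, Xc (the BuildMI just below)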
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. + TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 3682b62d2b84..97396057dce0 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5138,6 +5138,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. (void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index e96ee7d29b3e..4907d082eda0 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) @@ -950,7 +954,13 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); + + unsigned GPRSaveSize = AFI->getVarArgsGPRSize(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + if (IsWin64) + Offset -= alignTo(GPRSaveSize, 16); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 04687847c1a3..06005f6b6886 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -239,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. 
That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 60fde5caa339..c6150f9e5d1d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2650,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2668,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; @@ -2824,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2869,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector MemOps; @@ -2881,7 +2889,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2890,7 +2901,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? 
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2899,7 +2914,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -4491,6 +4506,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4562,8 +4592,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4571,7 +4607,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); @@ -7451,6 +7488,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { @@ -10567,9 +10612,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Currently leaving And and Sub to LLSC - if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) - return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? 
AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } @@ -10783,7 +10825,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { - if (Subtarget->isTargetDarwin()) + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb288..3b0e0f1de894 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,19 @@ class AArch64TargetLowering : public TargetLowering { bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bits) if no implicit + // float attribute is set. + + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } + bool isCheapToSpeculateCttz() const override { return true; } @@ -455,6 +468,8 @@ class AArch64TargetLowering : public TargetLowering { unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -541,6 +556,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index de283b70210f..eec41ddbc159 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -451,3 +451,13 @@ def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)> def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp 
b/lib/Target/AArch64/AArch64InstrInfo.cpp index dba3e4bdf82f..c0c6055c358f 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -52,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - MachineMemOperand::MOTargetFlag1; - static cl::opt TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -1715,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all MachineMemOperands for a hint that the load/store is strided. +bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { ArrayRef> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair TargetFlags[] = - {{MOSuppressPair, "aarch64-suppress-pair"}}; + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 0809ede4df2a..1765a0263ea4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. + bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. bool isUnscaledLdSt(unsigned Opc) const; @@ -356,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. 
/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0be14673eb20..0dcf07f98412 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -37,6 +37,8 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4a0a7c36baf8..ffb27834c31c 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -82,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({Op, 1, s1}, Legal); } - for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index fab92e139dd0..9f7dcb3fe1c3 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -167,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a3238cf3b60f..ea6112452736 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -134,7 +134,9 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: PrefFunctionAlignment = 4; break; - case CortexA73: break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } @@ -171,7 +173,8 @@ struct AArch64GISelActualAccessor : public GISelAccessor { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this), GISel() { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index db53946cbc77..5a1f45ee2552 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool HasFullFP16 = false; bool HasSPE = false; bool HasLSLFast = false; + 
bool HasSVE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -251,6 +252,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -304,6 +306,17 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableEarlyIfConversion() const override; std::unique_ptr getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6237b8f3e7b9..ba28c01a2eff 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -138,6 +138,9 @@ static cl::opt EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(-1)); +static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine X(getTheAArch64leTarget()); @@ -158,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -182,7 +187,7 @@ static std::string computeDataLayout(const Triple &TT, if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) - return "e-m:w-i64:64-i128:128-n32:64-S128"; + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -346,8 +351,12 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); @@ -478,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. 
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
-    addPass(createAArch64LoadStoreOptimizationPass());
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    if (EnableLoadStoreOpt)
+      addPass(createAArch64LoadStoreOptimizationPass());
+    if (EnableFalkorHWPFFix)
+      addPass(createFalkorHWPFFixPass());
+  }
 }
 
 void AArch64PassConfig::addPreEmitPass() {
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index fefa7e26b79f..85de02e859e0 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -36,6 +36,8 @@ class AArch64TargetMachine : public LLVMTargetMachine {
   ~AArch64TargetMachine() override;
 
   const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+  // The no-argument getSubtargetImpl, while it exists on some targets, is
+  // deprecated and should not be used.
   const AArch64Subtarget *getSubtargetImpl() const = delete;
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index e841fb894519..a79d51820545 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -86,7 +86,7 @@ class AArch64AsmParser : public MCTargetAsmParser {
   bool parseOperand(OperandVector &Operands, bool isCondCode,
                     bool invertCondCode);
 
-  bool showMatchError(SMLoc Loc, unsigned ErrCode);
+  bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands);
 
   bool parseDirectiveArch(SMLoc L);
   bool parseDirectiveCPU(SMLoc L);
@@ -3257,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
   }
 }
 
-bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS);
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
+                                      OperandVector &Operands) {
   switch (ErrCode) {
   case Match_MissingFeature:
     return Error(Loc,
@@ -3380,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
     return Error(Loc, "expected readable system register");
   case Match_MSR:
     return Error(Loc, "expected writable system register or pstate");
-  case Match_MnemonicFail:
-    return Error(Loc, "unrecognized instruction mnemonic");
+  case Match_MnemonicFail: {
+    std::string Suggestion = AArch64MnemonicSpellCheck(
+        ((AArch64Operand &)*Operands[0]).getToken(),
+        ComputeAvailableFeatures(STI->getFeatureBits()));
+    return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
+  }
   default:
     llvm_unreachable("unexpected error code!");
   }
@@ -3707,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return Error(IDLoc, Msg);
   }
   case Match_MnemonicFail:
-    return showMatchError(IDLoc, MatchResult);
+    return showMatchError(IDLoc, MatchResult, Operands);
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
 
@@ -3726,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
       MatchResult = Match_InvalidSuffix;
 
-    return showMatchError(ErrorLoc, MatchResult);
+    return showMatchError(ErrorLoc, MatchResult, Operands);
   }
   case Match_InvalidMemoryIndexed1:
   case Match_InvalidMemoryIndexed2:
@@ -3784,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
     if (ErrorLoc == SMLoc())
       ErrorLoc = IDLoc;
-    return showMatchError(ErrorLoc, MatchResult);
+
return showMatchError(ErrorLoc, MatchResult, Operands); } } diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 02b12b5e90ca..f7e0a5c7bed3 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp + AArch64FalkorHWPFFix.cpp AArch64FastISel.cpp AArch64A53Fix835769.cpp AArch64FrameLowering.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a7a7daf4b4a5..2bd0cbf9f7c6 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -218,6 +220,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 7862a03e771c..31762b9e4cd5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -27,8 +27,7 @@ namespace { class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: AArch64WinCOFFObjectWriter() - : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { - } + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} ~AArch64WinCOFFObjectWriter() override = default; @@ -36,19 +35,59 @@ class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; - bool recordRelocation(const MCFixup &) const override; + bool recordRelocation(const MCFixup &) const override; }; } // end anonymous namespace -unsigned -AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, - const MCValue &Target, - const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend &MAB) const { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None
+                                      : Target.getSymA()->getKind();
+
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  default: {
+    const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+    report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+  }
+
+  case FK_Data_4:
+    switch (Modifier) {
+    default:
+      return COFF::IMAGE_REL_ARM64_ADDR32;
+    case MCSymbolRefExpr::VK_COFF_IMGREL32:
+      return COFF::IMAGE_REL_ARM64_ADDR32NB;
+    case MCSymbolRefExpr::VK_SECREL:
+      return COFF::IMAGE_REL_ARM64_SECREL;
+    }
+
+  case FK_Data_8:
+    return COFF::IMAGE_REL_ARM64_ADDR64;
+
+  case FK_SecRel_2:
+    return COFF::IMAGE_REL_ARM64_SECTION;
+
+  case FK_SecRel_4:
+    return COFF::IMAGE_REL_ARM64_SECREL;
+
+  case AArch64::fixup_aarch64_add_imm12:
+    return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
+
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L;
+
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21;
+
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    return COFF::IMAGE_REL_ARM64_BRANCH26;
+  }
 }
 
 bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5a799b2d88d0..568682899be5 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -56,7 +56,7 @@ extern char &AMDGPUMachineCFGStructurizerID;
 
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+Pass *createAMDGPUAnnotateKernelFeaturesPass();
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 7235d8fae332..c68e5861ff25 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 
@@ -26,26 +28,27 @@ using namespace llvm;
 
 namespace {
 
-class AMDGPUAnnotateKernelFeatures : public ModulePass {
+class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
+  const TargetMachine *TM = nullptr;
   AMDGPUAS AS;
 
-  static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
-  void addAttrToCallers(Function *Intrin, StringRef AttrName);
-  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+  bool addFeatureAttributes(Function &F);
 
 public:
   static char ID;
 
-  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {}
-  bool runOnModule(Module &M) override;
+  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
+
+  bool doInitialization(CallGraph &CG) override;
+  bool runOnSCC(CallGraphSCC &SCC) override;
 
   StringRef getPassName() const override {
     return "AMDGPU Annotate Kernel Features";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
-    ModulePass::getAnalysisUsage(AU);
+    CallGraphSCCPass::getAnalysisUsage(AU);
   }
 
   static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
@@ -121,16 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
   return false;
 }
 
-// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
-                                                    AMDGPUAS AS) {
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID,
+                                     bool &NonKernelOnly,
+                                     bool &IsQueuePtr) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-item-id-x";
+  case Intrinsic::amdgcn_workgroup_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-group-id-x";
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return "amdgpu-work-item-id-y";
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return "amdgpu-work-item-id-z";
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return "amdgpu-work-group-id-y";
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return "amdgpu-work-group-id-z";
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return "amdgpu-dispatch-ptr";
+  case Intrinsic::amdgcn_dispatch_id:
+    return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+    return "amdgpu-kernarg-segment-ptr";
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::trap:
+  case Intrinsic::debugtrap:
+    IsQueuePtr = true;
+    return "amdgpu-queue-ptr";
+  default:
+    return "";
+  }
+}
+
+static bool handleAttr(Function &Parent, const Function &Callee,
+                       StringRef Name) {
+  if (Callee.hasFnAttribute(Name)) {
+    Parent.addFnAttr(Name);
+    return true;
+  }
+
+  return false;
+}
+
+static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
+                                   bool &NeedQueuePtr) {
+  // X ids unnecessarily propagated to kernels.
+  static const StringRef AttrNames[] = {
+    { "amdgpu-work-item-id-x" },
+    { "amdgpu-work-item-id-y" },
+    { "amdgpu-work-item-id-z" },
+    { "amdgpu-work-group-id-x" },
+    { "amdgpu-work-group-id-y" },
+    { "amdgpu-work-group-id-z" },
+    { "amdgpu-dispatch-ptr" },
+    { "amdgpu-dispatch-id" },
+    { "amdgpu-kernarg-segment-ptr" }
+  };
+
+  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
+    NeedQueuePtr = true;
+
+  for (StringRef AttrName : AttrNames)
+    handleAttr(Parent, Callee, AttrName);
+}
+
+bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  bool HasFlat = ST.hasFlatAddressSpace();
+  bool HasApertureRegs = ST.hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
-  for (const BasicBlock &BB : F) {
-    for (const Instruction &I : BB) {
+  bool Changed = false;
+  bool NeedQueuePtr = false;
+  bool HaveCall = false;
+  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      CallSite CS(&I);
+      if (CS) {
+        Function *Callee = CS.getCalledFunction();
+
+        // TODO: Do something with indirect calls.
+        if (!Callee) {
+          if (!CS.isInlineAsm())
+            HaveCall = true;
+          continue;
+        }
+
+        Intrinsic::ID IID = Callee->getIntrinsicID();
+        if (IID == Intrinsic::not_intrinsic) {
+          HaveCall = true;
+          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
+          Changed = true;
+        } else {
+          bool NonKernelOnly = false;
+          StringRef AttrName = intrinsicToAttrName(IID,
+                                                   NonKernelOnly, NeedQueuePtr);
+          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+            F.addFnAttr(AttrName);
+            Changed = true;
+          }
+        }
+      }
+
+      if (NeedQueuePtr || HasApertureRegs)
+        continue;
+
       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (castRequiresQueuePtr(ASC, AS))
-          return true;
+        if (castRequiresQueuePtr(ASC, AS)) {
+          NeedQueuePtr = true;
+          continue;
+        }
       }
 
       for (const Use &U : I.operands()) {
@@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
         if (!OpC)
           continue;
 
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS))
-          return true;
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+          NeedQueuePtr = true;
+          break;
+        }
       }
     }
   }
 
+  if (NeedQueuePtr) {
+    F.addFnAttr("amdgpu-queue-ptr");
+    Changed = true;
+  }
+
+  // TODO: We could refine this to captured pointers that could possibly be
+  // accessed by flat instructions. For now this is mostly a poor way of
+  // estimating whether there are calls before argument lowering.
+  if (HasFlat && !IsFunc && HaveCall) {
+    F.addFnAttr("amdgpu-flat-scratch");
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
+  Module &M = SCC.getCallGraph().getModule();
+  Triple TT(M.getTargetTriple());
+
+  bool Changed = false;
+  for (CallGraphNode *I : SCC) {
+    Function *F = I->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
+
+    Changed |= addFeatureAttributes(*F);
+  }
+
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    report_fatal_error("TargetMachine is required");
+
+  AS = AMDGPU::getAMDGPUAS(CG.getModule());
+  TM = &TPC->getTM<TargetMachine>();
   return false;
 }
 
-void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
-                                                    StringRef AttrName) {
-  SmallPtrSet<Function *, 4> SeenFuncs;
-
-  for (User *U : Intrin->users()) {
-    // CallInst is the only valid user for an intrinsic.
-    CallInst *CI = cast<CallInst>(U);
-
-    Function *CallingFunction = CI->getParent()->getParent();
-    if (SeenFuncs.insert(CallingFunction).second)
-      CallingFunction->addFnAttr(AttrName);
-  }
-}
-
-bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
-  Module &M,
-  ArrayRef<StringRef[2]> IntrinsicToAttr) {
-  bool Changed = false;
-
-  for (const StringRef *Arr : IntrinsicToAttr) {
-    if (Function *Fn = M.getFunction(Arr[0])) {
-      addAttrToCallers(Fn, Arr[1]);
-      Changed = true;
-    }
-  }
-
-  return Changed;
-}
-
-bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
-  Triple TT(M.getTargetTriple());
-  AS = AMDGPU::getAMDGPUAS(M);
-
-  static const StringRef IntrinsicToAttr[][2] = {
-    // .x omitted
-    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
-    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
-
-    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
-    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
-
-    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
-    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
-
-    // .x omitted
-    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
-    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-  };
-
-  static const StringRef HSAIntrinsicToAttr[][2] = {
-    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
-    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
-    { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" },
-    { "llvm.trap", "amdgpu-queue-ptr" },
-    { "llvm.debugtrap", "amdgpu-queue-ptr" }
-  };
-
-  // TODO: We should not add the attributes if the known compile time workgroup
-  // size is 1 for y/z.
-
-  // TODO: Intrinsics that require queue ptr.
-
-  // We do not need to note the x workitem or workgroup id because they are
-  // always initialized.
-
-  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
-  if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
-    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
-
-    for (Function &F : M) {
-      if (F.hasFnAttribute("amdgpu-queue-ptr"))
-        continue;
-
-      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-      bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>()
-                                        .getSubtarget<AMDGPUSubtarget>(F)
-                                        .hasApertureRegs();
-      if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
-        F.addFnAttr("amdgpu-queue-ptr");
-    }
-  }
-
-  return Changed;
-}
-
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
   return new AMDGPUAnnotateKernelFeatures();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 83ad1a5c6ee3..2247814cfe55 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -268,19 +268,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                             CurrentProgramInfo.ScratchSize,
                             getFunctionCodeSize(MF));
 
-    OutStreamer->emitRawComment(" codeLenInByte = " +
-                                Twine(getFunctionCodeSize(MF)), false);
-    OutStreamer->emitRawComment(
-      " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
-    OutStreamer->emitRawComment(
-      " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);
-    OutStreamer->emitRawComment(
       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
     OutStreamer->emitRawComment(
       " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
-    OutStreamer->emitRawComment(
-      " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
     OutStreamer->emitRawComment(
       " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
       " bytes/workgroup (compile time only)", false);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2553cf4da0fe..258b1737deb3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FNEG);
   setTargetDAGCombine(ISD::FABS);
+  setTargetDAGCombine(ISD::AssertZext);
+  setTargetDAGCombine(ISD::AssertSext);
 }
 
 //===----------------------------------------------------------------------===//
@@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
 /// input values across multiple registers. Each item in the Ins array
-/// represents a single value that will be stored in regsters. Ins[x].VT is
+/// represents a single value that will be stored in registers. Ins[x].VT is
 /// the value type of the value that will be stored in the register, so
 /// whatever SDNode we lower the argument to needs to be this type.
 ///
@@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
   return SDValue(CSrc, 0);
 }
 
+// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
+// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
+// issues.
+SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
+                                                        DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+
+  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
+  //     (vt2 (truncate (assertzext vt0:x, vt1)))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue N1 = N->getOperand(1);
+    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
+    SDLoc SL(N);
+
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.bitsGE(ExtVT)) {
+      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
+      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
+    }
+  }
+
+  return SDValue();
+}
 
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
+  case ISD::AssertZext:
+  case ISD::AssertSext:
+    return performAssertSZExtCombine(N, DCI);
   }
   return SDValue();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a45234e2b39f..d85aada6053a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -76,6 +76,7 @@ class AMDGPUTargetLowering : public TargetLowering {
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
                                        unsigned Opc, SDValue LHS,
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1bc5a52053ec..779617629010 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -277,7 +277,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 
   // Make sure requested values are compatible with values implied by requested
   // minimum/maximum flat work group sizes.
if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 22cede59086a..d4b6a5fe8020 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e3c90f250600..b37c274102bc 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f26e49295e69..966c6fec20c6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) @@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3d71db909e20..4c755be09999 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -70,6 +70,7 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 3af242d9ea66..0aad8f0843d6 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 08a64de38501..7334781916d8 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // No replacement necessary. 
if (ScratchWaveOffsetReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); } @@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // this point it appears we need the setup. This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::NoRegister) { + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + DebugLoc DL; - int64_t StackSize = MF.getFrameInfo().getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); if (StackSize == 0) { BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba570b9ebbb..2356405f0919 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info, - bool NeedSP) { + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); - assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), - Info.getStackPtrOffsetReg())); - } } SDValue SITargetLowering::LowerFormalArguments( @@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); @@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - - // TODO: Could maybe omit SP if only tail calls? - bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); - - if (NeedSP) { - unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); - CCInfo.AllocateReg(StackPtrReg); - Info->setStackPtrOffsetReg(StackPtrReg); - } } return Chains.empty() ? Chain : @@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. 
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
   case ISD::FMINNAN:
   case ISD::FMAXNAN:
+    if (ST->supportsMinMaxDenormModes() &&
+        DAG.isKnownNeverNaN(Op.getOperand(0)) &&
+        DAG.isKnownNeverNaN(Op.getOperand(1)))
+      return true;
+
     return (MaxDepth > 0) &&
-           isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
-           isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+           isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
+           isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
 
   case ISD::ConstantFP: {
     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
@@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
 
   if (!CFP) {
     SDValue N0 = N->getOperand(0);
+    EVT VT = N0.getValueType().getScalarType();
+    auto ST = getSubtarget();
+
+    if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
+         (VT == MVT::f64 && ST->hasFP64Denormals()) ||
+         (VT == MVT::f16 && ST->hasFP16Denormals())) &&
+        DAG.isKnownNeverNaN(N0))
+      return N0;
 
     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
 
     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
-        isCanonicalized(N0, getSubtarget()))
+        isCanonicalized(DAG, N0, ST))
       return N0;
 
     return SDValue();
@@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
   }
   return TargetLowering::getConstraintType(Constraint);
 }
+
+// Figure out which registers should be reserved for stack access. Only after
+// the function is legalized do we know all of the non-spill stack objects or if
+// calls are present.
+void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  if (Info->isEntryFunction()) {
+    // Callable functions have fixed registers used for stack access.
+    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+  }
+
+  // We have to assume the SP is needed in case there are calls in the function
+  // during lowering. Calls are only detected after the function is
+  // lowered. We're about to reserve registers, so don't bother using it if we
+  // aren't really going to use it.
+  bool NeedSP = !Info->isEntryFunction() ||
+    MFI.hasVarSizedObjects() ||
+    MFI.hasCalls();
+
+  if (NeedSP) {
+    unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
+    Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+    assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
+    assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+                               Info->getStackPtrOffsetReg()));
+    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
+  }
+
+  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
+  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
+  MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+                     Info->getScratchWaveOffsetReg());
+
+  TargetLoweringBase::finalizeLowering(MF);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 83392a7ab1b2..e6bb3d6cd419 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -232,6 +232,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   ConstraintType getConstraintType(StringRef Constraint) const override;
   SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
                    SDValue V) const;
+
+  void finalizeLowering(MachineFunction &MF) const override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 160f8837d49c..a7e0feb10b9f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
 }
 
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
-  SmallVector<MachineInstr *, 128> Worklist;
-  Worklist.push_back(&TopInst);
+  SetVectorType Worklist;
+  Worklist.insert(&TopInst);
 
   while (!Worklist.empty()) {
     MachineInstr &Inst = *Worklist.pop_back_val();
@@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   }
 }
 
-void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
 }
 
 void SIInstrInfo::splitScalar64BitUnaryOp(
-    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+    SetVectorType &Worklist, MachineInstr &Inst,
     unsigned Opcode) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 }
 
 void SIInstrInfo::splitScalar64BitBinaryOp(
-    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+    SetVectorType &Worklist, MachineInstr &Inst,
     unsigned Opcode) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 }
 
 void SIInstrInfo::splitScalar64BitBCNT(
-    SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+    SetVectorType &Worklist, MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
@@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                       MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
 
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg,
   MachineRegisterInfo &MRI,
-  SmallVectorImpl<MachineInstr *> &Worklist) const {
+  SetVectorType &Worklist) const {
   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
          E = MRI.use_end(); I != E;) {
     MachineInstr &UseMI = *I->getParent();
     if (!canReadVGPR(UseMI, I.getOperandNo())) {
-      Worklist.push_back(&UseMI);
+      Worklist.insert(&UseMI);
 
       do {
         ++I;
@@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
   }
 }
 
-void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                  MachineRegisterInfo &MRI,
                                  MachineInstr &Inst) const {
   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
 }
 
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
-    MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+    MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
   // This assumes that all the users of SCC are in the same block
   // as the SCC def.
   for (MachineInstr &MI :
@@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
       return;
 
     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
-      Worklist.push_back(&MI);
+      Worklist.insert(&MI);
   }
 }
 
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index d00c0d4a7f4e..3dd5bc89e6c7 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -19,6 +19,7 @@
 #include "AMDGPUInstrInfo.h"
 #include "SIDefines.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
 
 namespace llvm {
 
@@ -38,6 +39,8 @@ class SIInstrInfo final : public AMDGPUInstrInfo {
     EXECZ = 3
   };
 
+  typedef SmallSetVector<MachineInstr *, 32> SetVectorType;
+
   static unsigned getBranchOpcode(BranchPredicate Cond);
   static BranchPredicate getBranchPredicate(unsigned Opcode);
 
@@ -56,30 +59,30 @@ class SIInstrInfo final : public AMDGPUInstrInfo {
 
   void swapOperands(MachineInstr &Inst) const;
 
-  void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+  void lowerScalarAbs(SetVectorType &Worklist,
                       MachineInstr &Inst) const;
 
-  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+  void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
-  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+  void splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                 MachineInstr &Inst, unsigned Opcode) const;
 
-  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+  void splitScalar64BitBCNT(SetVectorType &Worklist,
                             MachineInstr &Inst) const;
 
-  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+  void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
 
-  void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+  void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI, MachineInstr &Inst) const;
 
   void addUsersToMoveToVALUWorklist(
     unsigned Reg, MachineRegisterInfo &MRI,
-    SmallVectorImpl<MachineInstr *> &Worklist) const;
+    SetVectorType &Worklist) const;
 
   void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
-                                    SmallVectorImpl<MachineInstr *> &Worklist) const;
+                                    SetVectorType &Worklist) const;
 
   const TargetRegisterClass *
   getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index ffb01363e131..088173680fa8 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1436,7 +1436,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
 
   field bit IsPacked =
isPackedType.ret; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasOMod = !if(HasOpSel, 0, isFloatType.ret); field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bcc685015cf5..ba69e42d9125 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1060,7 +1060,7 @@ def : Pat < class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3203c38dae34..a7c8166ff6d2 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -23,10 +23,10 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), - FrameOffsetReg(AMDGPU::NoRegister), - StackPtrOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -42,6 +42,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - return; + + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = true; + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (ST.debuggerEmitPrologue()) { // Enable everything. + WorkGroupIDX = true; WorkGroupIDY = true; WorkGroupIDZ = true; + WorkItemIDX = true; WorkItemIDY = true; WorkItemIDZ = true; } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; if (F->hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + if (F->hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; @@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. 
- if (WorkItemIDZ) - WorkItemIDY = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); + bool HasStackObjects = FrameInfo.hasStackObjects(); - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; + + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } } - if (ST.isAmdCodeObjectV2(MF)) { + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. + if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 05aa249584bf..4c7f38a09a48 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. unsigned PSInputAddr; unsigned PSInputEnable; @@ -377,10 +382,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { } void setStackPtrOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); StackPtrOffsetReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. 
  unsigned getStackPtrOffsetReg() const {
    return StackPtrOffsetReg;
  }
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ef6ad4ad0c8f..4a3fbb4593bb 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
+  // We have to assume the SP is needed in case there are calls in the function,
+  // which is detected after the function is lowered. If we aren't really going
+  // to need SP, don't bother reserving it.
   unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+
   if (StackPtrReg != AMDGPU::NoRegister) {
     reserveRegisterTuples(Reserved, StackPtrReg);
     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index fc808011cd88..54ea7805e18d 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
 def VCC_LO : SIReg<"vcc_lo", 106>;
 def VCC_HI : SIReg<"vcc_hi", 107>;
 
+// Pseudo-registers: Used as placeholders during isel and immediately
+// replaced, never seeing the verifier.
+def PRIVATE_RSRC_REG : SIReg<"", 0>;
+def FP_REG : SIReg<"", 0>;
+def SP_REG : SIReg<"", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+
 // VCC for 64-bit instructions
 def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
           DwarfRegAlias<VCC_LO> {
@@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
 def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
    TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
-   SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
+   SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT,
+   FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
   let AllocationPriority = 7;
 }
 
@@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
   let isAllocatable = 0;
 }
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+               (add SGPR_128, TTMP_128)> {
   let AllocationPriority = 10;
 }
 
@@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
 
 defm VSrc : RegImmOperand<"VS", "VSrc">;
 
-def VSrc_128 : RegisterOperand<VReg_128>;
+def VSrc_128 : RegisterOperand<VReg_128> {
+  let DecoderMethod = "DecodeVS_128RegisterClass";
+}
 
 //===----------------------------------------------------------------------===//
 // VSrc_* Operands with an VGPR
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 26515b27bb77..67ad904ca972 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
 }
 
 bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
-
-  if (Reg0 == Reg1) {
-    return true;
+  for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
+    if (*R == Reg1) return true;
   }
-
-  unsigned SubReg0 = TRI->getSubReg(Reg0, 1);
-  if (SubReg0 == 0) {
-    return TRI->getSubRegIndex(Reg1, Reg0) > 0;
-  }
-
-  for (unsigned Idx = 2; SubReg0 > 0; ++Idx) {
-    if (isRegIntersect(Reg1, SubReg0, TRI)) {
-      return true;
-    }
-    SubReg0 = TRI->getSubReg(Reg0, Idx);
-  }
-
   return false;
 }
diff --git
a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 7b9bc71ad4c7..d5acb49b4f39 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo pattern=[]> : class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. -class SI2_VI3Alias : InstAlias < +class SI2_VI3Alias : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index a8ca593f14ed..92ed0706dc01 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,21 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; list ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -92,6 +96,7 @@ class VOP3_Profile : VOPProfile { class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
   let HasModifiers = 0;
+  let HasOMod = 0;
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
   let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
 }
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index f2de1f995726..3becf758aaa3 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -34,6 +34,9 @@ class VOP3_VOP3PInst
 , fma>;
 
+def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile>;
+def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile>;
+
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile, fmul>;
 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile, fmaxnum>;
@@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile
 
 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>;
 
 def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile, mul>;
 def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile, smin>;
@@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile
 def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile, umax>;
 }
 
+def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>;
+
 def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>;
 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>;
 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>;
@@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi op> {
   }
 }
 
+defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;
 defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
 defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
 defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
@@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
 defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
 defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
 defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
+defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>;
 defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
+defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>;
 defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
 defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
 defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index f3482a22d5dc..b636fc9be431 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -148,6 +148,19 @@ class VOPCInstAlias :
   let SubtargetPredicate = AssemblerPredicate;
 }
 
+class getVOPCPat64 : LetDummies {
+  list ret = !if(P.HasModifiers,
+    [(set i1:$sdst,
+        (setcc (P.Src0VT
+                   !if(P.HasOMod,
+                       (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+                       (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
+               (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+               cond))],
+    [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
+}
+
+
 multiclass VOPC_Pseudos ,
+  def _e64 : VOP3_Pseudo.ret>,
     Commutable_REV {
     let Defs = !if(DefExec, [EXEC], []);
     let SchedRW = P.Schedule;
@@ -634,7 +640,7 @@ class FCMP_Pattern : Pat <
   (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
                     (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
   (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
-        DSTCLAMP.NONE, DSTOMOD.NONE)
+        DSTCLAMP.NONE)
 >;
 
 def : FCMP_Pattern ;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 77b7952b22a8..b47538ba0349 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -136,6 +136,8 @@ class VOP3_Real :
   let TSFlags = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses = ps.Uses;
+
+  VOPProfile Pfl = ps.Pfl;
 }
 
 // XXX - Is there any reason to distinguish this from regular VOP3
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index c40b4450a5b5..e49c1babac21 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -16,145 +16,173 @@
 
 include "llvm/Target/Target.td"
 
-//===----------------------------------------------------------------------===//
-// ARM Helper classes.
-//
-
-class ProcNoItin Features>
-  : Processor;
-
-class Architecture features >
-  : SubtargetFeature;
-
 //===----------------------------------------------------------------------===//
 // ARM Subtarget state.
 //
 
-def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
-                                 "Thumb mode">;
+def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode",
+                                 "true", "Thumb mode">;
+
+def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat",
+                                     "true", "Use software floating "
+                                     "point features.">;
 
-def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
-                                     "Use software floating point features.">;
 
 //===----------------------------------------------------------------------===//
 // ARM Subtarget features.
 //
 
-def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
-                                   "Enable VFP2 instructions">;
-def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
-                                   "Enable VFP3 instructions",
-                                   [FeatureVFP2]>;
-def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
-                                   "Enable NEON instructions",
-                                   [FeatureVFP3]>;
-def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
-                                     "Enable Thumb2 instructions">;
-def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
-                                    "Does not support ARM mode execution",
-                                    [ModeThumb]>;
-def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
-                                   "Enable half-precision floating point">;
-def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
-                                   "Enable VFP4 instructions",
-                                   [FeatureVFP3, FeatureFP16]>;
-def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
-                                      "true", "Enable ARMv8 FP",
-                                      [FeatureVFP4]>;
-def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
-                                       "Enable full half-precision floating point",
-                                       [FeatureFPARMv8]>;
-def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
-                                  "Restrict FP to 16 double registers">;
-def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb",
-                                         "true",
-                                         "Enable divide instructions in Thumb">;
-def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
-                                       "HasHardwareDivideInARM", "true",
-                                       "Enable divide instructions in ARM mode">;
-def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
-                                 "Has data barrier (dmb / dsb) instructions">;
-def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
-                                      "Has v7 clrex instruction">;
+// Floating Point, HW Division and Neon Support
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+                                   "Enable VFP2 instructions">;
+
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+                                   "Enable VFP3 instructions",
+                                   [FeatureVFP2]>;
+
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+                                   "Enable NEON instructions",
+                                   [FeatureVFP3]>;
+
+def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
+                                   "Enable half-precision "
+                                   "floating point">;
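
Aside on the defs being reflowed in this hunk: every SubtargetFeature follows one encoding -- the first argument is the -mattr/-target-feature name, the second names a boolean field that TableGen emits into ARMSubtarget, the third is the value the flag assigns, and the optional trailing list holds implied features. A minimal sketch of how backend C++ consumes such a bit; the accessor spelling is assumed from the "HasVFPv4" field name and is not stated anywhere in this patch:

    // Illustrative only: hasVFP4() is the assumed getter for the HasVFPv4
    // field defined by FeatureVFP4 below; +vfp4 also implies vfp3 and fp16.
    if (Subtarget->hasVFP4()) {
      // VFPv4-only instructions (e.g. fused multiply-accumulate) are legal.
    }
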
+
+def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+                                   "Enable VFP4 instructions",
+                                   [FeatureVFP3, FeatureFP16]>;
+
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+                                      "true", "Enable ARMv8 FP",
+                                      [FeatureVFP4]>;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+                                       "Enable full half-precision "
+                                       "floating point",
+                                       [FeatureFPARMv8]>;
+
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+                                        "Floating point unit supports "
+                                        "single precision only">;
+
+def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
+                                  "Restrict FP to 16 double registers">;
+
+def FeatureHWDivThumb : SubtargetFeature<"hwdiv",
+                                         "HasHardwareDivideInThumb", "true",
+                                         "Enable divide instructions in Thumb">;
+
+def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
+                                       "HasHardwareDivideInARM", "true",
+                                       "Enable divide instructions in ARM mode">;
+
+// Atomic Support
+def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
+                                 "Has data barrier (dmb/dsb) instructions">;
+
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+                                      "Has v7 clrex instruction">;
+
 def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
                                              "HasAcquireRelease", "true",
-                                             "Has v8 acquire/release (lda/ldaex etc) instructions">;
-def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
-                                         "FP compare + branch is slow">;
-def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
-                                        "Floating point unit supports single precision only">;
-def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
-                                      "Enable support for Performance Monitor extensions">;
-def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
-                                        "Enable support for TrustZone security extensions">;
-def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
-                                       "Enable support for ARMv8-M Security Extensions">;
-def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
-                                     "Enable support for Cryptography extensions",
-                                     [FeatureNEON]>;
-def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
-                                  "Enable support for CRC instructions">;
+                                             "Has v8 acquire/release (lda/ldaex "
+                                             " etc) instructions">;
+
+
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+                                         "FP compare + branch is slow">;
+
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+                                      "Enable support for Performance "
+                                      "Monitor extensions">;
+
+
+// TrustZone Security Extensions
+def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
+                                        "Enable support for TrustZone "
+                                        "security extensions">;
+
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+                                       "Enable support for ARMv8-M "
+                                       "Security Extensions">;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+                                     "Enable support for "
+                                     "Cryptography extensions",
+                                     [FeatureNEON]>;
+
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+                                  "Enable support for CRC instructions">;
+
+
 // Not to be confused with FeatureHasRetAddrStack (return address stack)
-def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
-                                  "Enable Reliability, Availability and Serviceability extensions">;
-def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
-                                   "Enable fast computation of positive address offsets">;
-def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
-                                      "CPU fuses AES crypto operations">;
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+                                  "Enable Reliability, Availability "
+                                  "and Serviceability extensions">;
 
-// Cyclone has preferred instructions for zeroing VFP registers, which can
-// execute in 0 cycles.
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions">;
+// Fast computation of non-negative address offsets
+def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
+                                   "Enable fast computation of "
+                                   "positive address offsets">;
 
-// Whether or not it may be profitable to unpredicate certain instructions
-// during if conversion.
+// Fast execution of AES crypto operations
+def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
+                                      "CPU fuses AES crypto operations">;
+
+// Cyclone can zero VFP registers in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+// Whether it is profitable to unpredicate certain instructions during if-conversion
 def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
-                                              "IsProfitableToUnpredicate",
-                                              "true",
+                                              "IsProfitableToUnpredicate", "true",
                                               "Is profitable to unpredicate">;
 
 // Some targets (e.g. Swift) have microcoded VGETLNi32.
-def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
-                                            "HasSlowVGETLNi32", "true",
-                                            "Has slow VGETLNi32 - prefer VMOV">;
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+                                            "HasSlowVGETLNi32", "true",
+                                            "Has slow VGETLNi32 - prefer VMOV">;
 
 // Some targets (e.g. Swift) have microcoded VDUP32.
-def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
-                                         "Has slow VDUP32 - prefer VMOV">;
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32",
+                                         "true",
+                                         "Has slow VDUP32 - prefer VMOV">;
 
 // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
 // for scalar FP, as this allows more effective execution domain optimization.
-def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
-                                           "true", "Prefer VMOVSR">;
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+                                           "true", "Prefer VMOVSR">;
 
 // Swift has ISHST barriers compatible with Atomic Release semantics but weaker
 // than ISH
 def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
-                                               "true", "Prefer ISHST barriers">;
+                                               "true", "Prefer ISHST barriers">;
 
 // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
-def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
-                                         "Has muxed AGU and NEON/FPU">;
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits",
+                                         "true",
+                                         "Has muxed AGU and NEON/FPU">;
 
-// On some targets, a VLDM/VSTM starting with an odd register number needs more
-// microops than single VLDRS.
+// Whether VLDM/VSTM starting with odd register number need more microops
+// than single VLDRS
 def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
-                                              "true", "VLDM/VSTM starting with an odd register is slow">;
+                                              "true", "VLDM/VSTM starting "
+                                              "with an odd register is slow">;
 
 // Some targets have a renaming dependency when loading into D subregisters.
 def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
                                               "SlowLoadDSubregister", "true",
                                               "Loading into D subregs is slow">;
+
 // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
 def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
                                              "DontWidenVMOVS", "true",
                                              "Don't widen VMOVS to VMOVD">;
 
 // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
-def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
-                                        "Expand VFP/NEON MLA/MLS instructions">;
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx",
+                                        "ExpandMLx", "true",
+                                        "Expand VFP/NEON MLA/MLS instructions">;
 
 // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
 def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
@@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
 
 // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
 // VFP to NEON, as an execution domain optimization.
-def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
-                                            "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs",
+                                            "UseNEONForFPMovs", "true",
+                                            "Convert VMOVSR, VMOVRS, "
+                                            "VMOVS to NEON">;
 
 // Some processors benefit from using NEON instructions for scalar
 // single-precision FP operations. This affects instruction selection and should
 // only be enabled if the handling of denormals is not important.
-def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
-                                        "true",
-                                        "Use NEON for single precision FP">;
+def FeatureNEONForFP : SubtargetFeature<"neonfp",
+                                        "UseNEONForSinglePrecisionFP",
+                                        "true",
+                                        "Use NEON for single precision FP">;
 
 // On some processors, VLDn instructions that access unaligned data take one
 // extra cycle. Take that into account when computing operand latencies.
@@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
 // Some processors have a nonpipelined VFP coprocessor.
 def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
                                               "NonpipelinedVFP", "true",
-                                              "VFP instructions are not pipelined">;
+                                              "VFP instructions are not pipelined">;
 
 // Some processors have FP multiply-accumulate instructions that don't
 // play nicely with other VFP / NEON instructions, and it's generally better
 // to just not use them.
-def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
-                                            "Disable VFP / NEON MAC instructions">;
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+                                            "Disable VFP / NEON MAC instructions">;
 
 // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
 def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
-                                             "HasVMLxForwarding", "true",
-                                             "Has multiplier accumulator forwarding">;
+                                             "HasVMLxForwarding", "true",
+                                             "Has multiplier accumulator forwarding">;
 
 // Disable 32-bit to 16-bit narrowing for experimentation.
 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
@@ -213,14 +244,16 @@ def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr",
                                                   "true",
                       "Disable +1 predication cost for instructions updating CPSR">;
 
-def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
-                                            "AvoidMOVsShifterOperand", "true",
-                                            "Avoid movs instructions with shifter operand">;
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+                                            "AvoidMOVsShifterOperand", "true",
+                                            "Avoid movs instructions with "
+                                            "shifter operand">;
 
 // Some processors perform return stack prediction. CodeGen should avoid issuing
 // "normal" call instructions to callees which do not return.
-def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
-                                              "Has return address stack">;
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack",
+                                              "HasRetAddrStack", "true",
+                                              "Has return address stack">;
 
 // Some processors have no branch predictor, which changes the expected cost of
 // taking a branch which affects the choice of whether to use predicated
@@ -230,63 +263,80 @@ def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
                                                    "Has no branch predictor">;
 
 /// DSP extension.
-def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
-                                  "Supports DSP instructions in ARM and/or Thumb2">;
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
+                                  "Supports DSP instructions in "
+                                  "ARM and/or Thumb2">;
 
 // Multiprocessing extension.
-def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
-                                 "Supports Multiprocessing extension">;
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+                                 "Supports Multiprocessing extension">;
 
 // Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
 def FeatureVirtualization : SubtargetFeature<"virtualization",
-                                             "HasVirtualization", "true",
-                                             "Supports Virtualization extension",
-                                             [FeatureHWDivThumb, FeatureHWDivARM]>;
+                                             "HasVirtualization", "true",
+                                             "Supports Virtualization extension",
+                                             [FeatureHWDivThumb, FeatureHWDivARM]>;
 
-// M-series ISA
-def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
-                                     "Is microcontroller profile ('M' series)">;
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+                                       "NaCl trap">;
 
-// R-series ISA
-def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
-                                     "Is realtime profile ('R' series)">;
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+                                          "StrictAlign", "true",
+                                          "Disallow all unaligned memory "
+                                          "access">;
+
+def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
+                                        "Generate calls via indirect call "
+                                        "instructions">;
+
+def FeatureExecuteOnly : SubtargetFeature<"execute-only",
+                                          "GenExecuteOnly", "true",
+                                          "Enable the generation of "
+                                          "execute only code.">;
+
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+                                        "Reserve R9, making it unavailable"
+                                        " as GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+                                     "Don't use movt/movw pairs for "
+                                     "32-bit imms">;
+
+def FeatureNoNegativeImmediates
+    : SubtargetFeature<"no-neg-immediates",
+                       "NegativeImmediates", "false",
+                       "Convert immediates and instructions "
+                       "to their negated or complemented "
+                       "equivalent when the immediate does "
+                       "not fit in the encoding.">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM architecture class
+//
 
 // A-series ISA
 def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
                                      "Is application profile ('A' series)">;
 
-// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
-// See ARMInstrInfo.td for details.
-def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
-                                       "NaCl trap">;
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+                                     "Is realtime profile ('R' series)">;
 
-def FeatureStrictAlign : SubtargetFeature<"strict-align",
-                                          "StrictAlign", "true",
-                                          "Disallow all unaligned memory "
-                                          "access">;
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
+                                     "Is microcontroller profile ('M' series)">;
 
-def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
-                                        "Generate calls via indirect call "
-                                        "instructions">;
-def FeatureExecuteOnly
-  : SubtargetFeature<"execute-only", "GenExecuteOnly", "true",
-                     "Enable the generation of execute only code.">;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+                                     "Enable Thumb2 instructions">;
 
-def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
-                                        "Reserve R9, making it unavailable as "
-                                        "GPR">;
+def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
+                                    "Does not support ARM mode execution",
+                                    [ModeThumb]>;
 
-def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
-                                     "Don't use movt/movw pairs for 32-bit "
-                                     "imms">;
-
-def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
-                                                   "NegativeImmediates", "false",
-                                                   "Convert immediates and instructions "
-                                                   "to their negated or complemented "
-                                                   "equivalent when the immediate does "
-                                                   "not fit in the encoding.">;
 
 //===----------------------------------------------------------------------===//
 // ARM ISAs.
 //
@@ -294,43 +344,57 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
 
 def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
                                  "Support ARM v4T instructions">;
+
 def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true",
                                  "Support ARM v5T instructions",
                                  [HasV4TOps]>;
+
 def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true",
-                                  "Support ARM v5TE, v5TEj, and v5TExp instructions",
+                                  "Support ARM v5TE, v5TEj, and "
+                                  "v5TExp instructions",
                                   [HasV5TOps]>;
+
 def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
                                 "Support ARM v6 instructions",
                                 [HasV5TEOps]>;
+
 def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
                                  "Support ARM v6M instructions",
                                  [HasV6Ops]>;
+
 def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
                                          "Support ARM v8M Baseline instructions",
                                          [HasV6MOps]>;
+
 def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true",
                                  "Support ARM v6k instructions",
                                  [HasV6Ops]>;
+
 def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
                                   "Support ARM v6t2 instructions",
                                   [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
+
 def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
                                 "Support ARM v7 instructions",
                                 [HasV6T2Ops, FeaturePerfMon, FeatureV7Clrex]>;
+
+def HasV8MMainlineOps :
+  SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+                   "Support ARM v8M Mainline instructions",
+                   [HasV7Ops]>;
+
 def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
                                 "Support ARM v8 instructions",
                                 [HasV7Ops, FeatureAcquireRelease]>;
+
 def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
                                    "Support ARM v8.1a instructions",
                                    [HasV8Ops]>;
-def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
                                    "Support ARM v8.2a instructions",
                                    [HasV8_1aOps]>;
-def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
-                                         "Support ARM v8M Mainline instructions",
-                                         [HasV7Ops]>;
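
A note on the HasV*Ops defs just reorganized above: they chain through their implied-feature lists, so enabling one ISA level transitively enables everything below it (v7 implies v6t2, perfmon and v7clrex; v6t2 in turn implies v8m.base, v6k and thumb2; and so on). A self-contained sketch of that transitive resolution, using a plain map as a stand-in for the generated feature tables (illustrative C++ only, not the LLVM implementation):

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using FeatureGraph = std::map<std::string, std::vector<std::string>>;

    // Enable F plus everything it transitively implies, e.g. "v7" also
    // turns on "v6t2", "v6k", "thumb2", "perfmon", "v7clrex", ...
    void enableWithImplied(const FeatureGraph &Implies, const std::string &F,
                           std::set<std::string> &On) {
      if (!On.insert(F).second)
        return; // already enabled
      auto It = Implies.find(F);
      if (It == Implies.end())
        return;
      for (const std::string &Dep : It->second)
        enableWithImplied(Implies, Dep, On);
    }
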
 //===----------------------------------------------------------------------===//
@@ -386,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
 def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
                               "Cortex-M3 ARM processors", []>;
 
+
 //===----------------------------------------------------------------------===//
-// ARM schedules.
+// ARM Helper classes.
 //
-include "ARMSchedule.td"
+class Architecture features>
+  : SubtargetFeature;
+
+class ProcNoItin Features>
+  : Processor;
 
 //===----------------------------------------------------------------------===//
@@ -546,6 +616,12 @@ def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>;
 
 def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
 
+//===----------------------------------------------------------------------===//
+// ARM schedules.
+//===----------------------------------------------------------------------===//
+//
+include "ARMSchedule.td"
+
 //===----------------------------------------------------------------------===//
 // ARM processors
 //
@@ -553,6 +629,9 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
 // Dummy CPU, used to target architectures
 def : ProcessorModel<"generic", CortexA8Model, []>;
 
+// FIXME: Several processors below are not using their own scheduler
+// model, but that of a similar/previous processor. These should be fixed.
+
 def : ProcNoItin<"arm8", [ARMv4]>;
 def : ProcNoItin<"arm810", [ARMv4]>;
 def : ProcNoItin<"strongarm", [ARMv4]>;
@@ -612,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
                                                    FeatureVFP2,
                                                    FeatureHasSlowFPVMLx]>;
 
-// FIXME: A5 has currently the same Schedule model as A8
 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
                                                   FeatureHasRetAddrStack,
                                                   FeatureTrustZone,
@@ -656,7 +734,6 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
                                                   FeatureCheckVLDnAlign,
                                                   FeatureMP]>;
 
-// FIXME: A12 has currently the same Schedule model as A9
 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
                                                    FeatureHasRetAddrStack,
                                                    FeatureTrustZone,
@@ -666,7 +743,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
                                                    FeatureVirtualization,
                                                    FeatureMP]>;
 
-// FIXME: A15 has currently the same Schedule model as A9.
 def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
                                                    FeatureDontWidenVMOVS,
                                                    FeatureHasRetAddrStack,
@@ -678,7 +754,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
                                                    FeatureAvoidPartialCPSR,
                                                    FeatureVirtualization]>;
 
-// FIXME: A17 has currently the same Schedule model as A9
 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
                                                    FeatureHasRetAddrStack,
                                                    FeatureTrustZone,
@@ -688,9 +763,7 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
                                                    FeatureAvoidPartialCPSR,
                                                    FeatureVirtualization]>;
 
-// FIXME: krait has currently the same Schedule model as A9
-// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
-// division features.
+// FIXME: krait has currently the same features as A9 plus VFP4 and HWDiv
 def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
                                               FeatureHasRetAddrStack,
                                               FeatureMuxedUnits,
@@ -720,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
                                            FeatureSlowVGETLNi32,
                                            FeatureSlowVDUP32]>;
 
-// FIXME: R4 has currently the same ProcessorModel as A8.
 def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
                                                   FeatureHasRetAddrStack,
                                                   FeatureAvoidPartialCPSR]>;
 
-// FIXME: R4F has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, @@ -734,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureD16, FeatureAvoidPartialCPSR]>; -// FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, FeatureVFP3, @@ -744,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, FeatureVFP3, @@ -814,14 +883,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO, - FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, @@ -835,7 +904,6 @@ def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureCrypto, FeatureCRC]>; -// Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -881,9 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, //===----------------------------------------------------------------------===// include "ARMRegisterInfo.td" - include "ARMRegisterBanks.td" - include "ARMCallingConv.td" //===----------------------------------------------------------------------===// @@ -891,7 +957,6 @@ include "ARMCallingConv.td" //===----------------------------------------------------------------------===// include "ARMInstrInfo.td" - def ARMInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -912,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant { } def ARM : Target { - // Pull in Instruction Info: + // Pull in Instruction Info. let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; let AssemblyParserVariants = [ARMAsmParserVariant]; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e97a7ce5067f..370c0a7f5c53 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -117,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget(); if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && @@ -163,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // both or otherwise does not want to enable this optimization, the function // should return NULL if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return nullptr; return STI.isTargetDarwin() ? 
                               : CSR_AAPCS_ThisReturn_RegMask;
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 384f80356cc8..bf00ef61c2d1 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -250,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
     return false;
 
   // Look to see if our OptionalDef is defining CPSR or CCR.
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (!MO.isReg() || !MO.isDef()) continue;
     if (MO.getReg() == ARM::CPSR)
       *CPSR = true;
@@ -267,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
       AFI->isThumb2Function())
     return MI->isPredicable();
 
-  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
-    if (MCID.OpInfo[i].isPredicate())
+  for (const MCOperandInfo &opInfo : MCID.operands())
+    if (opInfo.isPredicate())
       return true;
 
   return false;
@@ -1972,7 +1971,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args,
       break;
     }
     case CCValAssign::AExt:
-      // Intentional fall-through.  Handle AExt and ZExt.
+      // Intentional fall-through. Handle AExt and ZExt.
     case CCValAssign::ZExt: {
       MVT DestVT = VA.getLocVT();
       Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
@@ -2001,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args,
       assert(VA.getLocVT() == MVT::f64 &&
              "Custom lowering for v2f64 args not available");
 
+      // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size()
       CCValAssign &NextVA = ArgLocs[++i];
 
       assert(VA.isRegLoc() && NextVA.isRegLoc() &&
@@ -2172,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                     TII.get(RetOpc));
   AddOptionalDefs(MIB);
-  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
-    MIB.addReg(RetRegs[i], RegState::Implicit);
+  for (unsigned R : RetRegs)
+    MIB.addReg(R, RegState::Implicit);
   return true;
 }
 
@@ -2233,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   ArgRegs.reserve(I->getNumOperands());
   ArgVTs.reserve(I->getNumOperands());
   ArgFlags.reserve(I->getNumOperands());
-  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
-    Value *Op = I->getOperand(i);
+  for (Value *Op : I->operands()) {
     unsigned Arg = getRegForValue(Op);
     if (Arg == 0) return false;
 
@@ -2278,8 +2277,8 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   MIB.addExternalSymbol(TLI.getLibcallName(Call));
 
   // Add implicit physical register uses to the call.
-  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i], RegState::Implicit);
+  for (unsigned R : RegArgs)
+    MIB.addReg(R, RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2423,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
     MIB.addExternalSymbol(IntrMemName, 0);
 
   // Add implicit physical register uses to the call.
-  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i], RegState::Implicit);
+  for (unsigned R : RegArgs)
+    MIB.addReg(R, RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
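
The ARMFastISel hunks above and below all apply the same modernization: a manual index loop over an operand or register list becomes a range-based for over the corresponding range accessor (MI->operands(), MCID.operands(), I->operands(), or the container itself). A before/after sketch of the idiom, taken directly from the shape of these hunks:

    // Before: index loop plus an explicit getOperand(i) fetch.
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // ... use MO ...
    }

    // After: the range accessor yields the same operands in the same order,
    // so behavior is unchanged; only the iteration syntax differs.
    for (const MachineOperand &MO : MI->operands()) {
      // ... use MO ...
    }
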
@@ -2932,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
 
   bool Found = false;
   bool isZExt;
-  for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
-       i != e; ++i) {
-    if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
-        (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
-        MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+  for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) {
+    if (FLE.Opc[isThumb2] == MI->getOpcode() &&
+        (uint64_t)FLE.ExpectedImm == Imm &&
+        MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) {
       Found = true;
-      isZExt = FoldableLoadExtends[i].isZExt;
+      isZExt = FLE.isZExt;
     }
   }
   if (!Found) return false;
@@ -3057,9 +3055,8 @@ bool ARMFastISel::fastLowerArguments() {
   };
 
   const TargetRegisterClass *RC = &ARM::rGPRRegClass;
-  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
-       I != E; ++I) {
-    unsigned ArgNo = I->getArgNo();
+  for (const Argument &Arg : F->args()) {
+    unsigned ArgNo = Arg.getArgNo();
     unsigned SrcReg = GPRArgRegs[ArgNo];
     unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
@@ -3069,7 +3066,7 @@ bool ARMFastISel::fastLowerArguments() {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY),
             ResultReg).addReg(DstReg, getKillRegState(true));
-    updateValueMap(&*I, ResultReg);
+    updateValueMap(&Arg, ResultReg);
   }
 
   return true;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 7f9fe55a5c38..f75dd4de3f96 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2682,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
 
     SDNode *ResNode;
     if (Subtarget->isThumb()) {
-      SDValue Pred = getAL(CurDAG, dl);
-      SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
-      SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+      SDValue Ops[] = {
+        CPIdx,
+        getAL(CurDAG, dl),
+        CurDAG->getRegister(0, MVT::i32),
+        CurDAG->getEntryNode()
+      };
       ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
                                        Ops);
     } else {
@@ -2698,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
                                        Ops);
     }
+
+    // Annotate the Node with memory operand information so that MachineInstr
+    // queries work properly. This e.g. gives the register allocation the
+    // required information for rematerialization.
+    MachineFunction& MF = CurDAG->getMachineFunction();
+    MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+    MemOp[0] = MF.getMachineMemOperand(
+        MachinePointerInfo::getConstantPool(MF),
+        MachineMemOperand::MOLoad, 4, 4);
+
+    cast(ResNode)->setMemRefs(MemOp, MemOp+1);
+
     ReplaceNode(N, ResNode);
     return;
   }
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 29ef69ad0010..faed6b867e2b 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -722,6 +722,29 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
       return false;
     break;
   }
+  case G_BRCOND: {
+    if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) {
+      DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
+      return false;
+    }
+
+    // Set the flags.
+    auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri))
+                    .addReg(I.getOperand(0).getReg())
+                    .addImm(1)
+                    .add(predOps(ARMCC::AL));
+    if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI))
+      return false;
+
+    // Branch conditionally.
+    auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc))
+                      .add(I.getOperand(1))
+                      .add(predOps(ARMCC::EQ, ARM::CPSR));
+    if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI))
+      return false;
+    I.eraseFromParent();
+    return true;
+  }
   default:
     return false;
   }
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index f23e62595d2e..1c17c07e4cb0 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -66,14 +66,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
       setAction({Op, s32}, Libcall);
   }
 
-  // FIXME: Support s8 and s16 as well
-  for (unsigned Op : {G_SREM, G_UREM})
+  for (unsigned Op : {G_SREM, G_UREM}) {
+    for (auto Ty : {s8, s16})
+      setAction({Op, Ty}, WidenScalar);
     if (ST.hasDivideInARMMode())
       setAction({Op, s32}, Lower);
    else if (AEABI(ST))
      setAction({Op, s32}, Custom);
    else
      setAction({Op, s32}, Libcall);
+  }
 
   for (unsigned Op : {G_SEXT, G_ZEXT}) {
     setAction({Op, s32}, Legal);
@@ -88,6 +90,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   setAction({G_SELECT, p0}, Legal);
   setAction({G_SELECT, 1, s1}, Legal);
 
+  setAction({G_BRCOND, s1}, Legal);
+
   setAction({G_CONSTANT, s32}, Legal);
   for (auto Ty : {s1, s8, s16})
     setAction({G_CONSTANT, Ty}, WidenScalar);
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 13acea3c28a9..48b02d40b246 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -153,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
     break;
   }
 
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-
+  for (const MachineOperand &MO : MI->operands()) {
     MCOperand MCOp;
     if (AP.lowerOperand(MO, MCOp)) {
       if (MCOp.isImm() && EncodeImms) {
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index c0c09e8c15af..844930235894 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -259,6 +259,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     break;
   case G_SELECT: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    (void)Ty;
     LLT Ty2 = MRI.getType(MI.getOperand(1).getReg());
     (void)Ty2;
     assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT");
@@ -282,6 +283,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case G_FCMP: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+    (void)Ty;
     LLT Ty1 = MRI.getType(MI.getOperand(2).getReg());
     LLT Ty2 = MRI.getType(MI.getOperand(3).getReg());
     (void)Ty2;
@@ -329,6 +331,13 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
                                 &ARM::ValueMappings[ARM::DPR3OpsIdx]});
     break;
   }
+  case G_BR:
+    OperandsMapping = getOperandsMapping({nullptr});
+    break;
+  case G_BRCOND:
+    OperandsMapping =
+        getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
+    break;
   default:
     return getInvalidInstructionMapping();
   }
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index f41da3e8e223..22ce949367f3 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -47,6 +47,8 @@ class ARMBaseTargetMachine : public LLVMTargetMachine {
   ~ARMBaseTargetMachine() override;
 
   const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+  // The no argument getSubtargetImpl, while it exists on some targets, is
+  // deprecated and should not be used.
   const ARMSubtarget *getSubtargetImpl() const = delete;
   bool isLittleEndian() const { return isLittle; }
 
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index cc7a7c3849bc..81b0aa7f8b98 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -515,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI.getDebugLoc();
+  bool isSelectOp = MI.getOpcode() == BPF::Select;
 
-  assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert");
+  assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert");
 
   // To "insert" a SELECT instruction, we actually have to insert the diamond
   // control-flow pattern.  The incoming instruction knows the destination vreg
@@ -548,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 
   // Insert Branch if Flag
   unsigned LHS = MI.getOperand(1).getReg();
-  unsigned RHS = MI.getOperand(2).getReg();
   int CC = MI.getOperand(3).getImm();
+  int NewCC;
   switch (CC) {
   case ISD::SETGT:
-    BuildMI(BB, DL, TII.get(BPF::JSGT_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri;
     break;
   case ISD::SETUGT:
-    BuildMI(BB, DL, TII.get(BPF::JUGT_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri;
     break;
   case ISD::SETGE:
-    BuildMI(BB, DL, TII.get(BPF::JSGE_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri;
     break;
   case ISD::SETUGE:
-    BuildMI(BB, DL, TII.get(BPF::JUGE_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri;
    break;
   case ISD::SETEQ:
-    BuildMI(BB, DL, TII.get(BPF::JEQ_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri;
     break;
   case ISD::SETNE:
-    BuildMI(BB, DL, TII.get(BPF::JNE_rr))
-        .addReg(LHS)
-        .addReg(RHS)
-        .addMBB(Copy1MBB);
+    NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri;
     break;
   default:
     report_fatal_error("unimplemented select CondCode " + Twine(CC));
   }
+  if (isSelectOp)
+    BuildMI(BB, DL, TII.get(NewCC))
+        .addReg(LHS)
+        .addReg(MI.getOperand(2).getReg())
+        .addMBB(Copy1MBB);
+  else
+    BuildMI(BB, DL, TII.get(NewCC))
+        .addReg(LHS)
+        .addImm(MI.getOperand(2).getImm())
+        .addMBB(Copy1MBB);
 
   // Copy0MBB:
   //  %FalseValue = ...
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index 5ad777268208..f68357809add 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -460,6 +460,11 @@ let usesCustomInserter = 1 in {
                      "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
                      [(set i64:$dst,
                       (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>;
+  def Select_Ri : Pseudo<(outs GPR:$dst),
+                     (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
+                     "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+                     [(set i64:$dst,
+                      (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>;
 }
 
 // load 64-bit global addr into register
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index b064778c4bbd..d75d95a6baea 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexbit"
-
 #include "HexagonBitTracker.h"
 #include "HexagonTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
@@ -42,6 +40,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "hexbit"
+
 using namespace llvm;
 
 static cl::opt PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 2dc74632e9be..30ebf89c9808 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -45863,6 +45863,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000000;
 let isSoloAin1 = 1;
+let hasSideEffects = 1;
 }
 def Y2_dccleaninva : HInst<
 (outs),
@@ -45872,6 +45873,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000010;
 let isSoloAin1 = 1;
+let hasSideEffects = 1;
 }
 def Y2_dcfetch : HInst<
 (outs),
@@ -45900,6 +45902,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000001;
 let isSoloAin1 = 1;
+let hasSideEffects = 1;
 }
 def Y2_dczeroa : HInst<
 (outs),
@@ -45909,6 +45912,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 {
 let Inst{13-0} = 0b00000000000000;
 let Inst{31-21} = 0b10100000110;
 let isSoloAin1 = 1;
+let hasSideEffects = 1;
 let mayStore = 1;
 }
 def Y2_icinva : HInst<
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 03c4a83594b3..80361015e649 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -59,8 +59,6 @@
 //         J2_jump , %PC
 //     Successors according to CFG: BB#6 BB#3
 
-#define DEBUG_TYPE "hexagon-eif"
-
 #include "Hexagon.h"
 #include "HexagonInstrInfo.h"
 #include "HexagonSubtarget.h"
@@ -90,6 +88,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "hexagon-eif"
+
 using namespace llvm;
 
 namespace llvm {
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 734f3c6658d9..a2f6dd68c1a1 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -86,8 +86,6 @@
 // however, is that finding the locations where the implicit uses need
 // to be added, and updating the live ranges will be more involved.
-#define DEBUG_TYPE "expand-condsets"
-
 #include "HexagonInstrInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
@@ -116,6 +114,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "expand-condsets"
+
 using namespace llvm;
 
 static cl::opt OptTfrLimit("expand-condsets-tfr-limit",
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index c790579ccebc..e5e75198b2d1 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -8,8 +8,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexagon-pei"
-
 #include "HexagonFrameLowering.h"
 #include "HexagonBlockRanges.h"
 #include "HexagonInstrInfo.h"
@@ -63,6 +61,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "hexagon-pei"
+
 // Hexagon stack frame layout as defined by the ABI:
 //
 // Incoming arguments
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index bf31e1699284..0a955aedaf1a 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexinsert"
-
 #include "BitTracker.h"
 #include "HexagonBitTracker.h"
 #include "HexagonInstrInfo.h"
@@ -44,6 +42,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "hexinsert"
+
 using namespace llvm;
 
 static cl::opt VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U),
diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 3470480d607d..2da211563e0a 100644
--- a/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "gen-pred"
-
 #include "HexagonInstrInfo.h"
 #include "HexagonSubtarget.h"
 #include "llvm/ADT/SetVector.h"
@@ -35,6 +33,8 @@
 #include
 #include
 
+#define DEBUG_TYPE "gen-pred"
+
 using namespace llvm;
 
 namespace llvm {
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 67242764d453..3997702bc962 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1286,16 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       MachinePointerInfo(SV));
 }
 
-// Creates a SPLAT instruction for a constant value VAL.
-static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
-                           SDValue Val) {
-  EVT T = VT.getVectorElementType();
-  if (T == MVT::i8 || T == MVT::i16)
-    return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val);
-
-  return SDValue();
-}
-
 static bool isSExtFree(SDValue N) {
   // A sign-extend of a truncate of a sign-extend is free.
   if (N.getOpcode() == ISD::TRUNCATE &&
@@ -1374,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-// Handle only specific vector loads.
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  LoadSDNode *LoadNode = cast(Op);
-  SDValue Chain = LoadNode->getChain();
-  SDValue Ptr = Op.getOperand(1);
-  SDValue LoweredLoad;
-  SDValue Result;
-  SDValue Base = LoadNode->getBasePtr();
-  ISD::LoadExtType Ext = LoadNode->getExtensionType();
-  unsigned Alignment = LoadNode->getAlignment();
-  SDValue LoadChain;
-
-  if(Ext == ISD::NON_EXTLOAD)
-    Ext = ISD::ZEXTLOAD;
-
-  if (VT == MVT::v4i16) {
-    if (Alignment == 2) {
-      SDValue Loads[4];
-      // Base load.
-      Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base,
-                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
-                                LoadNode->getMemOperand()->getFlags());
-      // Base+2 load.
-      SDValue Increment = DAG.getConstant(2, DL, MVT::i32);
-      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
-      Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
-                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
-                                LoadNode->getMemOperand()->getFlags());
-      // SHL 16, then OR base and base+2.
-      SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32);
-      SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
-      SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]);
-      // Base + 4.
-      Increment = DAG.getConstant(4, DL, MVT::i32);
-      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
-      Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
-                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
-                                LoadNode->getMemOperand()->getFlags());
-      // Base + 6.
-      Increment = DAG.getConstant(6, DL, MVT::i32);
-      Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
-      Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
-                                LoadNode->getPointerInfo(), MVT::i16, Alignment,
-                                LoadNode->getMemOperand()->getFlags());
-      // SHL 16, then OR base+4 and base+6.
-      Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
-      SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]);
-      // Combine to i64. This could be optimised out later if we can
-      // affect reg allocation of this code.
-      Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2);
-      LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                              Loads[0].getValue(1), Loads[1].getValue(1),
-                              Loads[2].getValue(1), Loads[3].getValue(1));
-    } else {
-      // Perform default type expansion.
-      Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
-                           LoadNode->getAlignment(),
-                           LoadNode->getMemOperand()->getFlags());
-      LoadChain = Result.getValue(1);
-    }
-  } else
-    llvm_unreachable("Custom lowering unsupported load");
-
-  Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
-  // Since we pretend to lower a load, we need the original chain
-  // info attached to the result.
-  SDValue Ops[] = { Result, LoadChain };
-
-  return DAG.getMergeValues(Ops, DL);
-}
-
 SDValue
 HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   EVT ValTy = Op.getValueType();
@@ -1971,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   // Handling of vector operations.
   //
 
-  // Custom lower v4i16 load only. Let v4i16 store to be
-  // promoted for now.
   promoteLdStType(MVT::v4i8,  MVT::i32);
   promoteLdStType(MVT::v2i16, MVT::i32);
   promoteLdStType(MVT::v8i8,  MVT::i64);
+  promoteLdStType(MVT::v4i16, MVT::i64);
   promoteLdStType(MVT::v2i32, MVT::i64);
 
-  setOperationAction(ISD::LOAD,  MVT::v4i16, Custom);
-  setOperationAction(ISD::STORE, MVT::v4i16, Promote);
-  AddPromotedToType(ISD::LOAD,  MVT::v4i16, MVT::i64);
-  AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
-
   // Set the action for vector operations to "expand", then override it with
   // either "custom" or "legal" for specific cases.
   static const unsigned VectExpOps[] = {
@@ -2301,7 +2212,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case HexagonISD::RET_FLAG:      return "HexagonISD::RET_FLAG";
   case HexagonISD::TC_RETURN:     return "HexagonISD::TC_RETURN";
   case HexagonISD::VCOMBINE:      return "HexagonISD::VCOMBINE";
-  case HexagonISD::VPACK:         return "HexagonISD::VPACK";
+  case HexagonISD::VPACKE:        return "HexagonISD::VPACKE";
+  case HexagonISD::VPACKO:        return "HexagonISD::VPACKO";
   case HexagonISD::VASL:          return "HexagonISD::VASL";
   case HexagonISD::VASR:          return "HexagonISD::VASR";
   case HexagonISD::VLSR:          return "HexagonISD::VLSR";
@@ -2394,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
 
     // Test if V1 is a SCALAR_TO_VECTOR.
     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return createSplat(DAG, dl, VT, V1.getOperand(0));
+      return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
 
     // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
     // (and probably will turn into a SCALAR_TO_VECTOR once legalization
@@ -2409,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
       }
     }
     if (IsScalarToVector)
-      return createSplat(DAG, dl, VT, V1.getOperand(0));
+      return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
   }
-  return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32));
+  return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
+                     DAG.getConstant(Lane, dl, MVT::i32));
 }
 
   if (UseHVX) {
     ArrayRef Mask = SVN->getMask();
     size_t MaskLen = Mask.size();
-    int ElemSizeInBits = VT.getScalarSizeInBits();
-    if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) ||
-        (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) {
-      // Return 1 for odd and 2 of even
-      StridedLoadKind Pattern = isStridedLoad(Mask);
+    unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen;
 
+    if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) ||
+        (Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) {
+      StridedLoadKind Pattern = isStridedLoad(Mask);
       if (Pattern == StridedLoadKind::NoPattern)
         return SDValue();
 
-      SDValue Vec0 = Op.getOperand(0);
-      SDValue Vec1 = Op.getOperand(1);
-      SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32);
-      SDValue Ops[] = { Vec1, Vec0, StridePattern };
-      return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops);
+      unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE
+                                                      : HexagonISD::VPACKO;
+      return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)});
     }
     // We used to assert in the "else" part here, but that is bad for Halide
     // Halide creates intermediate double registers by interleaving two
@@ -2531,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (Size > 64)
     return SDValue();
 
-  APInt APSplatBits, APSplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
   unsigned NElts = BVN->getNumOperands();
 
   // Try to generate a SPLAT instruction.
-  if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) &&
-      (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
-                            HasAnyUndefs, 0, true) && SplatBitSize <= 16)) {
-    unsigned SplatBits = APSplatBits.getZExtValue();
-    int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >>
-                       (32 - SplatBitSize));
-    return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32));
+  if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) {
+    APInt APSplatBits, APSplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, false)) {
+      if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) {
+        unsigned ZV = APSplatBits.getZExtValue();
+        assert(SplatBitSize <= 32 && "Can only handle up to i32");
+        // Sign-extend the splat value from SplatBitSize to 32.
+        int32_t SV = SplatBitSize < 32
+                       ? int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize)
+                       : int32_t(ZV);
+        return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
+                           DAG.getConstant(SV, dl, MVT::i32));
+      }
+    }
   }
 
   // Try to generate COMBINE to build v2i32 vectors.
@@ -2974,8 +2891,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::BlockAddress:         return LowerBlockAddress(Op, DAG);
     case ISD::GLOBAL_OFFSET_TABLE:  return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
     case ISD::VASTART:              return LowerVASTART(Op, DAG);
-    // Custom lower some vector loads.
-    case ISD::LOAD:                 return LowerLOAD(Op, DAG);
     case ISD::DYNAMIC_STACKALLOC:   return LowerDYNAMIC_STACKALLOC(Op, DAG);
     case ISD::SETCC:                return LowerSETCC(Op, DAG);
     case ISD::VSELECT:              return LowerVSELECT(Op, DAG);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index bfd2c94eeaba..d66cbc95e918 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -62,7 +62,8 @@ namespace HexagonISD {
       EXTRACTU,
       EXTRACTURP,
       VCOMBINE,
-      VPACK,
+      VPACKE,
+      VPACKO,
       TC_RETURN,
       EH_RETURN,
       DCFETCH,
@@ -164,7 +165,6 @@ namespace HexagonISD {
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
 
     bool CanLowerReturn(CallingConv::ID CallConv,
                         MachineFunction &MF, bool isVarArg,
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index c611857ec26a..104a28654dd5 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1366,6 +1366,18 @@ defm : MaskedStore ;
 defm : MaskedStore ;
 defm : MaskedStore ;
 
+//*******************************************************************
+// SYSTEM
+//*******************************************************************
+
+def: T_R_pat;
+def: T_R_pat;
+def: T_R_pat;
+def: T_R_pat;
+
+def: T_RR_pat;
+def: T_RP_pat;
+
 include "HexagonIntrinsicsV3.td"
 include "HexagonIntrinsicsV4.td"
 include "HexagonIntrinsicsV5.td"
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index a331c978f59d..374ffa3799b0 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -10,8 +10,6 @@
 // load/store instructions.
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "opt-addr-mode" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" @@ -36,6 +34,8 @@ #include #include +#define DEBUG_TYPE "opt-addr-mode" + static cl::opt CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ba98b8994937..804a547d5b33 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2250,6 +2250,12 @@ def: Storea_pat, I32, addrgp, PS_storerhabs>; def: Storea_pat, I32, addrgp, PS_storeriabs>; def: Storea_pat, I64, addrgp, PS_storerdabs>; +// Prefer this pattern to S2_asl_i_p_or for the special case of joining +// two 32-bit words into a 64-bit word. +let AddedComplexity = 200 in +def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)), + (A2_combinew I32:$a, I32:$b)>; + def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)), (i64 (zext (i32 (and I32:$a, (i32 65535)))))), (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))), @@ -2971,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, Requires<[UseHVXDbl]>; -def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>, - SDTCisInt<3>]>; +def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>; -def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>; +def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>; +def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>; -// 0 as the last argument denotes vpacke. 
1 denotes vpacko -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 0))), - (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 1))), - (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 0))), - (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 1))), - (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; +let Predicates = [UseHVXSgl] in { + def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>; +} -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 1))), - (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 1))), - (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; +let Predicates = [UseHVXDbl] in { + def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; +} def V2I1: PatLeaf<(v2i1 PredRegs:$R)>; def V4I1: PatLeaf<(v4i1 PredRegs:$R)>; @@ -3073,6 +3074,10 @@ def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // four halfwords of 64-bits destination register. 
def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)), + (A2_combineii imm:$s8, imm:$s8)>; +def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>; + class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type> : Pat <(Op Type:$Rss, Type:$Rtt), @@ -3099,14 +3104,11 @@ def: VArith_pat ; def: VArith_pat ; def: VArith_pat ; -def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_lsr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asl_i_vw V2I32:$b, imm:$c)>; def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 34df2ebcc520..ea86c9c42f47 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -53,6 +53,10 @@ static cl::opt<bool> EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), cl::desc("Emit hexagon jump tables in function section")); +static cl::opt<bool> + EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon lookup tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -136,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal( << (Kind.isBSS() ? "kind_bss " : "" ) << (Kind.isBSSLocal() ? "kind_bss_local " : "" )); + // If the lookup table is used by more than one function, do not place + // it in the text section. + if (EmitLutInText && GO->getName().startswith("switch.table")) { + if (const Function *Fn = getLutUsedFunction(GO)) + return selectSectionForLookupTable(GO, TM, Fn); + } + if (isGlobalInSmallSection(GO, TM)) return selectSmallSectionForGlobal(GO, Kind, TM); @@ -402,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +// Return the function that uses the lookup table. If more than one live +// function uses this lookup table, bail out and place the lookup table in +// the default section. +const Function * +HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const { + const Function *ReturnFn = nullptr; + for (auto U : GO->users()) { + // Validate that each user is an instruction inside a live function. + auto *I = dyn_cast<Instruction>(U); + if (!I) + continue; + auto *Bb = I->getParent(); + if (!Bb) + continue; + auto *UserFn = Bb->getParent(); + if (!ReturnFn) + ReturnFn = UserFn; + else if (ReturnFn != UserFn) + return nullptr; + } + return ReturnFn; +} + +MCSection *HexagonTargetObjectFile::selectSectionForLookupTable( + const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const { + + SectionKind Kind = SectionKind::getText(); + // If the function has an explicit section, place the lookup table in + // this explicit section.
+ if (Fn->hasSection()) + return getExplicitSectionGlobal(Fn, Kind, TM); + + const auto *FuncObj = dyn_cast<GlobalObject>(Fn); + return SelectSectionForGlobal(FuncObj, Kind, TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 373d850b53be..eff44f097e03 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -36,6 +36,8 @@ namespace llvm { bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, const Function &F) const override; + const Function *getLutUsedFunction(const GlobalObject *GO) const; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; @@ -46,6 +48,10 @@ namespace llvm { MCSection *selectSmallSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const; + + MCSection *selectSectionForLookupTable(const GlobalObject *GO, + const TargetMachine &TM, + const Function *Fn) const; }; } // namespace llvm diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b72c9d534478..e12188e70602 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,9 +304,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); - bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -2514,16 +2511,6 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; - case Mips::MFTC0: case Mips::MTTC0: - case Mips::MFTGPR: case Mips::MTTGPR: - case Mips::MFTLO: case Mips::MTTLO: - case Mips::MFTHI: case Mips::MTTHI: - case Mips::MFTACX: case Mips::MTTACX: - case Mips::MFTDSP: case Mips::MTTDSP: - case Mips::MFTC1: case Mips::MTTC1: - case Mips::MFTHC1: case Mips::MTTHC1: - case Mips::CFTC1: case Mips::CTTC1: - return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4895,212 +4882,6 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -// Map the DSP accumulator and control register to the corresponding gpr -// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions -// do not map the DSP registers contigously to gpr registers. -static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { - switch (Inst.getOpcode()) { - case Mips::MFTLO: - case Mips::MTTLO: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::ZERO; - case Mips::AC1: - return Mips::A0; - case Mips::AC2: - return Mips::T0; - case Mips::AC3: - return Mips::T4; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTHI: - case Mips::MTTHI: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::AT; - case Mips::AC1: - return Mips::A1; - case Mips::AC2: - return Mips::T1; - case Mips::AC3: - return Mips::T5; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTACX: - case Mips::MTTACX: - switch (Inst.getOperand(IsMFDSP ?
1 : 0).getReg()) { - case Mips::AC0: - return Mips::V0; - case Mips::AC1: - return Mips::A2; - case Mips::AC2: - return Mips::T2; - case Mips::AC3: - return Mips::T6; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTDSP: - case Mips::MTTDSP: - return Mips::S0; - default: - llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); - } -} - -// Map the floating point register operand to the corresponding register -// operand. -static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { - switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { - case Mips::F0: return Mips::ZERO; - case Mips::F1: return Mips::AT; - case Mips::F2: return Mips::V0; - case Mips::F3: return Mips::V1; - case Mips::F4: return Mips::A0; - case Mips::F5: return Mips::A1; - case Mips::F6: return Mips::A2; - case Mips::F7: return Mips::A3; - case Mips::F8: return Mips::T0; - case Mips::F9: return Mips::T1; - case Mips::F10: return Mips::T2; - case Mips::F11: return Mips::T3; - case Mips::F12: return Mips::T4; - case Mips::F13: return Mips::T5; - case Mips::F14: return Mips::T6; - case Mips::F15: return Mips::T7; - case Mips::F16: return Mips::S0; - case Mips::F17: return Mips::S1; - case Mips::F18: return Mips::S2; - case Mips::F19: return Mips::S3; - case Mips::F20: return Mips::S4; - case Mips::F21: return Mips::S5; - case Mips::F22: return Mips::S6; - case Mips::F23: return Mips::S7; - case Mips::F24: return Mips::T8; - case Mips::F25: return Mips::T9; - case Mips::F26: return Mips::K0; - case Mips::F27: return Mips::K1; - case Mips::F28: return Mips::GP; - case Mips::F29: return Mips::SP; - case Mips::F30: return Mips::FP; - case Mips::F31: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc1 alias!"); - } -} - -// Map the coprocessor operand the corresponding gpr register operand. -static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { - switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg()) { - case Mips::COP00: return Mips::ZERO; - case Mips::COP01: return Mips::AT; - case Mips::COP02: return Mips::V0; - case Mips::COP03: return Mips::V1; - case Mips::COP04: return Mips::A0; - case Mips::COP05: return Mips::A1; - case Mips::COP06: return Mips::A2; - case Mips::COP07: return Mips::A3; - case Mips::COP08: return Mips::T0; - case Mips::COP09: return Mips::T1; - case Mips::COP010: return Mips::T2; - case Mips::COP011: return Mips::T3; - case Mips::COP012: return Mips::T4; - case Mips::COP013: return Mips::T5; - case Mips::COP014: return Mips::T6; - case Mips::COP015: return Mips::T7; - case Mips::COP016: return Mips::S0; - case Mips::COP017: return Mips::S1; - case Mips::COP018: return Mips::S2; - case Mips::COP019: return Mips::S3; - case Mips::COP020: return Mips::S4; - case Mips::COP021: return Mips::S5; - case Mips::COP022: return Mips::S6; - case Mips::COP023: return Mips::S7; - case Mips::COP024: return Mips::T8; - case Mips::COP025: return Mips::T9; - case Mips::COP026: return Mips::K0; - case Mips::COP027: return Mips::K1; - case Mips::COP028: return Mips::GP; - case Mips::COP029: return Mips::SP; - case Mips::COP030: return Mips::FP; - case Mips::COP031: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc0 alias!"); - } -} - -/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing -/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. 
-bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI) { - MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned rd = 0; - unsigned u = 1; - unsigned sel = 0; - unsigned h = 0; - bool IsMFTR = false; - switch (Inst.getOpcode()) { - case Mips::MFTC0: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTC0: - u = 0; - rd = getRegisterForMxtrC0(Inst, IsMFTR); - sel = Inst.getOperand(2).getImm(); - break; - case Mips::MFTGPR: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTGPR: - rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); - break; - case Mips::MFTLO: - case Mips::MFTHI: - case Mips::MFTACX: - case Mips::MFTDSP: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTLO: - case Mips::MTTHI: - case Mips::MTTACX: - case Mips::MTTDSP: - rd = getRegisterForMxtrDSP(Inst, IsMFTR); - sel = 1; - break; - case Mips::MFTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MFTC1: - IsMFTR = true; - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::MTTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::CFTC1: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::CTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 3; - break; - } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd - : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() - : Inst.getOperand(0).getReg()); - - TOut.emitRRIII(IsMFTR ? Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, - STI); - return false; -} - unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 7caeb08589af..2907b7715857 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -193,21 +193,6 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } -void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, - unsigned Reg1, int16_t Imm0, int16_t Imm1, - int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(MCOperand::createReg(Reg0)); - TmpInst.addOperand(MCOperand::createReg(Reg1)); - TmpInst.addOperand(MCOperand::createImm(Imm0)); - TmpInst.addOperand(MCOperand::createImm(Imm1)); - TmpInst.addOperand(MCOperand::createImm(Imm2)); - TmpInst.setLoc(IDLoc); - getStreamer().EmitInstruction(TmpInst, *STI); -} - void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index d2f0fdcc6cc1..6ceb05577538 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -190,6 +190,9 @@ def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", + "Disable use of the jal instruction">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a6ec9fb2e598..20319f85696c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + } + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -923,14 +936,127 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } +static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, + const MipsSubtarget &Subtarget) { + // ROOTNode must have a multiplication as an operand for the match to be + // successful. + if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL && + ROOTNode->getOperand(1).getOpcode() != ISD::MUL) + return SDValue(); + + // We don't handle vector types here. + if (ROOTNode->getValueType(0).isVector()) + return SDValue(); + + // For MIPS64, madd / msub instructions are inefficient to use with 64 bit + // arithmetic. E.g. + // (add (mul a b) c) => + // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in + // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32) + // or + // MIPS64R2: (dins (mflo res) (mfhi res) 32 32) + // + // The overhead of setting up the Hi/Lo registers and reassembling the + // result makes this a dubious optimization for MIPS64. The core of the + // problem is that Hi/Lo contain the upper and lower 32 bits of the + // operand and result. + // + // It requires a chain of 4 add/mul for MIPS64R2 to get better code + // density than doing it naively, 5 for MIPS64. Additionally, using + // madd/msub on MIPS64 requires the operands actually be 32 bit sign + // extended operands, not true 64 bit values. + // + // FIXME: For the moment, disable this completely for MIPS64. + if (Subtarget.hasMips64()) + return SDValue(); + + SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(0) + : ROOTNode->getOperand(1); + + SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL + ? ROOTNode->getOperand(1) + : ROOTNode->getOperand(0); + + // Transform this to a MADD only if the user of this node is the add. + // If there are other users of the mul, this function returns here. + if (!Mult.hasOneUse()) + return SDValue(); + + // maddu and madd are unusual instructions in that on MIPS64 bits 63..31 + // must be in canonical form, i.e. sign extended.
For MIPS32, the operands + // of the multiply must have 32 or more sign bits, otherwise we cannot + // perform this optimization. We have to check this here as we're performing + // this optimization pre-legalization. + SDValue MultLHS = Mult->getOperand(0); + SDValue MultRHS = Mult->getOperand(1); + + bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND && + MultRHS->getOpcode() == ISD::SIGN_EXTEND; + bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND && + MultRHS->getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Initialize accumulator. + SDLoc DL(ROOTNode); + SDValue TopHalf; + SDValue BottomHalf; + BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(0, DL)); + + TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand, + CurDAG.getIntPtrConstant(1, DL)); + SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, + BottomHalf, + TopHalf); + + // Create MipsMAdd(u) / MipsMSub(u) node. + bool IsAdd = ROOTNode->getOpcode() == ISD::ADD; + unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd) + : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub); + SDValue MAddOps[3] = { + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; + EVT VTs[2] = {MVT::i32, MVT::i32}; + SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); + + SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + SDValue Combined = + CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); + return Combined; +} + +static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); + + return SDValue(); + } + + return SDValue(); +} + static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + // (add v0 (mul v1, v2)) => (madd v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); - if (DCI.isBeforeLegalizeOps()) return SDValue(); + } + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1058,6 +1184,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); + case ISD::SUB: + return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -3021,6 +3149,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT Ty = Callee.getValueType(); bool GlobalOrExternal = false, IsCallReloc = false; + // The long-calls feature is ignored in case of PIC. + // While we do not support -mshared / -mno-shared properly, + // ignore long-calls in case of -mabicalls too. 
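One note on the performMADD_MSUBCombine logic above, before the long-calls guard that follows: the combine splits the i64 accumulator into its halves (EXTRACT_ELEMENT), seeds the HI/LO pair (MTLOHI), runs the 32x32-bit multiply-accumulate, and reassembles the result (MFLO/MFHI + BUILD_PAIR). The arithmetic identity it relies on can be checked in plain C++ (a standalone sketch under the combine's sign-extension precondition, not part of the patch; msub is the same with the product subtracted):

#include <cassert>
#include <cstdint>

// What a MIPS madd computes: HI/LO += (int64_t)a * (int64_t)b, with a and b
// 32-bit sign-extended operands and HI/LO modelled as one 64-bit accumulator.
static int64_t madd(int64_t acc, int32_t a, int32_t b) {
  int32_t lo = static_cast<int32_t>(acc);       // EXTRACT_ELEMENT 0 / MFLO
  int32_t hi = static_cast<int32_t>(acc >> 32); // EXTRACT_ELEMENT 1 / MFHI
  int64_t hilo = (static_cast<int64_t>(hi) << 32) |
                 static_cast<uint32_t>(lo);     // MTLOHI / BUILD_PAIR
  return hilo + static_cast<int64_t>(a) * b;
}

int main() {
  int32_t a = -12345, b = 678;
  int64_t acc = 0x123456789abcLL;
  // Valid only when both multiply operands are sign-extended from i32
  // (zero-extended for the maddu form), as the combine checks.
  assert(madd(acc, a, b) == acc + static_cast<int64_t>(a) * b);
  return 0;
}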
+ if (Subtarget.useLongCalls() && !Subtarget.isABICalls() && !IsPIC) { + // Get the address of the callee into a register to prevent + // the use of the `jal` instruction for the direct call. + if (auto *N = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + else if (auto *N = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + } + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { if (IsPIC) { const GlobalValue *Val = G->getGlobal(); diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 94f3a74be98b..0333fe6520fa 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in { } def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; +def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; +def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} + let AdditionalPredicates = [NotInMicroMips] in { def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2, FGR_32; diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index edc0981e6278..64bee5bfba18 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -35,8 +35,6 @@ class FIELD5<bits<5> Val> { def FIELD5_1_DMT_EMT : FIELD5<0b00001>; def FIELD5_2_DMT_EMT : FIELD5<0b01111>; def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; -def FIELD5_MFTR : FIELD5<0b01000>; -def FIELD5_MTTR : FIELD5<0b01100>; class COP0_MFMC0_MT : MipsMTInst { bits<32> Inst; @@ -52,25 +50,6 @@ class COP0_MFMC0_MT : MipsMTInst { let Inst{2-0} = 0b001; } -class COP0_MFTTR_MT : MipsMTInst { - bits<32> Inst; - - bits<5> rt; - bits<5> rd; - bits<1> u; - bits<1> h; - bits<3> sel; - let Inst{31-26} = 0b010000; // COP0 - let Inst{25-21} = Op.Value; // MFMC0 - let Inst{20-16} = rt; - let Inst{15-11} = rd; - let Inst{10-6} = 0b00000; // rx - currently unsupported. - let Inst{5} = u; - let Inst{4} = h; - let Inst{3} = 0b0; - let Inst{2-0} = sel; -} - class SPECIAL3_MT_FORK : MipsMTInst { bits<32> Inst; diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td index 72e626cbec40..ab6693f60fd9 100644 --- a/lib/Target/Mips/MipsMTInstrInfo.td +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -6,13 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the MIPS MT ASE as defined by MD00378 1.12. -// -// TODO: Add support for the microMIPS encodings for the MT ASE and add the -// instruction mappings.
-// -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // MIPS MT Instruction Encodings @@ -34,10 +27,6 @@ class FORK_ENC : SPECIAL3_MT_FORK; class YIELD_ENC : SPECIAL3_MT_YIELD; -class MFTR_ENC : COP0_MFTTR_MT; - -class MTTR_ENC : COP0_MFTTR_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Descriptions //===----------------------------------------------------------------------===// @@ -50,22 +39,6 @@ class MT_1R_DESC_BASE { InstrItinClass Itinerary = Itin; } -class MFTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MFTR; -} - -class MTTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; - list Pattern = []; - InstrItinClass Itinerary = II_MTTR; -} - class FORK_DESC { dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); dag InOperandList = (ins GPR32Opnd:$rt); @@ -106,73 +79,8 @@ let hasSideEffects = 1, isNotDuplicable = 1, def FORK : FORK_ENC, FORK_DESC, ASE_MT; def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; - - def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; - - def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; } -//===----------------------------------------------------------------------===// -// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. -//===----------------------------------------------------------------------===// -def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, - uimm3:$sel), - "mftc0 $rd, $rt, $sel">, ASE_MT; - -def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mftgpr $rd, $rt">, ASE_MT; - -def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftlo $rt, $ac">, ASE_MT; - -def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mfthi $rt, $ac">, ASE_MT; - -def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftacx $rt, $ac">, ASE_MT; - -def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), - "mftdsp $rt">, ASE_MT; - -def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mftc1 $rt, $ft">, ASE_MT; - -def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mfthc1 $rt, $ft">, ASE_MT; - -def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), - "cftc1 $rt, $ft">, ASE_MT; - - -def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mttc0 $rt, $rd, $sel">, ASE_MT; - -def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), - "mttgpr $rd, $rt">, ASE_MT; - -def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttlo $rt, $ac">, ASE_MT; - -def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mtthi $rt, $ac">, ASE_MT; - -def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttacx $rt, $ac">, ASE_MT; - -def MTTDSP : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rt), - "mttdsp $rt">, ASE_MT; - -def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mttc1 $rt, $ft">, ASE_MT; - -def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mtthc1 
$rt, $ft">, ASE_MT; - -def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), - "cttc1 $rt, $ft">, ASE_MT; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Definitions //===----------------------------------------------------------------------===// @@ -187,22 +95,4 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; - - def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 49ae6dd4cd39..4be26dd25dc0 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, - SDValue CmpLHS, const SDLoc &DL, - SDNode *Node) const { - unsigned Opc = InFlag.getOpcode(); (void)Opc; - - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == ISD::SUBE)) && - "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); - - unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; - if (Subtarget->isGP64bit()) { - SLTuOp = Mips::SLTu64; - ADDuOp = Mips::DADDu; - } - - SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; +void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = InFlag.getOpcode(); SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); - - if (Subtarget->isGP64bit()) { - // On 64-bit targets, sltu produces an i64 but our backend currently says - // that SLTu64 produces an i32. We need to fix this in the long run but for - // now, just make the DAG type-correct by asserting the upper bits are zero. - Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, - CurDAG->getTargetConstant(0, DL, VT), - SDValue(Carry, 0), - CurDAG->getTargetConstant(Mips::sub_32, DL, - VT)); + // In the base case, we can rely on the carry bit from the addsc + // instruction. + if (Opc == ISD::ADDC) { + SDValue Ops[3] = {LHS, RHS, InFlag}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); + return; } - // Generate a second addition only if we know that RHS is not a - // constant-zero node. - SDNode *AddCarry = Carry; - ConstantSDNode *C = dyn_cast(RHS); - if (!C || C->getZExtValue()) - AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS); + assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!"); - CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0)); + // The more complex case is when there is a chain of ISD::ADDE nodes like: + // (adde (adde (adde (addc a b) c) d) e). 
+ // + // The addwc instruction does not write to the carry bit, instead it writes + // to bit 20 of the dsp control register. To match this series of nodes, each + // intermediate adde node must be expanded to write the carry bit before the + // addition. + + // Start by reading the overflow field for addsc and moving the value to the + // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP + // corresponds to reading/writing the entire control register to/from a GPR. + + SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32); + + SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32); + + SDNode *DSPCtrlField = + CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag); + + SDNode *Carry = CurDAG->getMachineNode( + Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne); + + SDValue Ops[4] = {SDValue(DSPCtrlField, 0), + CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne, + SDValue(Carry, 0)}; + SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops); + + // My reading of the MIPS DSP 3.01 specification isn't as clear as I + // would like about whether bit 20 always gets overwritten by addwc. + // Hence take an extremely conservative view and presume it's sticky. We + // therefore need to clear it. + + SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32); + + SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)}; + SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps); + + SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue, + SDValue(DSPCtrlFinal, 0), CstOne); + + SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands); } /// Match frameindex @@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) { switch(Opcode) { default: break; - case ISD::SUBE: { - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu; - selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node); - return true; - } - case ISD::ADDE: { - if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC. - break; - SDValue InFlag = Node->getOperand(2); - unsigned Opc = Subtarget->isGP64bit() ?
Mips::DADDu : Mips::ADDu; - selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); + selectAddE(Node, DL); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index f89a350cab04..6f38289c5a45 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,8 +41,7 @@ class MipsSEDAGToDAGISel : public MipsDAGToDAGISel { const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, - const SDLoc &DL, SDNode *Node) const; + void selectAddE(SDNode *Node, const SDLoc &DL) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 06a97b9d123e..72b2738bfac4 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setTargetDAGCombine(ISD::ADDE); - setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } -// selectMADD - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc multLo, Lo0), (adde multHi, Hi0), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { - // ADDENode's second operand must be a flag output of an ADDC node in order - // for the matching to be successful. - SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); - - if (ADDCNode->getOpcode() != ISD::ADDC) - return false; - - SDValue MultHi = ADDENode->getOperand(0); - SDValue MultLo = ADDCNode->getOperand(0); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MADD only if ADDENode and ADDCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than ADDENode or ADDCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MADD instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(ADDENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - ADDCNode->getOperand(1), - ADDENode->getOperand(1)); - - // create MipsMAdd(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MAddu : MipsISD::MAdd; - - SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of adde and addc here - if (!SDValue(ADDCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); - } - if (!SDValue(ADDENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); - } - - return true; -} - -// selectMSUB - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc Lo0, multLo), (sube Hi0, multHi), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { - // SUBENode's second operand must be a flag output of an SUBC node in order - // for the matching to be successful. - SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); - - if (SUBCNode->getOpcode() != ISD::SUBC) - return false; - - SDValue MultHi = SUBENode->getOperand(1); - SDValue MultLo = SUBCNode->getOperand(1); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MSUB only if SUBENode and SUBCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than SUBENode or SUBCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MSUB instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(SUBENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - SUBCNode->getOperand(0), - SUBENode->getOperand(0)); - - // create MipsSub(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MSubu : MipsISD::MSub; - - SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of sube and subc here - if (!SDValue(SUBCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); - } - if (!SDValue(SUBENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); - } - - return true; -} - -static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && - selectMSUB(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { - case ISD::ADDE: - return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; - case ISD::SUBE: - return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: @@ -3596,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(RC); + unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) + .addImm(0) + .addReg(Rs) + .addImm(Mips::sub_32); + Rs = Tmp; + } BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::SH : Mips::SH64)) .addReg(Rs) .addReg(Rt) @@ -3649,6 +3481,12 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, for (unsigned i = 1; i < MI.getNumOperands(); i++) MIB.add(MI.getOperand(i)); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); + Rt = Tmp; + } + BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt); MI.eraseFromParent(); @@ -3716,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, assert(Subtarget.hasMSA() && Subtarget.hasMips32r2()); bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64; + bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -3726,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1; + unsigned MFC1Opc = IsFGR64onMips64 + ? Mips::DMFC1 + : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1); unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. @@ -3735,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; - if (!Subtarget.hasMips64() && IsFGR64) { + if (IsFGR64onMips32) { unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); @@ -3829,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1; + unsigned MTC1Opc = IsFGR64onMips64 + ? Mips::DMTC1 + : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); unsigned COPYOpc = IsFGR64onMips64 ? 
Mips::COPY_S_D : Mips::COPY_S_W; unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 8ec55ab6284d..c2947bb44ef5 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -226,7 +226,6 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo -def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -256,7 +255,6 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo -def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -666,14 +664,12 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, - InstrItinData]>, InstrItinData]>, InstrItinData]>, InstrItinData]>, diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7619e7b08612..cce3b8c4c8d1 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Disable use of the `jal` instruction. + bool UseLongCalls = false; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -269,6 +272,8 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool useSoftFloat() const { return IsSoftFloat; } + bool useLongCalls() const { return UseLongCalls; } + bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index af24838665e1..7d9f99ce071e 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -119,9 +119,6 @@ class MipsTargetStreamer : public MCTargetStreamer { SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, - int16_t Imm1, int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a00b56af0490..92c8c224b71b 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -271,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3aaf7ef2c2a0..901539b682ba 
100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -178,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -211,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); + } // Select an address into a single register. @@ -305,6 +309,7 @@ namespace { bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; @@ -2999,6 +3004,25 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val? +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); + StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. + return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0e069ec1665f..b3a3c73f6df0 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2130,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail.
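The new isOffsetMultipleOf predicate above is what ultimately gates DQ-form selection: an access qualifies when its address is reg + imm with imm a signed 16-bit multiple of Val, and an address that is just a register counts as offset zero. A scalar model of that test (illustrative only; the real code walks SDNodes):

#include <cassert>
#include <cstdint>

// Scalar model of PPCDAGToDAGISel::isOffsetMultipleOf: an address of the
// form Base + Imm qualifies when Imm fits in a signed 16-bit immediate and
// is a multiple of Val; a plain register address counts as offset zero.
struct Address {
  bool HasImmOffset; // false models the ISD::CopyFromReg case
  int32_t Imm;
};

static bool isOffsetMultipleOf(const Address &A, unsigned Val) {
  if (!A.HasImmOffset)
    return true; // offset is zero, trivially a multiple of Val
  return A.Imm >= INT16_MIN && A.Imm <= INT16_MAX &&
         A.Imm % static_cast<int32_t>(Val) == 0;
}

int main() {
  assert(isOffsetMultipleOf({true, 48}, 16));
  assert(!isOffsetMultipleOf({true, 40}, 16));
  assert(isOffsetMultipleOf({false, 0}, 16)); // address straight from a register
  return 0;
}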
@@ -2145,7 +2145,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2169,7 +2169,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2196,7 +2196,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2206,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2321,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 821927d3b157..49d7d8220af1 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -616,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified address, force it to be /// represented as an indexed [r+r] operation. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 13b4f9ab962d..e74ba38c351f 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1635,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons.
-    for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
-         IE = MRI->use_instr_end(); I != IE; ++I) {
+    for (MachineRegisterInfo::use_instr_iterator
+         I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+         I != IE; ++I) {
       MachineInstr *UseMI = &*I;
       if (UseMI->getOpcode() == PPC::BCC) {
         unsigned Pred = UseMI->getOperand(0).getImm();
@@ -1658,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
     for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end();
          I != EL; ++I) {
       bool FoundUse = false;
-      for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
-           JE = MRI->use_instr_end(); J != JE; ++J)
+      for (MachineRegisterInfo::use_instr_iterator
+           J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end();
+           J != JE; ++J)
         if (&*J == &*I) {
           FoundUse = true;
           break;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 6d9f55206b6a..dd7fc2659102 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -405,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() < 4;
 }]>;
 
+// This is a somewhat weaker condition than actually checking for 16-byte
+// alignment. It is simply checking that the displacement can be represented
+// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form
+// instructions).
+def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isOffsetMultipleOf(N, 16);
+}]>;
+def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
+                               (store node:$val, node:$ptr), [{
+  return isOffsetMultipleOf(N, 16);
+}]>;
+def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return !isOffsetMultipleOf(N, 16);
+}]>;
+def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
+                                  (store node:$val, node:$ptr), [{
+  return !isOffsetMultipleOf(N, 16);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Flag Definitions.
 
@@ -815,7 +834,8 @@ def pred : Operand<OtherVT> {
 def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
 def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
 def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly", [], []>;
-def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>;  // "std"
+def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
 
 // The address in a single register. This is used with the SjLj
 // pseudo-instructions.
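The "multiple of 16" requirement these PatFrags test comes from the DQ-form encoding: the 16-bit displacement is stored without its four low-order bits, so only multiples of 16 are representable. A rough sketch of that rule (invented helper names, not the in-tree encoder; assumes arithmetic right shift for negative values):

#include <cassert>
#include <cstdint>

// A DQ-form displacement is a signed 16-bit value whose low four bits are
// implicitly zero; only the upper twelve bits (the DQ field) are encoded.
static bool isValidDQFormDisp(int64_t D) {
  return D >= INT16_MIN && D <= INT16_MAX && (D & 15) == 0;
}

static uint16_t encodeDQField(int64_t D) {
  assert(isValidDQFormDisp(D));
  return static_cast<uint16_t>((D >> 4) & 0xFFF); // 12-bit DQ field
}

int main() {
  assert(isValidDQFormDisp(16) && encodeDQField(16) == 1);
  assert(isValidDQFormDisp(-32768) && encodeDQField(-32768) == 0x800);
  assert(!isValidDQFormDisp(24)); // in range, but not a multiple of 16
}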
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43635a8919e2..942e8b392b82 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -2606,37 +2606,41 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; + def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : 
Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2788,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2881,13 +2885,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); @@ -2911,13 +2915,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3088,17 +3092,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS 
(COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 8af7f7e98117..9207165c46a6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -754,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -852,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -883,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1076,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 2dc3828334ac..be705507b534 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,8 @@ class PPCTargetMachine final : public LLVMTargetMachine { ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 11004c5a952f..91cab00b2b65 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -20,6 +20,10 @@ include "llvm/Target/Target.td" // SPARC Subtarget features. 
//
+def FeatureSoftMulDiv
+  : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true",
+                     "Use software emulation for integer multiply and divide">;
+
 def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true",
                                  "Enable SPARC-V9 instructions">;
@@ -75,7 +79,7 @@ class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, NoItineraries, Features>;
 
 def : Proc<"generic", []>;
-def : Proc<"v7", []>;
+def : Proc<"v7", [FeatureSoftMulDiv]>;
 def : Proc<"v8", []>;
 def : Proc<"supersparc", []>;
 def : Proc<"sparclite", []>;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 9e7e3c6b705a..6767a59a9757 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1689,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::MULHS, MVT::i32, Expand);
   setOperationAction(ISD::MUL, MVT::i32, Expand);
 
+  if (Subtarget->useSoftMulDiv()) {
+    // .umul works for both signed and unsigned
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setLibcallName(RTLIB::MUL_I32, ".umul");
+
+    setOperationAction(ISD::SDIV, MVT::i32, Expand);
+    setLibcallName(RTLIB::SDIV_I32, ".div");
+
+    setOperationAction(ISD::UDIV, MVT::i32, Expand);
+    setLibcallName(RTLIB::UDIV_I32, ".udiv");
+  }
+
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
     setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index ae45c8be6752..3194ad4aeb6b 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
 // True when generating 64-bit code. This also implies HasV9.
 def Is64Bit : Predicate<"Subtarget->is64Bit()">;
 
+def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">,
+             AssemblerPredicate<"FeatureSoftMulDiv">;
+
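// Why the single .umul routine above covers both signednesses: only the low
// 32 bits of a 32x32-bit product are kept, and those bits are identical for
// signed and unsigned interpretations of the operands. A quick standalone
// check (illustrative C++, not part of this patch):
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     int32_t a = -7, b = 9;
//     uint32_t ua = static_cast<uint32_t>(a), ub = static_cast<uint32_t>(b);
//     // low 32 bits of the signed and unsigned products agree
//     assert(static_cast<uint32_t>(int64_t(a) * b) ==
//            static_cast<uint32_t>(uint64_t(ua) * ub));
//   }
//
// Division has no such identity, which is why .div and .udiv stay separate.

 // HasV9 - This predicate is true when the target processor supports V9
 // instructions. Note that the machine may be running in 32-bit mode.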
def HasV9 : Predicate<"Subtarget->isV9()">, diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 43ddef3cc96e..daac56add87c 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + UseSoftMulDiv = false; IsV9 = false; IsLeon = false; V8DeprecatedInsts = false; diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index fa42da425ff2..d18139984b87 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -32,6 +32,7 @@ class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { Triple TargetTriple; virtual void anchor(); + bool UseSoftMulDiv; bool IsV9; bool IsLeon; bool V8DeprecatedInsts; @@ -76,6 +77,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo { bool enableMachineScheduler() const override; + bool useSoftMulDiv() const { return UseSoftMulDiv; } bool isV9() const { return IsV9; } bool isLeon() const { return IsLeon; } bool isVIS() const { return IsVIS; } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ee23692ad1db..33680789ee08 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -275,6 +275,10 @@ class SystemZOperand : public MCParsedAsmOperand { SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override; + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + // Used by the TableGen code to add particular types of operand // to an instruction. 
 void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -1164,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
   return false;
 }
 
+std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS);
+
 bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                                OperandVector &Operands,
                                                MCStreamer &Out,
@@ -1209,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return Error(ErrorLoc, "invalid operand for instruction");
   }
 
-  case Match_MnemonicFail:
-    return Error(IDLoc, "invalid instruction");
+  case Match_MnemonicFail: {
+    uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+    std::string Suggestion = SystemZMnemonicSpellCheck(
+      ((SystemZOperand &)*Operands[0]).getToken(), FBS);
+    return Error(IDLoc, "invalid instruction" + Suggestion,
+                 ((SystemZOperand &)*Operands[0]).getLocRange());
+  }
   }
 
   llvm_unreachable("Unexpected match type");
diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt
index 6f8431db7b11..9b8b141fd52a 100644
--- a/lib/Target/SystemZ/LLVMBuild.txt
+++ b/lib/Target/SystemZ/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
 type = Library
 name = SystemZCodeGen
 parent = SystemZ
-required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target
 add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index c5faa0d62881..fda9c30fe3fc 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -187,6 +187,58 @@ def Arch11NewFeatures : SystemZFeatureList<[
     FeatureVector
 ]>;
 
+//===----------------------------------------------------------------------===//
+//
+// New features added in the Twelfth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureMiscellaneousExtensions2 : SystemZFeature<
+  "miscellaneous-extensions-2", "MiscellaneousExtensions2",
+  "Assume that the miscellaneous-extensions facility 2 is installed"
+>;
+
+def FeatureGuardedStorage : SystemZFeature<
+  "guarded-storage", "GuardedStorage",
+  "Assume that the guarded-storage facility is installed"
+>;
+
+def FeatureMessageSecurityAssist7 : SystemZFeature<
+  "message-security-assist-extension7", "MessageSecurityAssist7",
+  "Assume that the message-security-assist extension facility 7 is installed"
+>;
+
+def FeatureMessageSecurityAssist8 : SystemZFeature<
+  "message-security-assist-extension8", "MessageSecurityAssist8",
+  "Assume that the message-security-assist extension facility 8 is installed"
+>;
+
+def FeatureVectorEnhancements1 : SystemZFeature<
+  "vector-enhancements-1", "VectorEnhancements1",
+  "Assume that the vector enhancements facility 1 is installed"
+>;
+def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">;
+
+def FeatureVectorPackedDecimal : SystemZFeature<
+  "vector-packed-decimal", "VectorPackedDecimal",
+  "Assume that the vector packed decimal facility is installed"
+>;
+
+def FeatureInsertReferenceBitsMultiple : SystemZFeature<
+  "insert-reference-bits-multiple", "InsertReferenceBitsMultiple",
+  "Assume that the insert-reference-bits-multiple facility is installed"
+>;
+
+def Arch12NewFeatures : SystemZFeatureList<[
+    FeatureMiscellaneousExtensions2,
+    FeatureGuardedStorage,
+    FeatureMessageSecurityAssist7,
+    FeatureMessageSecurityAssist8,
+    FeatureVectorEnhancements1,
+    FeatureVectorPackedDecimal,
+    FeatureInsertReferenceBitsMultiple
+]>;
+
 //===----------------------------------------------------------------------===//
 //
 // Cumulative supported and unsupported feature sets
 //
@@ -201,9 +253,13 @@ def Arch10SupportedFeatures
   : SystemZFeatureAdd<Arch9SupportedFeatures.List, Arch10NewFeatures.List>;
 def Arch11SupportedFeatures
   : SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>;
+def Arch12SupportedFeatures
+  : SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>;
 
-def Arch11UnsupportedFeatures
+def Arch12UnsupportedFeatures
   : SystemZFeatureList<[]>;
+def Arch11UnsupportedFeatures
+  : SystemZFeatureAdd<Arch12UnsupportedFeatures.List, Arch12NewFeatures.List>;
 def Arch10UnsupportedFeatures
   : SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>;
 def Arch9UnsupportedFeatures
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2801141cd951..2d916d2e1521 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -101,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
     addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
   }
-  addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+  if (Subtarget.hasVectorEnhancements1())
+    addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
+  else
+    addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
 
   if (Subtarget.hasVector()) {
     addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
@@ -316,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::AND, VT, Legal);
       setOperationAction(ISD::OR, VT, Legal);
       setOperationAction(ISD::XOR, VT, Legal);
-      setOperationAction(ISD::CTPOP, VT, Custom);
+      if (Subtarget.hasVectorEnhancements1())
+        setOperationAction(ISD::CTPOP, VT, Legal);
+      else
+        setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::CTTZ, VT, Legal);
       setOperationAction(ISD::CTLZ, VT, Legal);
 
@@ -414,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
   }
 
+  // The vector enhancements facility 1 has instructions for these.
+ if (Subtarget.hasVectorEnhancements1()) { + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FABS, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f128, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f128, Legal); + setOperationAction(ISD::FMINNUM, MVT::f128, Legal); + setOperationAction(ISD::FMINNAN, MVT::f128, Legal); + } + // We have fused multiply-addition for f32 and f64 but not f128. setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); - setOperationAction(ISD::FMA, MVT::f128, Expand); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FMA, MVT::f128, Legal); + else + setOperationAction(ISD::FMA, MVT::f128, Expand); + + // We don't have a copysign instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); // Needed so that we don't try to implement f128 constant loads using // a load-and-extend of a f80 constant (in cases where the constant @@ -425,6 +481,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + // We don't have extending load instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + } + // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); @@ -489,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f64: return true; case MVT::f128: - return false; + return Subtarget.hasVectorEnhancements1(); default: break; } @@ -1462,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { return true; case Intrinsic::s390_vfcedbs: + case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchdbs: + case Intrinsic::s390_vfchsbs: Opcode = SystemZISD::VFCMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchedbs: + case Intrinsic::s390_vfchesbs: Opcode = SystemZISD::VFCMPHES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vftcidb: + case Intrinsic::s390_vftcisb: Opcode = SystemZISD::VFTCI; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL, // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, // producing a result of type VT. -static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, - EVT VT, SDValue CmpOp0, SDValue CmpOp1) { - // There is no hardware support for v4f32, so extend the vector into - // two v2f64s and compare those. - if (CmpOp0.getValueType() == MVT::v4f32) { +SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, + SDValue CmpOp1) const { + // There is no hardware support for v4f32 (unless we have the vector + // enhancements facility 1), so extend the vector into two v2f64s + // and compare those. + if (CmpOp0.getValueType() == MVT::v4f32 && + !Subtarget.hasVectorEnhancements1()) { SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); @@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. -static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - ISD::CondCode CC, SDValue CmpOp0, - SDValue CmpOp1) { +SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, + const SDLoc &DL, EVT VT, + ISD::CondCode CC, + SDValue CmpOp0, + SDValue CmpOp1) const { bool IsFP = CmpOp0.getValueType().isFloatingPoint(); bool Invert = false; SDValue Cmp; @@ -2960,6 +3032,12 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, // We define this so that it can be used for constant division. lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + else if (Subtarget.hasMiscellaneousExtensions2()) + // SystemZISD::SMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. 
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // @@ -4658,6 +4736,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); + OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); OPCODE(SDIVREM); OPCODE(UDIVREM); @@ -6118,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: + case SystemZ::SelectVR128: return emitSelect(MI, MBB, 0); case SystemZ::CondStore8Mux: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 6c9c404816f0..abe8b7233e60 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -88,6 +88,7 @@ enum NodeType : unsigned { // Wrappers around the ISD opcodes of the same name. The output is GR128. // Input operands may be GR64 or GR32, depending on the instruction. + SMUL_LOHI, UMUL_LOHI, SDIVREM, UDIVREM, @@ -479,6 +480,12 @@ class SystemZTargetLowering : public TargetLowering { const SystemZSubtarget &Subtarget; // Implement LowerOperation for individual opcodes. + SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, SDValue CmpOp1) const; + SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, ISD::CondCode CC, + SDValue CmpOp0, SDValue CmpOp1) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 10172bd45203..02aeaadad0d9 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. -def SelectF32 : SelectWrapper; -def SelectF64 : SelectWrapper; -def SelectF128 : SelectWrapper; +def SelectF32 : SelectWrapper; +def SelectF64 : SelectWrapper; +let Predicates = [FeatureNoVectorEnhancements1] in + def SelectF128 : SelectWrapper; +let Predicates = [FeatureVectorEnhancements1] in + def SelectVR128 : SelectWrapper; defm CondStoreF32 : CondStores; @@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in { let Predicates = [FeatureVector] in { defm : CompareZeroFP; defm : CompareZeroFP; - defm : CompareZeroFP; } +let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in + defm : CompareZeroFP; // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; @@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in { } // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP32:$src1, FP128:$src2), - (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP64 result. 
let isCodeGenOnly = 1 in @@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP64:$src1, FP128:$src2), - (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP128 result. Use "upper" as the high half and leave // the low half as-is. @@ -101,12 +113,14 @@ class CopySign128 : Pat<(fcopysign FP128:$src1, cls:$src2), (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; -def : CopySign128; -def : CopySign128; -def : CopySign128; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128; + def : CopySign128; + def : CopySign128; +} defm LoadStoreF32 : MVCLoadStore; defm LoadStoreF64 : MVCLoadStore; @@ -166,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, Requires<[FeatureFPExtension]>; -def : Pat<(f32 (fpround FP128:$src)), - (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; -def : Pat<(f64 (fpround FP128:$src)), - (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; + def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +} // Extend register floating-point values to wider representations. -def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>; +def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; +def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; +def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; +} // Extend memory floating-point values to wider representations. def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>; +def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; +def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + (LXEB bdxaddr12only:$src)>; + def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + (LXDB bdxaddr12only:$src)>; +} // Convert a signed integer register value to a floating-point one. def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; @@ -426,16 +452,18 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)), // f128 multiplication of two FP64 registers. 
 def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
-def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
-          (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
-                                FP64:$src1, subreg_h64), FP64:$src2)>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+  def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+            (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+                                  FP64:$src1, subreg_h64), FP64:$src2)>;
 
 // f128 multiplication of an FP64 register and an f64 memory.
 def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
-def : Pat<(fmul (f128 (fpextend FP64:$src1)),
-                (f128 (extloadf64 bdxaddr12only:$addr))),
-          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
-                bdxaddr12only:$addr)>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+  def : Pat<(fmul (f128 (fpextend FP64:$src1)),
+                  (f128 (extloadf64 bdxaddr12only:$addr))),
+            (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+                  bdxaddr12only:$addr)>;
 
 // Fused multiply-add.
 def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 7620e06ccbc9..033a0a879d37 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -1091,6 +1091,94 @@ class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{7-0} = op{7-0};
 }
 
+class InstVRIf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<5> V3;
+  bits<8> I4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-28} = V3{3-0};
+  let Inst{27-24} = 0;
+  let Inst{23-20} = M5;
+  let Inst{19-12} = I4;
+  let Inst{11} = V1{4};
+  let Inst{10} = V2{4};
+  let Inst{9} = V3{4};
+  let Inst{8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
+class InstVRIg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<8> I3;
+  bits<8> I4;
+  bits<4> M5;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-24} = I4;
+  let Inst{23-20} = M5;
+  let Inst{19-12} = I3;
+  let Inst{11} = V1{4};
+  let Inst{10} = V2{4};
+  let Inst{9-8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
+class InstVRIh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> I2;
+  bits<4> I3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = 0;
+  let Inst{31-16} = I2;
+  let Inst{15-12} = I3;
+  let Inst{11} = V1{4};
+  let Inst{10-8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
+class InstVRIi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<4> R2;
+  bits<8> I3;
+  bits<4> M4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = V1{3-0};
+  let Inst{35-32} = R2;
+  let Inst{31-24} = 0;
+  let Inst{23-20} = M4;
+  let Inst{19-12} = I3;
+  let Inst{11} = V1{4};
+  let Inst{10-8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
 // Depending on the instruction mnemonic, certain bits may be or-ed into
 // the M4 value provided as explicit operand. These are passed as m4or.
 class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
@@ -1259,6 +1347,67 @@ class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{7-0} = op{7-0};
 }
 
+class InstVRRg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = 0;
+  let Inst{35-32} = V1{3-0};
+  let Inst{31-12} = 0;
+  let Inst{11} = 0;
+  let Inst{10} = V1{4};
+  let Inst{9-8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
+class InstVRRh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<5> V2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = 0;
+  let Inst{35-32} = V1{3-0};
+  let Inst{31-28} = V2{3-0};
+  let Inst{27-24} = 0;
+  let Inst{23-20} = M3;
+  let Inst{19-12} = 0;
+  let Inst{11} = 0;
+  let Inst{10} = V1{4};
+  let Inst{9} = V2{4};
+  let Inst{8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
+class InstVRRi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<5> V2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = V2{3-0};
+  let Inst{31-24} = 0;
+  let Inst{23-20} = M3;
+  let Inst{19-12} = 0;
+  let Inst{11} = 0;
+  let Inst{10} = V2{4};
+  let Inst{9-8} = 0;
+  let Inst{7-0} = op{7-0};
+}
+
 class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
@@ -1321,6 +1470,25 @@ class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{7-0} = op{7-0};
 }
 
+class InstVRSd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> BD2;
+  bits<4> R3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = 0;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = BD2;
+  let Inst{15-12} = V1{3-0};
+  let Inst{11-9} = 0;
+  let Inst{8} = V1{4};
+  let Inst{7-0} = op{7-0};
+}
+
 class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
@@ -1358,6 +1526,24 @@ class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{7-0} = op{7-0};
 }
 
+class InstVSI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<5> V1;
+  bits<16> BD2;
+  bits<8> I3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-32} = I3;
+  let Inst{31-16} = BD2;
+  let Inst{15-12} = V1{3-0};
+  let Inst{11-9} = 0;
+  let Inst{8} = V1{4};
+  let Inst{7-0} = op{7-0};
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction classes for .insn directives
 //===----------------------------------------------------------------------===//
@@ -1910,6 +2096,25 @@ class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode>
   let M1 = V.ccmask;
 }
 
+class CondBranchRXY<string mnemonic, bits<16> opcode>
+  : InstRXYb<opcode, (outs), (ins cond4:$valid, cond4:$R1, bdxaddr20only:$XBD2),
+             !subst("#", "${R1}", mnemonic)#"\t$XBD2", []> {
+  let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRXY<string mnemonic, bits<16> opcode>
+  : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
+             mnemonic#"\t$M1, $XBD2", []>;
+
+class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode,
+                         SDPatternOperator operator = null_frag>
+  : InstRXYb<opcode, (outs), (ins bdxaddr20only:$XBD2),
+             !subst("#", V.suffix, mnemonic)#"\t$XBD2",
+             [(operator (load bdxaddr20only:$XBD2))]> {
+  let isAsmParserOnly = V.alternate;
+  let M1 = V.ccmask;
+}
+
 class CmpBranchRIEa<string mnemonic, bits<16> opcode,
                     RegisterOperand cls, Immediate imm>
   : InstRIEa<opcode, let AccessBytes = bytes;
} +class StoreLengthVRSd opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVRSd { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreLengthVSI opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVSI { + let mayStore = 1; + let AccessBytes = bytes; +} + class StoreMultipleRS opcode, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa opcode, : InstRXa; +class SideEffectBinaryRXY opcode, + RegisterOperand cls> + : InstRXYa; + class SideEffectBinaryRILPC opcode, RegisterOperand cls> : InstRILb opcode> (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>; +class BinaryVRIh opcode> + : InstVRIh; + class BinaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0> : InstVRRa opcode, SDPatternOperator operator, mnemonic#"\t$V1, $R2, $R3", [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>; +class BinaryVRRi opcode, RegisterOperand cls> + : InstVRRi; + class BinaryVRSa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> : InstVRSa opcode> (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4), mnemonic#"\t$R1, $V3, $BD2, $M4", []>; +class BinaryVRSd opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVRSd { + let mayLoad = 1; + let AccessBytes = bytes; +} + class BinaryVRX opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes> : InstVRX opcode, RegisterOperand cls> let mayStore = 1; } +class BinaryVSI opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVSI { + let mayLoad = 1; + let AccessBytes = bytes; +} + class StoreBinaryVRV opcode, bits<5> bytes, Immediate index> : InstVRV opcode> let M5 = 0; } +class CompareVRRh opcode> + : InstVRRh { + let isCompare = 1; +} + class TestRXE opcode, SDPatternOperator operator, RegisterOperand cls> : InstRXE opcode> let mayLoad = 1; } +class TestVRRg opcode> + : InstVRRg; + class SideEffectTernarySSc opcode> : InstSSc opcode, SDPatternOperator operator, let M5 = type; } +class TernaryVRIi opcode, RegisterOperand cls> + : InstVRIi; + class TernaryVRRa opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcFloat opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0, bits<4> m5 = 0> + : InstVRRc { + let M4 = type; + let M5 = m5; +} + +class TernaryVRRcFloatGeneric opcode> + : InstVRRc; + class TernaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0> : InstVRRd opcode> let DisableEncoding = "$V1src"; } +class QuaternaryVRIf opcode> + : InstVRIf; + +class QuaternaryVRIg opcode> + : InstVRIg; + class QuaternaryVRRd opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, - bits<4> type, SDPatternOperator m6mask, bits<4> m6or> + TypedReg tr3, TypedReg tr4, bits<4> type, + SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> : InstVRRd { let M5 = type; } +class QuaternaryVRRdGeneric opcode> + : InstVRRd; + // Declare a pair of instructions, one which sets CC and one which doesn't. // The CC-setting form ends with "S" and sets the low bit of M6. // Also create aliases to make use of M6 operand optional in assembler. 
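The InstVRI*/InstVRS*/InstVSI classes added above describe, bit range by bit range, how operand fields are spliced into a 48-bit instruction word. The same packing can be sketched in plain C++ for the VRI-h layout shown earlier (illustrative only; the function and the opcode value are invented):

#include <cassert>
#include <cstdint>

// Pack a VRI-h style word. Bit numbering matches the TableGen classes:
// bit 47 is the leftmost bit of the 6-byte instruction.
static uint64_t packVRIh(uint16_t op, uint8_t v1, uint16_t i2, uint8_t i3) {
  assert(v1 < 32 && i3 < 16);
  uint64_t Inst = 0;
  Inst |= uint64_t(op >> 8) << 40;       // Inst{47-40} = op{15-8}
  Inst |= uint64_t(v1 & 0xF) << 36;      // Inst{39-36} = V1{3-0}
  Inst |= uint64_t(i2) << 16;            // Inst{31-16} = I2
  Inst |= uint64_t(i3 & 0xF) << 12;      // Inst{15-12} = I3
  Inst |= uint64_t((v1 >> 4) & 1) << 11; // Inst{11} = V1{4}, the register's
                                         // high bit (RXB-style extension)
  Inst |= uint64_t(op & 0xFF);           // Inst{7-0} = op{7-0}
  return Inst;                           // low 48 bits hold the encoding
}

int main() {
  uint64_t W = packVRIh(0xE64D /* arbitrary opcode for illustration */,
                        17, 0xBEEF, 3);
  assert((W >> 40) == 0xE6 && (W & 0xFF) == 0x4D);
  assert(((W >> 11) & 1) == 1); // V1 = 17 sets the high register bit
}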
@@ -4041,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair opcode, SDPatternOperator operator_cc, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { - def "" : QuaternaryVRRd; def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in - def S : QuaternaryVRRd; def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, @@ -4055,10 +4364,7 @@ multiclass QuaternaryOptVRRdSPair opcode, } multiclass QuaternaryOptVRRdSPairGeneric opcode> { - def "" : InstVRRd; + def "" : QuaternaryVRRdGeneric; def : InstAlias(NAME) VR128:$V1, VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, 0)>; @@ -4366,10 +4672,10 @@ class RotateSelectRIEfPseudo // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is // the value of the PSW's 2-bit condition code field. -class SelectWrapper +class SelectWrapper : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), - [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, + [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 66a5ff12be46..4533f4fdf21a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -869,6 +869,37 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Move 128-bit floating-point values between VR128 and FP128. + if (SystemZ::VR128BitRegClass.contains(DestReg) && + SystemZ::FP128BitRegClass.contains(SrcReg)) { + unsigned SrcRegHi = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned SrcRegLo = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg) + .addReg(SrcRegHi, getKillRegState(KillSrc)) + .addReg(SrcRegLo, getKillRegState(KillSrc)); + return; + } + if (SystemZ::FP128BitRegClass.contains(DestReg) && + SystemZ::VR128BitRegClass.contains(SrcReg)) { + unsigned DestRegHi = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned DestRegLo = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + if (DestRegHi != SrcReg) + copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false); + BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1); + return; + } + // Everything else needs only one instruction. 
 unsigned Opcode;
 if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
@@ -1434,6 +1465,7 @@ SystemZII::Branch
 SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   case SystemZ::BR:
+  case SystemZ::BI:
   case SystemZ::J:
   case SystemZ::JG:
     return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 4569be7602e4..f64c0d15ef83 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
   let isIndirectBranch = 1 in {
     def BC : CondBranchRX<"b#", 0x47>;
     def BCR : CondBranchRR<"b#r", 0x07>;
+    def BIC : CondBranchRXY<"bi#", 0xe347>,
+              Requires<[FeatureMiscellaneousExtensions2]>;
   }
 }
 
@@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
   let isIndirectBranch = 1 in {
     def BCAsm : AsmCondBranchRX<"bc", 0x47>;
     def BCRAsm : AsmCondBranchRR<"bcr", 0x07>;
+    def BICAsm : AsmCondBranchRXY<"bic", 0xe347>,
+                 Requires<[FeatureMiscellaneousExtensions2]>;
   }
 
   // Define AsmParser extended mnemonics for each general condition-code mask
@@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
     let isIndirectBranch = 1 in {
       def BAsm#V : FixedCondBranchRX<CV<V>, "b#", 0x47>;
       def BRAsm#V : FixedCondBranchRR<CV<V>, "b#r", 0x07>;
+      def BIAsm#V : FixedCondBranchRXY<CV<V>, "bi#", 0xe347>,
+                    Requires<[FeatureMiscellaneousExtensions2]>;
     }
   }
 }
@@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
   let isIndirectBranch = 1 in {
     def B : FixedCondBranchRX<CondAlways, "b", 0x47>;
    def BR : FixedCondBranchRR<CondAlways, "br", 0x07>;
+    def BI : FixedCondBranchRXY<CondAlways, "bi", 0xe347, brind>,
+             Requires<[FeatureMiscellaneousExtensions2]>;
   }
 }
 
@@ -316,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
 // Select instructions
 //===----------------------------------------------------------------------===//
 
-def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
-def Select32 : SelectWrapper<GR32>;
-def Select64 : SelectWrapper<GR64>;
+def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>;
+def Select32 : SelectWrapper<i32, GR32>;
+def Select64 : SelectWrapper<i64, GR64>;
 
 // We don't define 32-bit Mux stores if we don't have STOCFH, because the
 // low-only STOC should then always be used if possible.
@@ -921,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   // Addition of memory.
   defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
   defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
+  def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>,
+            Requires<[FeatureMiscellaneousExtensions2]>;
   def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
   def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
 
@@ -1006,6 +1016,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   // Subtraction of memory.
   defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
   defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+  def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>,
+            Requires<[FeatureMiscellaneousExtensions2]>;
   def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
   def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
 }
@@ -1207,6 +1219,15 @@ defm : RMWIByte;
 
 // Multiplication
 //===----------------------------------------------------------------------===//
 
+// Multiplication of a register, setting the condition code.
We prefer these +// over MS(G)R if available, even though we cannot use the condition code, +// since they are three-operand instructions. +let Predicates = [FeatureMiscellaneousExtensions2], + Defs = [CC], isCommutable = 1 in { + def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>; + def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>; +} + // Multiplication of a register. let isCommutable = 1 in { def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>; @@ -1226,21 +1247,37 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>; // Multiplication of memory. defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>; defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>; +def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; +// Multiplication of memory, setting the condition code. +let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in { + def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>; + def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>; +} + // Multiplication of a register, producing two results. -def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, + Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), + (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. 
def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, + Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; @@ -1765,8 +1802,29 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { GR128, GR128, GR128>; def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; } + let Predicates = [FeatureMessageSecurityAssist5] in - def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in + def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist8] in + def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, + GR128, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureGuardedStorage] in { + def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; + def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; + + let mayLoad = 1 in + def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>; + let mayStore = 1 in + def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index a9803c2d83e9..0112ebf1eb10 100644 --- a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -126,6 +126,10 @@ let hasSideEffects = 1, Defs = [CC] in let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; +// Insert reference bits multiple. +let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in + def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>; + // Perform frame management function. let hasSideEffects = 1 in def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 0158fe6aec08..c9a02d9c8082 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,7 +14,7 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; - def VLR32 : UnaryAliasVRR; + def VLR32 : UnaryAliasVRR; def VLR64 : UnaryAliasVRR; // Load GR from VR element. @@ -141,7 +141,7 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX; + def VL32 : UnaryAliasVRX; def VL64 : UnaryAliasVRX; // Load logical element and zero. 
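Returning to the multiplication changes above: MGRK and MG deliver both halves of a full 64x64-bit product into an even/odd register pair, with the high half in the even register and the low half in the odd one, which is why the lowering code passes the SMUL_LOHI results in reverse order. What the two halves contain can be sketched with the __int128 GCC/Clang extension (illustrative only, unrelated to the register-pair encoding):

#include <cassert>
#include <cstdint>

// Compute the high and low 64-bit halves of a signed 64x64-bit product.
static void smulLoHi(int64_t A, int64_t B, int64_t &Hi, uint64_t &Lo) {
  __int128 P = static_cast<__int128>(A) * B;
  Lo = static_cast<uint64_t>(P);      // low 64 bits
  Hi = static_cast<int64_t>(P >> 64); // high 64 bits (arithmetic shift)
}

int main() {
  int64_t Hi;
  uint64_t Lo;
  smulLoHi(-3, 5, Hi, Lo);
  assert(Lo == static_cast<uint64_t>(-15)); // low half of -15
  assert(Hi == -1);                         // sign extends into the high half
}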
 // Load logical element and zero.
@@ -154,6 +154,11 @@ let Predicates = [FeatureVector] in {
 (VLLEZF bdxaddr12only:$addr)>;
 def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
 (VLLEZG bdxaddr12only:$addr)>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>;
+ def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)),
+ (VLLEZLF bdxaddr12only:$addr)>;
+ }

 // Load element.
 def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>;
@@ -170,6 +175,13 @@ let Predicates = [FeatureVector] in {
 def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>;
 }
+let Predicates = [FeatureVectorPackedDecimal] in {
+ // Load rightmost with length. The number of loaded bytes is only known
+ // at run time.
+ def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>;
+ def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>;
+}
+
 // Use replicating loads if we're inserting a single element into an
 // undefined vector. This avoids a false dependency on the previous
 // register contents.
@@ -219,7 +231,7 @@ let Predicates = [FeatureVector] in {
 // STEY and STDY offer full 20-bit displacement fields. It's often better
 // to use those instructions rather than force a 20-bit displacement
 // into a GPR temporary.
- def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+ def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>;
 def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;

 // Scatter element.
@@ -227,6 +239,13 @@ let Predicates = [FeatureVector] in {
 def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
 }
+let Predicates = [FeatureVectorPackedDecimal] in {
+ // Store rightmost with length. The number of stored bytes is only known
+ // at run time.
+ def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>;
+ def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>;
+}
+
 //===----------------------------------------------------------------------===//
 // Selects and permutes
 //===----------------------------------------------------------------------===//
@@ -256,6 +275,10 @@ let Predicates = [FeatureVector] in {
 // Permute doubleword immediate.
 def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>;
+ // Bit Permute.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>;
+
 // Replicate.
 def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>;
 def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>;
@@ -424,6 +447,10 @@ let Predicates = [FeatureVector] in {
 def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>;
 def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>;
+ // Not exclusive or.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>;
+
 // Exclusive or.
 def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
@@ -567,6 +594,17 @@ let Predicates = [FeatureVector] in {
 def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
 def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+ // Multiply sum logical.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>;
+ def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg,
+ v128q, v128g, v128g, v128q, 3>;
+ }
+
+ // Nand.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>;
+
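The VNX (not-exclusive-or) and VNN (nand) instructions defined above receive their selection patterns in the BitwiseVectorOps hunk just below; instantiated for v16i8, the not-xor case reads roughly:

// Sketch: the new multiclass pattern below, with type = v16i8.
def : Pat<(v16i8 (z_vnot (xor VR128:$x, VR128:$y))),
          (VNX VR128:$x, VR128:$y)>;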
 // Nor.
 def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
 def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>;
@@ -574,9 +612,19 @@ let Predicates = [FeatureVector] in {
 // Or.
 def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
+ // Or with complement.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>;
+
 // Population count.
 def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>;
 def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>;
+ def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>;
+ def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>;
+ def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>;
+ }

 // Element rotate left logical (with vector shift amount).
 def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>;
@@ -724,6 +772,14 @@ multiclass BitwiseVectorOps<ValueType type> {
 (VNO VR128:$x, VR128:$y)>;
 def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>;
 }
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))),
+ (VNX VR128:$x, VR128:$y)>;
+ def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))),
+ (VNN VR128:$x, VR128:$y)>;
+ def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))),
+ (VOC VR128:$x, VR128:$y)>;
+ }
 }

 defm : BitwiseVectorOps<v16i8>;
@@ -879,6 +935,11 @@ let Predicates = [FeatureVector] in {
 def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
 def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
 def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>;
+ def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>;
+ def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>;
+ }

 // Convert from fixed 64-bit.
 def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
@@ -910,6 +971,11 @@ let Predicates = [FeatureVector] in {
 def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
 def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
 def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>;
+ def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>;
+ def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>;
+ }

 // Load FP integer.
 def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
@@ -917,66 +983,213 @@ let Predicates = [FeatureVector] in {
 def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
 defm : VectorRounding<VFIDB, v128db>;
 defm : VectorRounding<WFIDB, v64db>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>;
+ def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>;
+ def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>;
+ defm : VectorRounding<VFISB, v128sb>;
+ defm : VectorRounding<WFISB, v32sb>;
+ defm : VectorRounding<WFIXB, v128xb>;
+ }
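For reference, the VectorRounding multiclass instantiated above is defined earlier in this file; as of this revision it maps the generic rounding nodes onto the suppress/mode operands of the VFI-family instructions approximately as follows:

multiclass VectorRounding<Instruction insn, TypedReg tr> {
  def : FPConversion<insn, frint,      tr, tr, 0, 0>;
  def : FPConversion<insn, fnearbyint, tr, tr, 4, 0>;
  def : FPConversion<insn, ffloor,     tr, tr, 4, 7>;
  def : FPConversion<insn, fceil,      tr, tr, 4, 6>;
  def : FPConversion<insn, ftrunc,     tr, tr, 4, 5>;
  def : FPConversion<insn, fround,     tr, tr, 4, 1>;
}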
 // Load lengthened.
 def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
- def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>;
+ def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ let isAsmParserOnly = 1 in {
+ def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>;
+ def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>;
+ def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>;
+ }
+ def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>;
+ def : Pat<(f128 (fpextend (f32 VR32:$src))),
+ (WFLLD (WLDEB VR32:$src))>;
+ }

- // Load rounded,
+ // Load rounded.
 def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
- def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
- def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+ def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
 def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
- def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>;
+ def : FPConversion<WLEDB, fpround, v32sb, v64db, 0, 0>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ let isAsmParserOnly = 1 in {
+ def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>;
+ def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ }
+ def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>;
+ def : FPConversion<WFLRX, fpround, v64db, v128xb, 0, 0>;
+ def : Pat<(f32 (fpround (f128 VR128:$src))),
+ (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>;
+ }
+
+ // Maximum.
+ multiclass VectorMax<Instruction insn, TypedReg tr> {
+ def : FPMinMax<insn, fmaxnum, tr, 4>;
+ def : FPMinMax<insn, fmaxnan, tr, 1>;
+ }
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
+ def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb,
+ v128db, v128db, 3, 0>;
+ def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag,
+ v128xb, v128xb, 4, 8>;
+ defm : VectorMax<VFMAXDB, v128db>;
+ defm : VectorMax<WFMAXDB, v64db>;
+ defm : VectorMax<VFMAXSB, v128sb>;
+ defm : VectorMax<WFMAXSB, v32sb>;
+ defm : VectorMax<WFMAXXB, v128xb>;
+ }
+
+ // Minimum.
+ multiclass VectorMin<Instruction insn, TypedReg tr> {
+ def : FPMinMax<insn, fminnum, tr, 4>;
+ def : FPMinMax<insn, fminnan, tr, 1>;
+ }
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
+ def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb,
+ v128db, v128db, 3, 0>;
+ def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag,
+ v128xb, v128xb, 4, 8>;
+ defm : VectorMin<VFMINDB, v128db>;
+ defm : VectorMin<WFMINDB, v64db>;
+ defm : VectorMin<VFMINSB, v128sb>;
+ defm : VectorMin<WFMINSB, v32sb>;
+ defm : VectorMin<WFMINXB, v128xb>;
+ }

 // Multiply.
def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + } // Multiply and add. def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + } // Multiply and subtract. def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and add. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and subtract. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + } // Perform sign operation. def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>; def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>; def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>; + def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>; + def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>; + } // Load complement. def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>; + def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>; + def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>; + } // Load negative. 
def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>; + def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>; + def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>; + } // Load positive. def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>; + def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>; + def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>; + } // Square root. def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + } // Subtract. def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + } // Test data class immediate. let Defs = [CC] in { def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>; def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>; + def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>; + def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>; + } } } @@ -989,12 +1202,20 @@ let Predicates = [FeatureVector] in { let Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>; + def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>; + } } // Compare and signal scalar. let Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>; + def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>; + } } // Compare equal. 
@@ -1003,6 +1224,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high. def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; @@ -1010,6 +1253,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high or equal. def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; @@ -1017,6 +1282,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high or equal. 
+ let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } } //===----------------------------------------------------------------------===// @@ -1028,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 
(bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; //===----------------------------------------------------------------------===// // Replicating scalars @@ -1133,6 +1433,20 @@ let AddedComplexity = 4 in { (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>; } +//===----------------------------------------------------------------------===// +// Support for 128-bit floating-point values in vector registers +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(f128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + + def : Pat<(f128 fpimm0), (VZERO)>; + def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>; +} + //===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// @@ -1202,3 +1516,37 @@ let Predicates = [FeatureVector] in { defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf, z_vstrcz_cc, v128f, v128f, 2, 2>; } + +//===----------------------------------------------------------------------===// +// Packed-decimal instructions +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorPackedDecimal] in { + def VLIP : BinaryVRIh<"vlip", 0xE649>; + + def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>; + def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; + + let Defs = [CC] in { + def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; + def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; + def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>; + + def VAP : QuaternaryVRIf<"vap", 0xE671>; + def VSP : QuaternaryVRIf<"vsp", 0xE673>; + + def VMP : QuaternaryVRIf<"vmp", 0xE678>; + def VMSP : QuaternaryVRIf<"vmsp", 0xE679>; + + def VDP : QuaternaryVRIf<"vdp", 0xE67A>; + def VRP : QuaternaryVRIf<"vrp", 0xE67B>; + def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>; + + def VSRP : QuaternaryVRIg<"vsrp", 0xE659>; + def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>; + + def VTP : TestVRRg<"vtp", 0xE65F>; + def VCP : CompareVRRh<"vcp", 0xE677>; + } +} diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 9c6d5819f8a7..759a8bb0ce14 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -181,6 +181,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; +def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; @@ -549,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), (fma node:$src2, node:$src3, (fneg node:$src1))>; +// Negative fused multiply-add and multiply-subtract. 
+def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (fma node:$src1, node:$src2, node:$src3))>;
+def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (fms node:$src1, node:$src2, node:$src3))>;
+
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
@@ -624,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr),
 (scalar_to_vector (f64 (load node:$addr))),
 (z_vzero))>;
+// Similarly for the high element of a zeroed vector.
+def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezlf32 : PatFrag<(ops node:$addr),
+ (bitconvert
+ (z_merge_high
+ (v2i64
+ (bitconvert
+ (z_merge_high
+ (v4f32 (scalar_to_vector
+ (f32 (load node:$addr)))),
+ (v4f32 (z_vzero))))),
+ (v2i64 (z_vzero))))>;
+
 // Store one element of a vector.
 class z_vste<ValueType scalartype, SDPatternOperator store>
 : PatFrag<(ops node:$vec, node:$addr, node:$index),
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index 16a7ed784d70..152521fb66a8 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -167,3 +167,10 @@ class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
 TypedReg tr2, bits<4> suppress, bits<4> mode>
 : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
 (insn tr2.op:$vec, suppress, mode)>;
+
+// Use INSN to perform minimum/maximum operation OPERATOR on type TR.
+// FUNCTION is the type of minimum/maximum function to perform.
+class FPMinMax<Instruction insn, SDPatternOperator operator, TypedReg tr,
+ bits<4> function>
+ : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))),
+ (insn tr.op:$vec1, tr.op:$vec2, function)>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 1cdc0949ff4a..0dca4582dc0d 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>;
 def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>;
 def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
+def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>;
+def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
+
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 36809ea81dc1..52ba1a584017 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128",
 // All vector registers.
 defm VR128 : SystemZRegClass<"VR128",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
- (add (sequence "V%u", 0, 7),
- (sequence "V%u", 16, 31),
- (sequence "V%u", 8, 15))>;
+ [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (add (sequence "V%u", 0, 7),
+ (sequence "V%u", 16, 31),
+ (sequence "V%u", 8, 15))>;
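To make the new FPMinMax helper concrete: instantiated through the VectorMax multiclass for VFMAXDB (v128db, i.e. v2f64 held in a VR128), it yields a pattern along these lines (sketch):

def : Pat<(v2f64 (fmaxnum (v2f64 VR128:$vec1), (v2f64 VR128:$vec2))),
          (VFMAXDB VR128:$vec1, VR128:$vec2, 4)>;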
 // Attaches a ValueType to a register operand, to make the instruction
 // definitions easier.
@@ -272,7 +272,8 @@ class TypedReg<ValueType vtin, RegisterOperand opin> {
 RegisterOperand op = opin;
 }
-def v32eb : TypedReg<f32, VR32>;
+def v32f : TypedReg<i32, VR32>;
+def v32sb : TypedReg<f32, VR32>;
 def v64g : TypedReg<i64, VR64>;
 def v64db : TypedReg<f64, VR64>;
@@ -280,8 +281,9 @@ def v128h : TypedReg<v8i16, VR128>;
 def v128f : TypedReg<v4i32, VR128>;
 def v128g : TypedReg<v2i64, VR128>;
 def v128q : TypedReg<v16i8, VR128>;
-def v128eb : TypedReg<v4f32, VR128>;
+def v128sb : TypedReg<v4f32, VR128>;
 def v128db : TypedReg<v2f64, VR128>;
+def v128xb : TypedReg<f128, VR128>;
 def v128any : TypedReg<untyped, VR128>;

 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index 1ce0168f95e9..8dba89f70a42 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -59,7 +59,7 @@ def FPU2 : SchedWrite;
 def DFU : SchedWrite;
 def DFU2 : SchedWrite;
-// Vector sub units (z13)
+// Vector sub units (z13 and later)
 def VecBF : SchedWrite;
 def VecBF2 : SchedWrite;
 def VecDF : SchedWrite;
 def VecDF2 : SchedWrite;
@@ -75,6 +75,7 @@ def VecXsPm : SchedWrite;
 def VBU : SchedWrite;
+include "SystemZScheduleZ14.td"
 include "SystemZScheduleZ13.td"
 include "SystemZScheduleZEC12.td"
 include "SystemZScheduleZ196.td"
diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td
new file mode 100644
index 000000000000..f11177af91a5
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -0,0 +1,1611 @@
+//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z14 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z14Model : SchedMachineModel {
+
+ let UnsupportedFeatures = Arch12UnsupportedFeatures.List;
+
+ let IssueWidth = 8;
+ let MicroOpBufferSize = 60; // Issue queues
+ let LoadLatency = 1; // Optimistic load latency.
+
+ let PostRAScheduler = 1;
+
+ // Extra cycles for a mispredicted branch.
+ let MispredictPenalty = 20;
+}
+
+let SchedModel = Z14Model in {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in Tablegen currently rejects
+// multiple includes of the same file.
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<BeginGroup, []> {
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+}
+def : WriteRes<EndGroup, []> {
+ let NumMicroOps = 0;
+ let EndGroup = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
+
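The LatN writes above carry latency only (zero micro-ops); they take effect when combined with actual execution units in an InstRW entry, as in this line from further down in the same file:

// One FXa op plus an LSU access, 5 cycles end to end.
def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;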
+// Execution units.
+def Z14_FXaUnit : ProcResource<2>;
+def Z14_FXbUnit : ProcResource<2>;
+def Z14_LSUnit : ProcResource<2>;
+def Z14_VecUnit : ProcResource<2>;
+def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z14_VBUnit : ProcResource<2>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; }
+def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; }
+def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; }
+def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; }
+def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; }
+def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]>
+ { let Latency = 30; }
+def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; }
+def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; }
+def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; }
+def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
+def
: InstRW<[FXb], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + +// Select pseudo +def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; + +// CondStore pseudos +def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; + +// Pseudo -> reg move +def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>; + +// Loads +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "L128$")>; + +def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[FXa], (instregex "LG(F|H)I$")>; +def : InstRW<[FXa], (instregex "LHI(Mux)?$")>; +def : InstRW<[FXa], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>; +def : InstRW<[FXa], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. 
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "L(B|H|G)R$")>; +def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>; +def : InstRW<[FXa], (instregex "LTGFR$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLH(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>; +def : InstRW<[LSU], (instregex "LLHRL$")>; +def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + +// Store multiple (estimated average of ceil(5/2) FXb ops) +def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, + GroupAlone], (instregex "STM(G|H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LRV(G)?R$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[FXa], (instregex "GOT$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LP(G)?R$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXa], (instregex "LN(R|GR)$")>; +def : InstRW<[FXa], (instregex "LC(R|GR)$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>; +def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXa], (instregex "IIHF(64)?$")>; +def : InstRW<[FXa], (instregex "IIHH(64)?$")>; +def : InstRW<[FXa], (instregex "IIHL(64)?$")>; +def : InstRW<[FXa], (instregex "IILF(64)?$")>; +def : InstRW<[FXa], (instregex "IILH(64)?$")>; +def : InstRW<[FXa], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>; +def : InstRW<[FXa], (instregex "AIH$")>; +def : InstRW<[FXa], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>; +def : InstRW<[FXa], (instregex "AGFI$")>; +def : InstRW<[FXa], (instregex "AGHI(K)?$")>; +def : InstRW<[FXa], (instregex "AGR(K)?$")>; +def : InstRW<[FXa], (instregex "AHI(K)?$")>; +def : InstRW<[FXa], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>; +def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXa], (instregex "ALGHSIK$")>; +def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; +def : InstRW<[FXa], (instregex "ALGR(K)?$")>; +def : InstRW<[FXa], (instregex "ALR(K)?$")>; +def : InstRW<[FXa], (instregex "AR(K)?$")>; +def : InstRW<[FXa], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXa], (instregex "ALSIH(N)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>; +def : InstRW<[FXa], (instregex "SGR(K)?$")>; +def : InstRW<[FXa], (instregex "SLFI$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[FXa], (instregex "SLGF(I|R)$")>; +def : InstRW<[FXa], (instregex "SLGR(K)?$")>; +def : InstRW<[FXa], 
(instregex "SLR(K)?$")>; +def : InstRW<[FXa], (instregex "SR(K)?$")>; +def : InstRW<[FXa], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>; +def : InstRW<[FXa], (instregex "NGR(K)?$")>; +def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>; +def : InstRW<[FXa], (instregex "NIHF(64)?$")>; +def : InstRW<[FXa], (instregex "NIHH(64)?$")>; +def : InstRW<[FXa], (instregex "NIHL(64)?$")>; +def : InstRW<[FXa], (instregex "NILF(64)?$")>; +def : InstRW<[FXa], (instregex "NILH(64)?$")>; +def : InstRW<[FXa], (instregex "NILL(64)?$")>; +def : InstRW<[FXa], (instregex "NR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>; +def : InstRW<[FXa], (instregex "OGR(K)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>; +def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[FXa], (instregex "OIHF(64)?$")>; +def : InstRW<[FXa], (instregex "OIHH(64)?$")>; +def : InstRW<[FXa], (instregex "OIHL(64)?$")>; +def : InstRW<[FXa], (instregex "OILF(64)?$")>; +def : InstRW<[FXa], (instregex "OILH(64)?$")>; +def : InstRW<[FXa], (instregex "OILL(64)?$")>; +def : InstRW<[FXa], (instregex "OR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>; +def : InstRW<[FXa], (instregex "XIFMux$")>; +def : InstRW<[FXa], (instregex "XGR(K)?$")>; +def : InstRW<[FXa], (instregex "XIHF(64)?$")>; +def : InstRW<[FXa], (instregex "XILF(64)?$")>; +def : InstRW<[FXa], (instregex "XR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>; +def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>; +def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGR$")>; +def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>; +def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXa, Lat4], (instregex "MGHI$")>; +def : InstRW<[FXa, Lat4], (instregex "MHI$")>; +def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>; +def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>; +def : 
InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>; +def : InstRW<[FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>; +def : InstRW<[FXa, Lat8, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[FXa, LSU, Lat9, GroupAlone], (instregex "MSC$")>; +def : InstRW<[FXa, LSU, Lat11, GroupAlone], (instregex "MSGC$")>; +def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>; + +//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; +def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>; +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; +def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; +def : InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>; +def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>; +def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[FXb], (instregex "CG(F|H)I$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>; +def : InstRW<[FXb], (instregex "C(G)?R$")>; +def : InstRW<[FXb], (instregex "CIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>; +def : InstRW<[FXb], (instregex "CLGF(I|R)$")>; +def : InstRW<[FXb], (instregex "CLGR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXb], (instregex "CLIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>; +def : InstRW<[FXb], (instregex "CLR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXb], (instregex "C(L)?HHR$")>; +def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>; +def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>; +def : 
InstRW<[FXb, Lat2], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>; +def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>; +def : InstRW<[FXb], (instregex "TMHH(64)?$")>; +def : InstRW<[FXb], (instregex "TMHL(64)?$")>; +def : InstRW<[FXb], (instregex "TMLH(64)?$")>; +def : InstRW<[FXb], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "PFD(RL)?$")>; +def : InstRW<[FXb, Lat2], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[FXb], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone], + (instregex "CDS(Y)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone], + (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; + +// Load pair disjoint +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>; +def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>; + 
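Because each InstRW keys on a regular expression over instruction names, a single entry can cover a whole family of instructions; the guarded-storage section below does exactly that for LGSC and STGSC:

// From the next section: one entry matches both LGSC and STGSC.
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>;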
+//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "LGG$")>; +def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone], + (instregex "CVBG$")>; +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers +//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + +// Insert Program Mask +def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[FXb], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. 
+def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone], + (instregex "TBEGIN(C|_nofloat)?$")>; + +// Transaction end +def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[FXa], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Find leftmost one +def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>; + +// Extend +def : InstRW<[FXa], (instregex "AEXT128$")>; +def : InstRW<[FXa], (instregex "ZEXT128$")>; + +// String instructions +def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>; + +// Execute +def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. 
+def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Select instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>; +def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>; + +// Load +def : InstRW<[VecXsPm], (instregex "LER$")>; +def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[FXb, Lat3], (instregex "LGDR$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Copy sign +def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>; +def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>; +def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>; + +// Load lengthened +def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>; +def : InstRW<[VecBF], (instregex "LDEBR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex 
"CF(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>; +def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>; +def : InstRW<[VecBF], (instregex "A(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>; +def : InstRW<[VecBF], (instregex "S(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>; + 
+//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[FXa, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>; +def : InstRW<[VecBF], (instregex "LEXR$")>; +def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSU], (instregex "LDE$")>; +def : InstRW<[FXb], (instregex "LDER$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>; + +// Convert from fixed +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>; +def : InstRW<[VecBF], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[VecBF], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIER$")>; +def : InstRW<[VecBF], (instregex "FIDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>; +def : InstRW<[VecBF], (instregex "C(E|D)R$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions 
+//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecDF], (instregex "LTDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[VecDF], (instregex "LDETR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[FXb, Lat30], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer +def : InstRW<[VecDF], (instregex "FIDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecDF], (instregex "ADTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>; 
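+// (A pattern visible above and throughout the DFP entries: the 128-bit
+// extended forms, e.g. AXTR versus ADTR, list doubled vector-unit writes
+// (VecDF2, VecDF2) and decode GroupAlone, while the 64-bit forms take a
+// single VecDF.)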
+ +// Subtraction +def : InstRW<[VecDF], (instregex "SDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[VecDF], (instregex "QADTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecDF], (instregex "(K|C)DTR$")>; +def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[VecDF], (instregex "CEDTR$")>; +def : InstRW<[VecDF], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "VLR(32|64)?$")>; +def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VZERO$")>; +def : InstRW<[VecXsPm], (instregex "VONE$")>; +def : InstRW<[VecXsPm], (instregex "VGBM$")>; +def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>; +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone], + (instregex "VSTM$")>; +def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPERM$")>; +def : InstRW<[VecXsPm], (instregex "VPDI$")>; +def : InstRW<[VecXsPm], (instregex "VBPERM$")>; +def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[VecXsPm], (instregex "VO(C)?$")>; +def : InstRW<[VecMul], (instregex "VCKSM$")>; +def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VX$")>; +def : InstRW<[VecMul], (instregex "VGFM?$")>; +def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>; +def : InstRW<[VecMul], 
(instregex "VMH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VML(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>; + +def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>; +def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>; + +def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[VecBF], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GD$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "VFL(L|R)$")>; +def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>; +def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>; +def : InstRW<[VecBF2], (instregex "WFLLD$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; +def : InstRW<[VecBF2], (instregex "VFISB$")>; +def : InstRW<[VecBF], (instregex "WFISB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], 
(instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFMSB$")>; +def : InstRW<[VecBF], (instregex "WFMSB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[VecFPd], (instregex "VFD$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>; +def : InstRW<[VecFPd], (instregex "WFDXB$")>; +def : InstRW<[VecFPd], (instregex "VFSQ$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>; +def : InstRW<[VecFPd], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: 
Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LEFR$")>; +def : InstRW<[FXb, Lat4], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecStr], (instregex "VFAE(B)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>; +def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecDF, VecDF, Lat10, GroupAlone], (instregex "VLIP$")>; +def : InstRW<[VecDFX, LSU, Lat12, GroupAlone], (instregex "VPKZ$")>; +def : InstRW<[VecDFX, FXb, LSU, Lat12, GroupAlone], (instregex "VUPKZ$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>; +def : InstRW<[VecDFX], (instregex "V(A|S)P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>; +def : InstRW<[VecDFX], (instregex "VPSOP$")>; +def : InstRW<[VecDFX], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, 
Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "IRBM$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index e3e1999d8ad8..4d986e8391cf 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -311,7 +311,7 @@ def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; 
-def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>; def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; @@ -337,7 +337,7 @@ def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -403,13 +403,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -436,7 +436,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -474,7 +475,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; def : InstRW<[FXU], (instregex "C(L)?HHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -499,7 +500,7 @@ def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch @@ -532,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -548,36 +549,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex 
"TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -621,7 +630,7 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : 
InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -632,14 +641,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -780,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -791,7 +800,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -813,9 +822,9 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -900,16 +909,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex 
"M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -949,16 +962,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -967,7 +985,7 @@ def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>; def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -979,7 +997,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1010,15 +1028,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex 
"RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1027,15 +1045,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1046,19 +1064,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1103,16 +1122,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions 
//===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1124,7 +1144,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1161,9 +1181,9 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; -def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; @@ -1176,7 +1196,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 59f37205f412..a0f2115eb9d7 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -69,7 +69,7 @@ def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -251,7 +251,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], @@ -413,13 +413,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], 
(instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -446,7 +446,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -544,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -560,36 +561,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -659,7 +668,7 @@ def : InstRW<[FXU], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -670,14 +679,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -818,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -829,7 +838,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -851,10 +860,10 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex 
"LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; -def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>; // --------------------- Hexadecimal floating point ------------------------- // @@ -938,16 +947,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -987,16 +1000,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : 
InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -1007,11 +1025,11 @@ def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$") // Convert from / to zoned def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>; def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>; -def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; +def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1023,7 +1041,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1054,15 +1072,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1071,15 +1089,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1090,19 +1108,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : 
InstRW<[FXU, Lat3], (instregex "IPK$")>;
-def : InstRW<[LSU], (instregex "SPKA$")>;
-def : InstRW<[LSU], (instregex "SSM$")>;
-def : InstRW<[FXU], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
-def : InstRW<[LSU], (instregex "SAC(F)?$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
//===----------------------------------------------------------------------===//
// System: Control Register Instructions
//===----------------------------------------------------------------------===//
def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
-def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+             (instregex "STCT(L|G)$")>;
def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
@@ -1148,16 +1167,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
//===----------------------------------------------------------------------===//
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
-def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>;
+def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
-def : InstRW<[LSU, Lat30], (instregex "MVPG$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
//===----------------------------------------------------------------------===//
// System: Address-Space Instructions
//===----------------------------------------------------------------------===//
def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
-def : InstRW<[LSU], (instregex "PALB$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
def : InstRW<[FXU, Lat30], (instregex "PR$")>;
def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
@@ -1169,7 +1189,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
// System: Linkage-Stack Instructions
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>;
+def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
@@ -1206,7 +1226,7 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
//===----------------------------------------------------------------------===//
def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-def : InstRW<[FXU], (instregex "MC$")>;
+def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
@@ -1221,7 +1241,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
def : InstRW<[FXU], (instregex "LPP$")>;
def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
-def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>;
+def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
def : InstRW<[FXU, Lat30], (instregex
"S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 7391df8342ef..13ceb371a425 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; + case SystemZ::WFASB: + Changed |= shortenOn001AddCC(MI, SystemZ::AEBR); + break; + case SystemZ::WFDDB: Changed |= shortenOn001(MI, SystemZ::DDBR); break; + case SystemZ::WFDSB: + Changed |= shortenOn001(MI, SystemZ::DEBR); + break; + case SystemZ::WFIDB: Changed |= shortenFPConv(MI, SystemZ::FIDBRA); break; + case SystemZ::WFISB: + Changed |= shortenFPConv(MI, SystemZ::FIEBRA); + break; + case SystemZ::WLDEB: Changed |= shortenOn01(MI, SystemZ::LDEBR); break; @@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MDBR); break; + case SystemZ::WFMSB: + Changed |= shortenOn001(MI, SystemZ::MEEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; + case SystemZ::WFLCSB: + Changed |= shortenOn01(MI, SystemZ::LCDFR_32); + break; + case SystemZ::WFLNDB: Changed |= shortenOn01(MI, SystemZ::LNDFR); break; + case SystemZ::WFLNSB: + Changed |= shortenOn01(MI, SystemZ::LNDFR_32); + break; + case SystemZ::WFLPDB: Changed |= shortenOn01(MI, SystemZ::LPDFR); break; + case SystemZ::WFLPSB: + Changed |= shortenOn01(MI, SystemZ::LPDFR_32); + break; + case SystemZ::WFSQDB: Changed |= shortenOn01(MI, SystemZ::SQDBR); break; + case SystemZ::WFSQSB: + Changed |= shortenOn01(MI, SystemZ::SQEBR); + break; + case SystemZ::WFSDB: Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; + case SystemZ::WFSSB: + Changed |= shortenOn001AddCC(MI, SystemZ::SEBR); + break; + case SystemZ::WFCDB: Changed |= shortenOn01(MI, SystemZ::CDBR); break; + case SystemZ::WFCSB: + Changed |= shortenOn01(MI, SystemZ::CEBR); + break; + case SystemZ::VL32: // For z13 we prefer LDE over LE to avoid partial register dependencies. 
Changed |= shortenOn0(MI, SystemZ::LDE32); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb4a0962f7eb..9cd09b0f911e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -47,6 +47,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), + HasMiscellaneousExtensions2(false), HasGuardedStorage(false), + HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), + HasVectorEnhancements1(false), HasVectorPackedDecimal(false), + HasInsertReferenceBitsMultiple(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index b05a1bb6cafd..4829f73e080e 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,6 +56,13 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo { bool HasLoadAndZeroRightmostByte; bool HasMessageSecurityAssist5; bool HasDFPPackedConversion; + bool HasMiscellaneousExtensions2; + bool HasGuardedStorage; + bool HasMessageSecurityAssist7; + bool HasMessageSecurityAssist8; + bool HasVectorEnhancements1; + bool HasVectorPackedDecimal; + bool HasInsertReferenceBitsMultiple; private: Triple TargetTriple; @@ -168,6 +175,33 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo { // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } + // Return true if the target has the miscellaneous-extensions facility 2. + bool hasMiscellaneousExtensions2() const { + return HasMiscellaneousExtensions2; + } + + // Return true if the target has the guarded-storage facility. + bool hasGuardedStorage() const { return HasGuardedStorage; } + + // Return true if the target has the message-security-assist + // extension facility 7. + bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; } + + // Return true if the target has the message-security-assist + // extension facility 8. + bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; } + + // Return true if the target has the vector-enhancements facility 1. + bool hasVectorEnhancements1() const { return HasVectorEnhancements1; } + + // Return true if the target has the vector-packed-decimal facility. + bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; } + + // Return true if the target has the insert-reference-bits-multiple facility. + bool hasInsertReferenceBitsMultiple() const { + return HasInsertReferenceBitsMultiple; + } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. 
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index cb81c0e5276e..025bf73d2df0 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -143,8 +143,10 @@ class SystemZPassConfig : public TargetPassConfig {
} // end anonymous namespace
void SystemZPassConfig::addIRPasses() {
-  if (getOptLevel() != CodeGenOpt::None)
+  if (getOptLevel() != CodeGenOpt::None) {
    addPass(createSystemZTDCPass());
+    addPass(createLoopDataPrefetchPass());
+  }
  TargetPassConfig::addIRPasses();
}
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 9ac768b2189d..506dc7427993 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -372,6 +372,9 @@ int SystemZTTIImpl::getArithmeticInstrCost(
      Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
    switch (ScalarBits) {
    case 32: {
+      // The vector enhancements facility 1 provides v4f32 instructions.
+      if (ST->hasVectorEnhancements1())
+        return NumVectors;
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
      unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 6923fc6fc910..a0c6fa94f8c1 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -56,6 +56,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
  unsigned getNumberOfRegisters(bool Vector);
  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getCacheLineSize() { return 256; }
+  unsigned getPrefetchDistance() { return 2000; }
+  unsigned getMinPrefetchStride() { return 2048; }
+
  bool prefersVectorizedAddressing() { return false; }
  bool supportsEfficientVectorElementLoadStore() { return true; }
  bool enableInterleavedAccessVectorization() { return true; }
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index fc4adddc149b..6e08d4cff6ea 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -37,6 +37,7 @@ endif()
set(sources
  X86AsmPrinter.cpp
  X86CallFrameOptimization.cpp
+  X86CmovConversion.cpp
  X86ExpandPseudo.cpp
  X86FastISel.cpp
  X86FixupBWInsts.cpp
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 19c93cfff0fe..91201d1fec85 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -83,6 +83,9 @@ FunctionPass *createX86WinEHStatePass();
/// the MachineInstr to MC.
FunctionPass *createX86ExpandPseudoPass();
+
+/// This pass converts X86 cmov instructions into branches when profitable.
+FunctionPass *createX86CmovConverterPass();
+
/// Return a Machine IR pass that selectively replaces
/// certain byte and word instructions by equivalent 32 bit instructions,
/// in order to eliminate partial register usage, false dependences on
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 4ca57fe9fb00..54eabeac5126 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -814,10 +814,8 @@ def : Proc<"bdver4", [
  FeatureMWAITX
]>;
-// TODO: The scheduler model falls to BTVER2 model.
-// The znver1 model has to be put in place.
-// Zen
-def: ProcessorModel<"znver1", BtVer2Model, [
+// Znver1
+def: ProcessorModel<"znver1", Znver1Model, [
  FeatureADX,
  FeatureAES,
  FeatureAVX2,
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 6decb550ad5f..26461986427d 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -448,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[
  CCIfCC<"CallingConv::Swift", CCDelegateTo>,
  // Handle explicit CC selection
-  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>,
+  CCIfCC<"CallingConv::Win64", CCDelegateTo>,
  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo>,
  // Handle Vectorcall CC
@@ -1004,7 +1004,7 @@ def CC_X86_64 : CallingConv<[
  CCIfCC<"CallingConv::HiPE", CCDelegateTo>,
  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo>,
  CCIfCC<"CallingConv::AnyReg", CCDelegateTo>,
-  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo>,
+  CCIfCC<"CallingConv::Win64", CCDelegateTo>,
  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo>,
  CCIfCC<"CallingConv::HHVM", CCDelegateTo>,
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
new file mode 100644
index 000000000000..bfc834435de5
--- /dev/null
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -0,0 +1,611 @@
+//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a pass that converts X86 cmov instructions into
+/// branches when profitable. This pass is conservative, i.e., it applies the
+/// transformation if and only if it can guarantee a gain with high confidence.
+///
+/// Thus, the optimization applies under the following conditions:
+///   1. Consider as candidates only CMOVs in innermost loops, assuming that
+///      most hotspots are represented by these loops.
+///   2. Given a group of CMOV instructions that use the same EFLAGS def
+///      instruction:
+///      a. Consider them as candidates only if all have the same condition
+///         code or the opposite one, to prevent generating more than one
+///         conditional jump per EFLAGS def instruction.
+///      b. Consider them as candidates only if all are profitable to be
+///         converted, assuming that one bad conversion may cause a
+///         degradation.
+///   3. Apply the conversion only for loops that were found profitable and
+///      only for CMOV candidates that were found profitable.
+///      a. A loop is considered profitable only if the conversion will reduce
+///         its depth cost by some threshold.
+///      b. A CMOV is considered profitable if the cost of its condition is
+///         higher than the average cost of its true-value and false-value by
+///         25% of the branch-misprediction penalty; this ensures no
+///         degradation even with a 25% branch misprediction rate.
+///
+/// Note: This pass is assumed to run on SSA machine code.
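// Editor's note: the worked example below is illustrative only and is not
// part of the upstream patch; the misprediction penalty is an assumed value
// (the pass reads the real one from the scheduling model). For condition 3b
// above:
//   MispredictPenalty = 20 cycles            (assumed)
//   CondCost          = 12                   (depth of the condition operand)
//   ValCost           = (TrueOpDepth + FalseOpDepth + 1) / 2
//                     = (4 + 6 + 1) / 2 = 5
//   Gain              = CondCost - ValCost = 7 >= 20 * 25% = 5
// so this CMOV would be considered profitable to convert into a branch.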
+//===----------------------------------------------------------------------===//
+//
+//  External interfaces:
+//      FunctionPass *llvm::createX86CmovConverterPass();
+//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
+//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cmov-converter"
+
+STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
+STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
+STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
+STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
+
+namespace {
+// This internal switch can be used to turn off the cmov/branch optimization.
+static cl::opt<bool>
+    EnableCmovConverter("x86-cmov-converter",
+                        cl::desc("Enable the X86 cmov-to-branch optimization."),
+                        cl::init(true), cl::Hidden);
+
+/// Converts X86 cmov instructions into branches when profitable.
+class X86CmovConverterPass : public MachineFunctionPass {
+public:
+  X86CmovConverterPass() : MachineFunctionPass(ID) {}
+  ~X86CmovConverterPass() {}
+
+  StringRef getPassName() const override { return "X86 cmov Conversion"; }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  /// Pass identification, replacement for typeid.
+  static char ID;
+
+  const MachineRegisterInfo *MRI;
+  const TargetInstrInfo *TII;
+  TargetSchedModel TSchedModel;
+
+  /// List of consecutive CMOV instructions.
+  typedef SmallVector<MachineInstr *, 2> CmovGroup;
+  typedef SmallVector<CmovGroup, 2> CmovGroups;
+
+  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
+  /// CmovInstGroups accordingly.
+  ///
+  /// \param CurrLoop Loop being processed.
+  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+  /// \returns true iff it found any CMOV-group-candidate.
+  bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups);
+
+  /// Check if it is profitable to transform each CMOV-group-candidate into a
+  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
+  ///
+  /// \param CurrLoop Loop being processed.
+  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+  /// \returns true iff any CMOV-group-candidate remains.
+  bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop,
+                                        CmovGroups &CmovInstGroups);
+
+  /// Convert the given list of consecutive CMOV instructions into a branch.
+  ///
+  /// \param Group Consecutive CMOV instructions to be converted into a branch.
+  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
+};
+
+char X86CmovConverterPass::ID = 0;
+
+void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  MachineFunctionPass::getAnalysisUsage(AU);
+  AU.addRequired<MachineLoopInfo>();
+}
+
+bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+  if (!EnableCmovConverter)
+    return false;
+
+  DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+               << "**********\n");
+
+  bool Changed = false;
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  MRI = &MF.getRegInfo();
+  TII = STI.getInstrInfo();
+  TSchedModel.init(STI.getSchedModel(), &STI, TII);
+
+  //===--------------------------------------------------------------------===//
+  // Algorithm
+  // ---------
+  //   For each innermost loop
+  //     collectCmovCandidates() {
+  //       Find all CMOV-group-candidates.
+  //     }
+  //
+  //     checkForProfitableCmovCandidates() {
+  //       * Calculate both loop-depth and optimized-loop-depth.
+  //       * Use these depths to check for loop transformation profitability.
+  //       * Check for CMOV-group-candidate transformation profitability.
+  //     }
+  //
+  //     For each profitable CMOV-group-candidate
+  //       convertCmovInstsToBranches() {
+  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
+  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
+  //       }
+  //
+  // Note: For more details, see each function description.
+  //===--------------------------------------------------------------------===//
+  for (MachineBasicBlock &MBB : MF) {
+    MachineLoop *CurrLoop = MLI.getLoopFor(&MBB);
+
+    // Optimize only innermost loops.
+    if (!CurrLoop || CurrLoop->getHeader() != &MBB ||
+        !CurrLoop->getSubLoops().empty())
+      continue;
+
+    // List of consecutive CMOV instructions to be processed.
+    CmovGroups CmovInstGroups;
+
+    if (!collectCmovCandidates(CurrLoop, CmovInstGroups))
+      continue;
+
+    if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups))
+      continue;
+
+    Changed = true;
+    for (auto &Group : CmovInstGroups)
+      convertCmovInstsToBranches(Group);
+  }
+  return Changed;
+}
+
+bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
+                                                 CmovGroups &CmovInstGroups) {
+  //===--------------------------------------------------------------------===//
+  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
+  //
+  // CMOV-group:
+  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
+  //   instruction.
+  //
+  // CMOV-group-candidate:
+  //   CMOV-group where all the CMOV instructions are
+  //     1. consecutive.
+  //     2. have the same condition code or the opposite one.
+  //     3. have only register operands (X86::CMOVrr).
+  //===--------------------------------------------------------------------===//
+  // List of possible improvements (TODOs):
+  // --------------------------------------
+  // TODO: Add support for X86::CMOVrm instructions.
+  // TODO: Add support for X86::SETcc instructions.
+  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
+  //===--------------------------------------------------------------------===//
+
+  // Current processed CMOV-Group.
+  CmovGroup Group;
+  for (auto *MBB : CurrLoop->getBlocks()) {
+    Group.clear();
+    // Condition code of the first CMOV instruction in the currently processed
+    // range, and its opposite condition code.
+    X86::CondCode FirstCC, FirstOppCC;
+    // Indicator of a non-CMOVrr instruction in the current processed range.
+    bool FoundNonCMOVInst = false;
+    // Indicator for current processed CMOV-group if it should be skipped.
+    bool SkipGroup = false;
+
+    for (auto &I : *MBB) {
+      X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
+      // Check if we found an X86::CMOVrr instruction.
+      if (CC != X86::COND_INVALID && !I.mayLoad()) {
+        if (Group.empty()) {
+          // We found the first CMOV in the range; reset the flags.
+          FirstCC = CC;
+          FirstOppCC = X86::GetOppositeBranchCondition(CC);
+          FoundNonCMOVInst = false;
+          SkipGroup = false;
+        }
+        Group.push_back(&I);
+        // Check if it is a non-consecutive CMOV instruction or it has a
+        // different condition code than FirstCC or FirstOppCC.
+        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
+          // Mark the SkipGroup indicator to skip the current processed
+          // CMOV-Group.
+          SkipGroup = true;
+        continue;
+      }
+      // If Group is empty, keep looking for the first CMOV in the range.
+      if (Group.empty())
+        continue;
+
+      // We found a non-X86::CMOVrr instruction.
+      FoundNonCMOVInst = true;
+      // Check if this instruction defines EFLAGS, to determine the end of the
+      // processed range, as there would be no more instructions using the
+      // current EFLAGS def.
+      if (I.definesRegister(X86::EFLAGS)) {
+        // Check if the current processed CMOV-group should not be skipped and
+        // add it as a CMOV-group-candidate.
+        if (!SkipGroup)
+          CmovInstGroups.push_back(Group);
+        else
+          ++NumOfSkippedCmovGroups;
+        Group.clear();
+      }
+    }
+    // End of basic block is considered end of range; check if the current
+    // processed CMOV-group should not be skipped and add it as a
+    // CMOV-group-candidate.
+    if (Group.empty())
+      continue;
+    if (!SkipGroup)
+      CmovInstGroups.push_back(Group);
+    else
+      ++NumOfSkippedCmovGroups;
+  }
+
+  NumOfCmovGroupCandidate += CmovInstGroups.size();
+  return !CmovInstGroups.empty();
+}
+
+/// \returns Depth of CMOV instruction as if it was converted into a branch.
+/// \param TrueOpDepth depth cost of CMOV true-value operand.
+/// \param FalseOpDepth depth cost of CMOV false-value operand.
+static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
+  //===--------------------------------------------------------------------===//
+  // With no info about branch weight, we assume 50% for each value operand.
+  // Thus, the depth of the optimized CMOV instruction is the rounded-up
+  // average of its True-Operand-Value-Depth and False-Operand-Value-Depth.
+  //===--------------------------------------------------------------------===//
+  return (TrueOpDepth + FalseOpDepth + 1) / 2;
+}
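// Editor's note (illustrative, not part of the upstream patch): a worked
// instance of getDepthOfOptCmov() above, with assumed operand depths:
//   getDepthOfOptCmov(3, 6) == (3 + 6 + 1) / 2 == 5
// i.e. the 50/50-weighted average of the two operand depths, rounded up.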
+
+bool X86CmovConverterPass::checkForProfitableCmovCandidates(
+    MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) {
+  struct DepthInfo {
+    /// Depth of original loop.
+    unsigned Depth;
+    /// Depth of optimized loop.
+    unsigned OptDepth;
+  };
+  /// Number of loop iterations to calculate depth for.
+  static const unsigned LoopIterations = 2;
+  DenseMap<MachineInstr *, DepthInfo> DepthMap;
+  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
+  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
+  /// For each register type, maps the register to its last def instruction.
+  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
+  /// Maps a register operand to its def instruction, which can be nullptr if
+  /// it is unknown (e.g., the operand is defined outside the loop).
+  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
+
+  // Set depth of unknown instruction (i.e., nullptr) to zero.
+  DepthMap[nullptr] = {0, 0};
+
+  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
+  for (auto &Group : CmovInstGroups)
+    CmovInstructions.insert(Group.begin(), Group.end());
+
+  //===--------------------------------------------------------------------===//
+  // Step 1: Calculate instruction depth and loop depth.
+  // Optimized-Loop:
+  //   loop with CMOV-group-candidates converted into branches.
+  //
+  // Instruction-Depth:
+  //   instruction latency + max operand depth.
+  //   * For a CMOV instruction in the optimized loop the depth is calculated
+  //     as: CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
+  // TODO: Find a better way to estimate the latency of the branch instruction
+  //       rather than using the CMOV latency.
+  //
+  // Loop-Depth:
+  //   max instruction depth of all instructions in the loop.
+  // Note: the instruction with max depth represents the critical path in the
+  //       loop.
+  //
+  // Loop-Depth[i]:
+  //   Loop-Depth calculated for the first `i` iterations.
+  // Note: it is enough to calculate depth for up to two iterations.
+  //
+  // Depth-Diff[i]:
+  //   Number of cycles saved in the first `i` iterations by optimizing the
+  //   loop.
+  //===--------------------------------------------------------------------===//
+  for (unsigned I = 0; I < LoopIterations; ++I) {
+    DepthInfo &MaxDepth = LoopDepth[I];
+    for (auto *MBB : CurrLoop->getBlocks()) {
+      // Clear physical registers Def map.
+      RegDefMaps[PhyRegType].clear();
+      for (MachineInstr &MI : *MBB) {
+        unsigned MIDepth = 0;
+        unsigned MIDepthOpt = 0;
+        bool IsCMOV = CmovInstructions.count(&MI);
+        for (auto &MO : MI.uses()) {
+          // Check for "isUse()" because "uses()" also returns implicit
+          // definitions.
+          if (!MO.isReg() || !MO.isUse())
+            continue;
+          unsigned Reg = MO.getReg();
+          auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
+          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
+            OperandToDefMap[&MO] = DefMI;
+            DepthInfo Info = DepthMap.lookup(DefMI);
+            MIDepth = std::max(MIDepth, Info.Depth);
+            if (!IsCMOV)
+              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
+          }
+        }
+
+        if (IsCMOV)
+          MIDepthOpt = getDepthOfOptCmov(
+              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
+              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
+
+        // Iterate over all operands to handle implicit definitions as well.
+        for (auto &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.isDef())
+            continue;
+          unsigned Reg = MO.getReg();
+          RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
+        }
+
+        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
+        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
+        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
+      }
+    }
+  }
+
+  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
+                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};
+
+  //===--------------------------------------------------------------------===//
+  // Step 2: Check if the loop is worth optimizing.
+  // Worth-Optimize-Loop:
+  //   case 1: Diff[1] == Diff[0]
+  //           The critical path is iteration independent - there is no
+  //           dependency of critical-path instructions on critical-path
+  //           instructions of the previous iteration.
+  //           Thus, it is enough to check the gain percentage of the 1st
+  //           iteration - to be conservative, the optimized loop needs to
+  //           have a depth at least 12.5% less than the original loop, per
+  //           iteration.
+  //
+  //   case 2: Diff[1] > Diff[0]
+  //           The critical path is iteration dependent - there is a
+  //           dependency of critical-path instructions on critical-path
+  //           instructions of the previous iteration.
+  //           Thus, it is required to check the gradient of the gain - the
+  //           change in Depth-Diff compared to the change in Loop-Depth
+  //           between the 1st and 2nd iterations.
+  //           To be conservative, the gradient needs to be at least 50%.
+  //
+  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
+  //===--------------------------------------------------------------------===//
+  bool WorthOptLoop = false;
+  if (Diff[1] == Diff[0])
+    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
+  else if (Diff[1] > Diff[0])
+    WorthOptLoop =
+        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+
+  if (!WorthOptLoop)
+    return false;
+
+  ++NumOfLoopCandidate;
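// Editor's note (illustrative, not part of the upstream patch): a worked
// instance of the case-1 test above, with assumed numbers. If
// LoopDepth[0].Depth == 40 and Diff[0] == 6, then 6 * 8 == 48 >= 40, so the
// loop qualifies: the 6 saved cycles are at least 12.5% of the 40-cycle
// critical path.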
+
+  //===--------------------------------------------------------------------===//
+  // Step 3: Check for each CMOV-group-candidate whether it is worth
+  //         optimizing.
+  // Worth-Optimize-Group:
+  //   Iff it is worthwhile to optimize all CMOV instructions in the group.
+  //
+  // Worth-Optimize-CMOV:
+  //   A predicted branch is faster than CMOV by the difference between the
+  //   depth of the condition operand and the depth of the taken (predicted)
+  //   value operand.
+  //   To be conservative, the gain of such a CMOV transformation should cover
+  //   at least 25% of the branch-misprediction penalty.
+  //===--------------------------------------------------------------------===//
+  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+  CmovGroups TempGroups;
+  std::swap(TempGroups, CmovInstGroups);
+  for (auto &Group : TempGroups) {
+    bool WorthOpGroup = true;
+    for (auto *MI : Group) {
+      // Avoid a CMOV instruction whose value is used as a pointer to load
+      // from. This is another conservative check, to avoid converting a CMOV
+      // instruction used in a tree-search-like algorithm, where the branch is
+      // unpredictable.
+      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
+      if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
+        unsigned Op = UIs.begin()->getOpcode();
+        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
+          WorthOpGroup = false;
+          break;
+        }
+      }
+
+      unsigned CondCost =
+          DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
+      unsigned ValCost = getDepthOfOptCmov(
+          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
+          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
+      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
+        WorthOpGroup = false;
+        break;
+      }
+    }
+
+    if (WorthOpGroup)
+      CmovInstGroups.push_back(Group);
+  }
+
+  return !CmovInstGroups.empty();
+}
+
+static bool checkEFLAGSLive(MachineInstr *MI) {
+  if (MI->killsRegister(X86::EFLAGS))
+    return false;
+
+  // The EFLAGS operand of MI might be missing a kill marker.
+  // Figure out whether the EFLAGS operand should be live after the MI
+  // instruction.
+  MachineBasicBlock *BB = MI->getParent();
+  MachineBasicBlock::iterator ItrMI = MI;
+
+  // Scan forward through BB for a use/def of EFLAGS.
+  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
+    if (I->readsRegister(X86::EFLAGS))
+      return true;
+    if (I->definesRegister(X86::EFLAGS))
+      return false;
+  }
+
+  // We hit the end of the block; check whether EFLAGS is live into a
+  // successor.
+  for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
+    if ((*I)->isLiveIn(X86::EFLAGS))
+      return true;
+  }
+
+  return false;
+}
+
+void X86CmovConverterPass::convertCmovInstsToBranches(
+    SmallVectorImpl<MachineInstr *> &Group) const {
+  assert(!Group.empty() && "No CMOV instructions to convert");
+  ++NumOfOptimizedCmovGroups;
+
+  // To convert a CMOVcc instruction, we actually have to insert the diamond
+  // control-flow pattern. The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+
+  // Before
+  // -----
+  // MBB:
+  //   cond = cmp ...
+  //   v1 = CMOVge t1, f1, cond
+  //   v2 = CMOVlt t2, f2, cond
+  //   v3 = CMOVge v1, f3, cond
+  //
+  // After
+  // -----
+  // MBB:
+  //   cond = cmp ...
+  //   jge %SinkMBB
+  //
+  // FalseMBB:
+  //   jmp %SinkMBB
+  //
+  // SinkMBB:
+  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
+  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
+  //                                          ; true-value with false-value
+  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
+  //                                          ; previous Phi instruction result
+
+  MachineInstr &MI = *Group.front();
+  MachineInstr *LastCMOV = Group.back();
+  DebugLoc DL = MI.getDebugLoc();
+  X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineFunction::iterator It = ++MBB->getIterator();
+  MachineFunction *F = MBB->getParent();
+  const BasicBlock *BB = MBB->getBasicBlock();
+
+  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
+  F->insert(It, FalseMBB);
+  F->insert(It, SinkMBB);
+
+  // If the EFLAGS register isn't dead in the terminator, then claim that it's
+  // live into the sink and copy blocks.
+  if (checkEFLAGSLive(LastCMOV)) {
+    FalseMBB->addLiveIn(X86::EFLAGS);
+    SinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
+  // Transfer the remainder of BB and its successor edges to SinkMBB.
+  SinkMBB->splice(SinkMBB->begin(), MBB,
+                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
+  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Add the false and sink blocks as its successors.
+  MBB->addSuccessor(FalseMBB);
+  MBB->addSuccessor(SinkMBB);
+
+  // Create the conditional branch instruction.
+  BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);
+
+  // Add the sink block to the false block successors.
+  FalseMBB->addSuccessor(SinkMBB);
+
+  MachineInstrBuilder MIB;
+  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+  MachineBasicBlock::iterator MIItEnd =
+      std::next(MachineBasicBlock::iterator(LastCMOV));
+  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+  // As we are creating the PHIs, we have to be careful if there is more than
+  // one. Later CMOVs may reference the results of earlier CMOVs, but later
+  // PHIs have to reference the individual true/false inputs from earlier PHIs.
+  // That also means that PHI construction must work forward from earlier to
+  // later, and that the code must maintain a mapping from earlier PHI's
+  // destination registers, and the registers that went into the PHI.
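// Editor's note (illustrative, not part of the upstream patch): in the
// Before/After example above, RegRewriteTable[%v1] records the pair of
// registers that fed %v1's PHI, so when %v3 later names %v1 as an input,
// the lookup below substitutes one of those original registers rather than
// the PHI result itself.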
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+    unsigned DestReg = MIIt->getOperand(0).getReg();
+    unsigned Op1Reg = MIIt->getOperand(1).getReg();
+    unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+    // If the CMOV we are processing has the opposite condition from the jump
+    // we generated, then we have to swap the operands for the PHI that is
+    // going to be generated.
+    if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
+      std::swap(Op1Reg, Op2Reg);
+
+    auto Op1Itr = RegRewriteTable.find(Op1Reg);
+    if (Op1Itr != RegRewriteTable.end())
+      Op1Reg = Op1Itr->second.first;
+
+    auto Op2Itr = RegRewriteTable.find(Op2Reg);
+    if (Op2Itr != RegRewriteTable.end())
+      Op2Reg = Op2Itr->second.second;
+
+    // SinkMBB:
+    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
+    // ...
+    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
+              .addReg(Op1Reg)
+              .addMBB(FalseMBB)
+              .addReg(Op2Reg)
+              .addMBB(MBB);
+    (void)MIB;
+    DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
+    DEBUG(dbgs() << "\tTo: "; MIB->dump());
+
+    // Add this PHI to the rewrite table.
+    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+  }
+
+  // Now remove the CMOV(s).
+  MBB->erase(MIItBegin, MIItEnd);
+}
+
+} // End anonymous namespace.
+
+FunctionPass *llvm::createX86CmovConverterPass() {
+  return new X86CmovConverterPass();
+}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ee9e78146305..527e5d568ac6 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1187,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
      CC != CallingConv::X86_StdCall &&
      CC != CallingConv::X86_ThisCall &&
      CC != CallingConv::X86_64_SysV &&
-      CC != CallingConv::X86_64_Win64)
+      CC != CallingConv::Win64)
    return false;
  // Don't handle popping bytes if they don't fit the ret's immediate.
@@ -3171,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
  case CallingConv::X86_FastCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_ThisCall:
-  case CallingConv::X86_64_Win64:
+  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
    break;
  }
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index c28746f96439..95c6f2a3fa34 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -22,7 +22,7 @@
/// instructions and register-to-register moves. It would
/// seem like cmov(s) would also be affected, but because of the way cmov is
/// really implemented by most machines as reading both the destination and
-/// and source regsters, and then "merging" the two based on a condition,
+/// and source registers, and then "merging" the two based on a condition,
/// it really already should be considered as having a true dependence on the
/// destination register as well.
///
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 65486cf7f529..44eecd664714 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1335,6 +1335,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::CTTZ, VT, Custom);
    }
+
+    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. @@ -1663,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -2661,7 +2669,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -20188,7 +20196,10 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); @@ -20210,7 +20221,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -20235,7 +20249,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -20254,7 +20271,10 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast(ScaleOp); + auto *C = dyn_cast(ScaleOp); + // Scale must be constant. 
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -22665,10 +22685,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. + return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. @@ -22683,7 +22724,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -24030,7 +24071,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cc5c09cbf0e5..705d0f7a5cf7 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1759,29 +1759,29 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rr) _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rm) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrk) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, + 
(COPY_TO_REGCLASS (!cast(InstrStr##rmk) _.KRCWM:$mask, _.RC:$src1, addr:$src2), NewInf.KRC)>; } @@ -1798,7 +1798,7 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rmb) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -1879,7 +1879,7 @@ defm : avx512_icmp_packed_rmb_lowering; -defm : avx512_icmp_packed_rmb_lowering; defm : avx512_icmp_packed_rmb_lowering; @@ -2127,17 +2127,17 @@ multiclass avx512_icmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2145,37 +2145,37 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))), (i64 0)), (COPY_TO_REGCLASS (!cast(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, + _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast(InstrStr##rmik) _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc), NewInf.KRC)>; } } - + multiclass avx512_icmp_cc_packed_rmb_lowering Preds> + list Preds> : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), @@ -2187,7 +2187,7 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -2447,17 +2447,17 @@ multiclass avx512_fcmp_cc_packed_lowering Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.KVT (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2477,19 +2477,19 @@ let Predicates = Preds in { NewInf.KRC)>; } } - + multiclass avx512_fcmp_cc_packed_sae_lowering Preds> + string InstrStr, list Preds> : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { let Predicates = Preds in def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc, (i32 FROUND_NO_EXC))), (i64 0)), - (COPY_TO_REGCLASS 
(!cast(InstrStr##rrib) _.RC:$src1, + (COPY_TO_REGCLASS (!cast(InstrStr##rrib) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; @@ -2817,16 +2817,16 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; } @@ -3036,7 +3036,7 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrr) @@ -3044,8 +3044,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrk) @@ -3063,7 +3063,7 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrri) @@ -3072,8 +3072,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), imm:$cc), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast(InstStr##Zrrik) @@ -3379,35 +3379,35 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, "VMOVDQA32">, + HasAVX512, "VMOVDQA32">, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, "VMOVDQA64">, + HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16">, XD, VEX_W, 
EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512, "VMOVDQU32">, + HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512, "VMOVDQU64">, + HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need @@ -3964,49 +3964,49 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in { - def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrr">; let Constraints = "$src0 = $dst" in - def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrk">; - - def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrkz">; - def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrr">; let Constraints = "$src0 = $dst" in - def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrk">; + VEX_W, FoldGenData<"VMOVSDZrrk">; - def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.KRCWM:$mask, VR128X:$src1, + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrrkz">; } @@ -5676,6 +5676,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 7e4cba1c8345..343da2573b55 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -224,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; @@ -334,7 +334,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; @@ -450,7 +450,7 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index a12fa68faf4f..d831a7974359 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -663,5 +663,6 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" include 
"X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index ed53893b779c..9dcc968a1a7a 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -370,6 +370,22 @@ def : WriteRes { let Latency = 100; } def : WriteRes; def : WriteRes; +//////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteEXTRQ: SchedWriteRes<[JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>; + +def WriteINSERTQ: SchedWriteRes<[JFPU01]> { + let Latency = 2; + let ResourceCycles = [4]; +} +def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; + //////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 000000000000..d5b4cfe2ddee --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
+ +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. 
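+  // (e.g. a 3-cycle FP op with a folded load ends up with
+  // Latency = !add(3, 7) = 10 and occupies ZnAGU in addition to the FP unit.)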
+ def : WriteRes { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 8; } + +def : WriteRes; +def : WriteRes; +defm : ZnWriteResPair; +defm : ZnWriteResPair; +defm : ZnWriteResPair; + +// IDIV +def : WriteRes { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes{ + let Latency = 4; +} +def : WriteRes { + let Latency = 4; +} + +def : WriteRes { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +// Vector Shift Operations +defm : ZnWriteResFpuPair; + +// AES Instructions. +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; + +def : WriteRes; +def : WriteRes; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : ZnWriteResFpuPair; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fa0afe29586b..427a0001bef9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -597,7 +597,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { case CallingConv::Intel_OCL_BI: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return true; // This convention allows using the SysV convention on Windows targets. case CallingConv::X86_64_SysV: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8d891c983fab..08c2cdaefe71 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -375,6 +375,7 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); + addPass(createX86CmovConverterPass()); return true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index aaa6d58bd134..c16207973b39 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,8 @@ class X86TargetMachine final : public LLVMTargetMachine { ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. 
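+  // Declaring it deleted (below) turns any remaining call into a compile-time
+  // error and steers callers to the per-Function overload above.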
const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt index ad458450fda3..28da36bba209 100644 --- a/lib/ToolDrivers/CMakeLists.txt +++ b/lib/ToolDrivers/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(llvm-dlltool) add_subdirectory(llvm-lib) diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt index 7da9a5c01005..a49e04bdf3c1 100644 --- a/lib/ToolDrivers/LLVMBuild.txt +++ b/lib/ToolDrivers/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = llvm-lib +subdirectories = llvm-dlltool llvm-lib [component_0] type = Group diff --git a/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt new file mode 100644 index 000000000000..52bd5cba86f4 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DllOptionsTableGen) + +add_llvm_library(LLVMDlltoolDriver + DlltoolDriver.cpp + ) + +add_dependencies(LLVMDlltoolDriver DllOptionsTableGen) diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp new file mode 100644 index 000000000000..a7de79306074 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp @@ -0,0 +1,160 @@ +//===- DlltoolDriver.cpp - dlltool.exe-compatible driver ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a dlltool.exe-compatible driver. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/COFF.h" +#include "llvm/Object/COFFImportFile.h" +#include "llvm/Object/COFFModuleDefinition.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/Path.h" + +#include +#include + +using namespace llvm; +using namespace llvm::object; +using namespace llvm::COFF; + +namespace { + +enum { + OPT_INVALID = 0, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +static const llvm::opt::OptTable::Info infoTable[] = { +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \ + {X1, X2, X10, X11, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12}, +#include "Options.inc" +#undef OPTION +}; + +class DllOptTable : public llvm::opt::OptTable { +public: + DllOptTable() : OptTable(infoTable, false) {} +}; + +} // namespace + +std::vector> OwningMBs; + +// Opens a file. Path has to be resolved already. +// Newly created memory buffers are owned by this driver. 
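+// (MemoryBufferRef is a non-owning view, so the unique_ptr is parked in the
+// global OwningMBs vector above; that keeps every returned ref valid for the
+// lifetime of the driver.)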
+MemoryBufferRef openFile(StringRef Path) { + ErrorOr> MB = MemoryBuffer::getFile(Path); + + if (std::error_code EC = MB.getError()) + llvm::errs() << "fail openFile: " << EC.message() << "\n"; + + MemoryBufferRef MBRef = MB.get()->getMemBufferRef(); + OwningMBs.push_back(std::move(MB.get())); // take ownership + return MBRef; +} + +static MachineTypes getEmulation(StringRef S) { + return StringSwitch(S) + .Case("i386", IMAGE_FILE_MACHINE_I386) + .Case("i386:x86-64", IMAGE_FILE_MACHINE_AMD64) + .Case("arm", IMAGE_FILE_MACHINE_ARMNT) + .Default(IMAGE_FILE_MACHINE_UNKNOWN); +} + +static std::string getImplibPath(std::string Path) { + SmallString<128> Out = StringRef("lib"); + Out.append(Path); + sys::path::replace_extension(Out, ".a"); + return Out.str(); +} + +int llvm::dlltoolDriverMain(llvm::ArrayRef ArgsArr) { + DllOptTable Table; + unsigned MissingIndex; + unsigned MissingCount; + llvm::opt::InputArgList Args = + Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); + if (MissingCount) { + llvm::errs() << Args.getArgString(MissingIndex) << ": missing argument\n"; + return 1; + } + + // Handle when no input or output is specified + if (Args.hasArgNoClaim(OPT_INPUT) || + (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) { + Table.PrintHelp(outs(), ArgsArr[0], "dlltool", false); + llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm\n"; + return 1; + } + + if (!Args.hasArgNoClaim(OPT_m) && Args.hasArgNoClaim(OPT_d)) { + llvm::errs() << "error: no target machine specified\n" + << "supported targets: i386, i386:x86-64, arm\n"; + return 1; + } + + for (auto *Arg : Args.filtered(OPT_UNKNOWN)) + llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; + + MemoryBufferRef MB; + if (auto *Arg = Args.getLastArg(OPT_d)) + MB = openFile(Arg->getValue()); + + if (!MB.getBufferSize()) { + llvm::errs() << "definition file empty\n"; + return 1; + } + + COFF::MachineTypes Machine = IMAGE_FILE_MACHINE_UNKNOWN; + if (auto *Arg = Args.getLastArg(OPT_m)) + Machine = getEmulation(Arg->getValue()); + + if (Machine == IMAGE_FILE_MACHINE_UNKNOWN) { + llvm::errs() << "unknown target\n"; + return 1; + } + + Expected Def = + parseCOFFModuleDefinition(MB, Machine, true); + + if (!Def) { + llvm::errs() << "error parsing definition\n" + << errorToErrorCode(Def.takeError()).message(); + return 1; + } + + // Do this after the parser because parseCOFFModuleDefinition sets OutputFile. + if (auto *Arg = Args.getLastArg(OPT_D)) + Def->OutputFile = Arg->getValue(); + + if (Def->OutputFile.empty()) { + llvm::errs() << "no output file specified\n"; + return 1; + } + + std::string Path = Args.getLastArgValue(OPT_l); + if (Path.empty()) + Path = getImplibPath(Def->OutputFile); + + if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine)) + return 1; + return 0; +} diff --git a/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt new file mode 100644 index 000000000000..11736eb47bcb --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/ToolDrivers/llvm-dlltool/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DlltoolDriver +parent = Libraries +required_libraries = Object Option Support diff --git a/lib/ToolDrivers/llvm-dlltool/Options.td b/lib/ToolDrivers/llvm-dlltool/Options.td new file mode 100644 index 000000000000..213c6a4d7674 --- /dev/null +++ b/lib/ToolDrivers/llvm-dlltool/Options.td @@ -0,0 +1,26 @@ +include "llvm/Option/OptParser.td" + +def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target machine">; +def m_long : JoinedOrSeparate<["--"], "machine">, Alias; + +def l: JoinedOrSeparate<["-"], "l">, HelpText<"Generate an import lib">; +def l_long : JoinedOrSeparate<["--"], "output-lib">, Alias; + +def D: JoinedOrSeparate<["-"], "D">, HelpText<"Specify the input DLL Name">; +def D_long : JoinedOrSeparate<["--"], "dllname">, Alias; + +def d: JoinedOrSeparate<["-"], "d">, HelpText<"Input .def File">; +def d_long : JoinedOrSeparate<["--"], "input-def">, Alias; + +//============================================================================== +// The flags below do nothing. They are defined only for dlltool compatibility. +//============================================================================== + +def k: Flag<["-"], "k">, HelpText<"Kill @n Symbol from export">; +def k_alias: Flag<["--"], "kill-at">, Alias; + +def S: JoinedOrSeparate<["-"], "S">, HelpText<"Assembler">; +def S_alias: JoinedOrSeparate<["--"], "as">, Alias; + +def f: JoinedOrSeparate<["-"], "f">, HelpText<"Assembler Flags">; +def f_alias: JoinedOrSeparate<["--"], "as-flags">, Alias; diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 3d57acf06e74..93eab680ca6b 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2026,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, continue; } + // LLVM's definition of dominance allows instructions that are cyclic + // in unreachable blocks, e.g.: + // %pat = select i1 %condition, @global, i16* %pat + // because any instruction dominates an instruction in a block that's + // not reachable from entry. + // So, remove unreachable blocks from the function, because a) there's + // no point in analyzing them and b) GlobalOpt should otherwise grow + // some more complicated logic to break these cycles. + // Removing unreachable blocks might invalidate the dominator so we + // recalculate it. + if (!F->isDeclaration()) { + if (removeUnreachableBlocks(*F)) { + auto &DT = LookupDomTree(*F); + DT.recalculate(*F); + Changed = true; + } + } + Changed |= processGlobal(*F, TLI, LookupDomTree); if (!F->hasLocalLinkage()) diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 00ddb93df830..317770d133b3 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -909,7 +909,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // To check this we also need to nuke any dead constant uses (perhaps // made dead by this operation on other functions). 
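     // A function that has become use_empty may still be needed: if the call
     // graph tracks it as a known library function, later passes may
     // re-introduce calls to it, hence the isLibFunction check below.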
Callee.removeDeadConstantUsers(); - if (Callee.use_empty()) { + if (Callee.use_empty() && !CG.isLibFunction(Callee)) { Calls.erase( std::remove_if(Calls.begin() + i + 1, Calls.end(), [&Callee](const std::pair &Call) { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index ac4765f96075..6baada2c1ae1 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -173,8 +173,10 @@ class SampleProfileLoader { void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); + template void findEquivalencesFor(BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree); + DominatorTreeBase *DomTree); + void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); @@ -217,7 +219,7 @@ class SampleProfileLoader { /// \brief Dominance, post-dominance and loop information. std::unique_ptr DT; - std::unique_ptr> PDT; + std::unique_ptr> PDT; std::unique_ptr LI; AssumptionCacheTracker *ACT; @@ -773,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions( /// \param DomTree Opposite dominator tree. If \p Descendants is filled /// with blocks from \p BB1's dominator tree, then /// this is the post-dominator tree, and vice versa. +template void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, ArrayRef Descendants, - DominatorTreeBase *DomTree) { + DominatorTreeBase *DomTree) { const BasicBlock *EC = EquivalenceClass[BB1]; uint64_t Weight = BlockWeights[EC]; for (const auto *BB2 : Descendants) { @@ -1283,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { DT.reset(new DominatorTree); DT->recalculate(F); - PDT.reset(new DominatorTreeBase(true)); + PDT.reset(new PostDomTreeBase()); PDT->recalculate(F); LI.reset(new LoopInfo); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 773c86e23707..fdc9c373b95e 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1284,6 +1284,16 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I, Builder)) return replaceInstUsesWith(I, V); + if (match(Op1, m_One())) { + // (1 << x) & 1 --> zext(x == 0) + // (1 >> x) & 1 --> zext(x == 0) + Value *X; + if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X))))) { + Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0)); + return new ZExtInst(IsZero, I.getType()); + } + } + if (ConstantInt *AndRHS = dyn_cast(Op1)) { const APInt &AndRHSMask = AndRHS->getValue(); @@ -1315,23 +1325,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { break; } - case Instruction::Sub: - // -x & 1 -> x & 1 - if (AndRHSMask.isOneValue() && match(Op0LHS, m_Zero())) - return BinaryOperator::CreateAnd(Op0RHS, AndRHS); - - break; - - case Instruction::Shl: - case Instruction::LShr: - // (1 << x) & 1 --> zext(x == 0) - // (1 >> x) & 1 --> zext(x == 0) - if (AndRHSMask.isOneValue() && Op0LHS == AndRHS) { - Value *NewICmp = - Builder.CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType())); - return new ZExtInst(NewICmp, I.getType()); - } - break; } // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth @@ -1417,12 +1410,6 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } - // (A&((~A)|B)) -> A&B - if (match(Op0, m_c_Or(m_Not(m_Specific(Op1)), 
m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op1); - if (match(Op1, m_c_Or(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateAnd(A, Op0); - // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) @@ -2020,18 +2007,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *A, *B; - // ((~A & B) | A) -> (A | B) - if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A)))) - return BinaryOperator::CreateOr(A, Op1); - if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A)))) - return BinaryOperator::CreateOr(Op0, A); - - // ((A & B) | ~A) -> (~A | B) - // The NOT is guaranteed to be in the RHS by complexity ordering. - if (match(Op1, m_Not(m_Value(A))) && - match(Op0, m_c_And(m_Specific(A), m_Value(B)))) - return BinaryOperator::CreateOr(Op1, B); - // (A & C)|(B & D) Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && @@ -2176,17 +2151,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Not, Op0); } - // (A & B) | (~A ^ B) -> (~A ^ B) - // (A & B) | (B ^ ~A) -> (~A ^ B) - // (B & A) | (~A ^ B) -> (~A ^ B) - // (B & A) | (B ^ ~A) -> (~A ^ B) - // The match order is important: match the xor first because the 'not' - // operation defines 'A'. We do not need to match the xor as Op0 because the - // xor was canonicalized to Op1 above. - if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && - match(Op0, m_c_And(m_Specific(A), m_Specific(B)))) - return BinaryOperator::CreateXor(Builder.CreateNot(A), B); - if (SwappedForXor) std::swap(Op0, Op1); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 60d1cde971dd..a8faaecb5c34 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1814,9 +1814,21 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType())); Value *CmpQ = Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType())); - auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And - : Instruction::Or; - return BinaryOperator::Create(LogicOpc, CmpP, CmpQ); + auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, CmpP, CmpQ); + } + + // Are we using xors to bitwise check for a pair of (in)equalities? Convert to + // a shorter form that has more potential to be folded even further. + Value *X1, *X2, *X3, *X4; + if (match(Or->getOperand(0), m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) && + match(Or->getOperand(1), m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) { + // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4) + // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4) + Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2); + Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4); + auto BOpc = Pred == CmpInst::ICMP_EQ ? 
Instruction::And : Instruction::Or; + return BinaryOperator::Create(BOpc, Cmp12, Cmp34); } return nullptr; @@ -3737,6 +3749,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, const APInt &CVal = CI->getValue(); if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) return nullptr; + } else { + // In this case we could have the operand of the binary operation + // being defined in another block, and performing the replacement + // could break the dominance relation. + return nullptr; } } else { // Other uses prohibit this transformation. @@ -3856,18 +3873,17 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, } else if (BinaryOperator *BO = dyn_cast(U)) { assert(BO->getOpcode() == Instruction::And); // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) - Value *ShortMask = - Builder.CreateTrunc(BO->getOperand(1), Builder.getIntNTy(MulWidth)); + ConstantInt *CI = cast(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask); - Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType()); - if (auto *ZextI = dyn_cast(Zext)) - IC.Worklist.Add(ZextI); + Instruction *Zext = + cast(Builder.CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); IC.replaceInstUsesWith(*BO, Zext); } else { llvm_unreachable("Unexpected Binary operation"); } - if (auto *UI = dyn_cast(U)) - IC.Worklist.Add(UI); + IC.Worklist.Add(cast(U)); } } if (isa(OtherVal)) diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index c59e1ce69ac2..451036545741 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -998,8 +998,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // that this code is not reachable. We do this instead of inserting // an unreachable instruction directly because we cannot modify the // CFG. - new StoreInst(UndefValue::get(LI.getType()), - Constant::getNullValue(Op->getType()), &LI); + StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()), + Constant::getNullValue(Op->getType()), &LI); + SI->setDebugLoc(LI.getDebugLoc()); return replaceInstUsesWith(LI, UndefValue::get(LI.getType())); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 5689c0604239..a20f474cbf40 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -417,8 +417,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // the highest demanded bit, we just return the other side. if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) return I->getOperand(0); - // We can't do this with the LHS for subtraction. - if (I->getOpcode() == Instruction::Add && + // We can't do this with the LHS for subtraction, unless we are only + // demanding the LSB. 
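+      // (In bit 0, add and sub coincide: both produce LHS[0] ^ RHS[0], since
+      // there is no carry or borrow into the LSB, so a known-zero LHS
+      // contributes nothing to the demanded bit.)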
+ if ((I->getOpcode() == Instruction::Add || + DemandedFromOps.isOneValue()) && DemandedFromOps.isSubsetOf(LHSKnown.Zero)) return I->getOperand(1); } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 90e232399155..c7766568fd9d 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -636,17 +636,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I)); + // Do "A op C" and "B op C" both simplify? - if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - C = Builder.CreateBinOp(InnerOpcode, L, R); - C->takeName(&I); - return C; - } + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + C = Builder.CreateBinOp(InnerOpcode, L, R); + C->takeName(&I); + return C; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "B op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, B, C); + C->takeName(&I); + return C; + } + + // Does "B op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op C". + ++NumExpand; + C = Builder.CreateBinOp(TopLevelOpcode, A, C); + C->takeName(&I); + return C; + } } if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) { @@ -655,17 +673,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op' + Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I)); + Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I)); + // Do "A op B" and "A op C" both simplify? - if (Value *L = - SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I))) - if (Value *R = - SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I))) { - // They do! Return "L op' R". - ++NumExpand; - A = Builder.CreateBinOp(InnerOpcode, L, R); - A->takeName(&I); - return A; - } + if (L && R) { + // They do! Return "L op' R". + ++NumExpand; + A = Builder.CreateBinOp(InnerOpcode, L, R); + A->takeName(&I); + return A; + } + + // Does "A op B" simplify to the identity value for the inner opcode? + if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) { + // They do! Return "A op C". + ++NumExpand; + A = Builder.CreateBinOp(TopLevelOpcode, A, C); + A->takeName(&I); + return A; + } + + // Does "A op C" simplify to the identity value for the inner opcode? + if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) { + // They do! Return "A op B". 
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 184940b7ea58..057f746e052d 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -22,9 +22,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
@@ -43,6 +45,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
@@ -192,6 +195,11 @@ static cl::opt<int> ClMaxInlinePoisoningSize(
 static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
                                       cl::desc("Check stack-use-after-return"),
                                       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
+                                        cl::desc("Create redzones for byval "
+                                                 "arguments (extra copy "
+                                                 "required)"), cl::Hidden,
+                                        cl::init(true));
 static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
                                      cl::desc("Check stack-use-after-scope"),
                                      cl::Hidden, cl::init(false));
@@ -747,6 +755,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   bool runOnFunction() {
     if (!ClStack) return false;
+
+    if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas();
+
     // Collect alloca, ret, lifetime instructions etc.
     for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
@@ -763,6 +774,11 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     return true;
   }
 
+  // Arguments marked with the "byval" attribute are implicitly copied without
+  // using an alloca instruction. To produce redzones for those arguments, we
+  // copy them a second time into memory allocated with an alloca instruction.
+  void copyArgsPassedByValToAllocas();
+
   // Finds all Alloca instructions and puts
   // poisoned red zones around all of them.
   // Then unpoison everything back before the function returns.
@@ -2528,6 +2544,28 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) {
   llvm_unreachable("impossible LocalStackSize");
 }
 
+void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
+  BasicBlock &FirstBB = *F.begin();
+  IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt());
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr()) {
+      Type *Ty = Arg.getType()->getPointerElementType();
+      unsigned Align = Arg.getParamAlignment();
+      if (Align == 0) Align = DL.getABITypeAlignment(Ty);
+
+      const std::string &Name = Arg.hasName() ? Arg.getName().str() :
+          "Arg" + llvm::to_string(Arg.getArgNo());
+      AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval");
+      AI->setAlignment(Align);
+      Arg.replaceAllUsesWith(AI);
+
+      uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+      IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
+    }
+  }
+}
+
 PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
                                           Value *ValueIfTrue,
                                           Instruction *ThenTerm,
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 1348e0ed0ed0..b7c6271869cd 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3039,7 +3039,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
   }
 
   void visitVAStartInst(VAStartInst &I) override {
-    if (F.getCallingConv() == CallingConv::X86_64_Win64)
+    if (F.getCallingConv() == CallingConv::Win64)
       return;
     IRBuilder<> IRB(&I);
     VAStartInstrumentationList.push_back(&I);
@@ -3053,7 +3053,7 @@
   }
 
   void visitVACopyInst(VACopyInst &I) override {
-    if (F.getCallingConv() == CallingConv::X86_64_Win64)
+    if (F.getCallingConv() == CallingConv::Win64)
      return;
     IRBuilder<> IRB(&I);
     Value *VAListTag = I.getArgOperand(0);
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index e3c36c98ab0d..06fe07598374 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -281,6 +281,16 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   SanCovTraceSwitchFunction =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
           SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
+
+  // Make sure smaller parameters are zero-extended to i64 as required by the
+  // x86_64 ABI.
+  if (TargetTriple.getArch() == Triple::x86_64) {
+    for (int i = 0; i < 3; i++) {
+      SanCovTraceCmpFunction[i]->addParamAttr(0, Attribute::ZExt);
+      SanCovTraceCmpFunction[i]->addParamAttr(1, Attribute::ZExt);
+    }
+    SanCovTraceDivFunction[0]->addParamAttr(0, Attribute::ZExt);
+  }
+
   // We insert an empty inline asm after cov callbacks to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
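[Editor's note: the AddressSanitizer hunk above exists because a byval argument arrives in a caller-made copy that the pass cannot wrap in redzones. A hedged source-level sketch of what the extra copy buys, with a hypothetical struct and function name:

    #include <cstring>

    struct S { char buf[8]; };

    // Conceptually, with -asan-redzone-byval-args the entry of
    //   void callee(S s) { ... }
    // behaves as if it began like this:
    void callee(S s) {
      S s_byval;                             // fresh alloca -> gets redzones
      std::memcpy(&s_byval, &s, sizeof(S));  // the IRB.CreateMemCpy above
      // Every later use of 's' is redirected to 's_byval', so an
      // out-of-bounds access such as s.buf[8] now lands in a redzone.
      (void)s_byval;
    }
]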
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 7fd77a082b82..c5c9b2c185d6 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -562,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
   if (!MSSA)
     return false;
 
+  // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+  // read/write memory, then we can safely return true here.
+  // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+  // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+  // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+  // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+  // with the default optimization pipeline.
+  auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+  if (!EarlierMA)
+    return true;
+  auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+  if (!LaterMA)
+    return true;
+
   // Since we know LaterDef dominates LaterInst and EarlierInst dominates
   // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
   // EarlierInst and LaterInst and neither can any other write that potentially
   // clobbers LaterInst.
   MemoryAccess *LaterDef =
       MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
-  return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+  return MSSA->dominates(LaterDef, EarlierMA);
 }
 
 bool EarlyCSE::processNode(DomTreeNode *Node) {
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 0fe72f3f7331..ea28705e684d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -1168,6 +1168,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
                                  LI->isVolatile(), LI->getAlignment(),
                                  LI->getOrdering(), LI->getSyncScopeID(),
                                  UnavailablePred->getTerminator());
+    NewLoad->setDebugLoc(LI->getDebugLoc());
 
     // Transfer the old load's AA tags to the new load.
     AAMDNodes Tags;
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index a40c22c3fce9..99b4458ea0fa 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -805,6 +805,25 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
   ConstantInt *One = ConstantInt::get(IndVarTy, 1);
   // TODO: generalize the predicates here to also match their unsigned variants.
   if (IsIncreasing) {
+    bool DecreasedRightValueByOne = false;
+    // Try to turn eq/ne predicates to those we can work with.
+    if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+      // while (++i != len) {     while (++i < len) {
+      //   ...                --->  ...
+      // }                        }
+      Pred = ICmpInst::ICMP_SLT;
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+             !CanBeSMin(SE, RightSCEV)) {
+      // while (true) {           while (true) {
+      //   if (++i == len)    --->  if (++i > len - 1)
+      //     break;                   break;
+      //   ...                      ...
+      // }                        }
+      Pred = ICmpInst::ICMP_SGT;
+      RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+      DecreasedRightValueByOne = true;
+    }
+
     bool FoundExpectedPred =
         (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
         (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
@@ -829,16 +848,41 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
         return None;
       }
 
-      IRBuilder<> B(Preheader->getTerminator());
-      RightValue = B.CreateAdd(RightValue, One);
+      // We need to increase the right value unless we have already decreased
+      // it virtually when we replaced EQ with SGT.
+      if (!DecreasedRightValueByOne) {
+        IRBuilder<> B(Preheader->getTerminator());
+        RightValue = B.CreateAdd(RightValue, One);
+      }
     } else {
       if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
                                        RightSCEV)) {
         FailureReason = "Induction variable start not bounded by upper limit";
         return None;
       }
+      assert(!DecreasedRightValueByOne &&
+             "Right value can be decreased only for LatchBrExitIdx == 0!");
     }
   } else {
+    bool IncreasedRightValueByOne = false;
+    // Try to turn eq/ne predicates to those we can work with.
+    if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+      // while (--i != len) {     while (--i > len) {
+      //   ...                --->  ...
+      // }                        }
+      Pred = ICmpInst::ICMP_SGT;
+    else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+             !CanBeSMax(SE, RightSCEV)) {
+      // while (true) {           while (true) {
+      //   if (--i == len)    --->  if (--i < len + 1)
+      //     break;                   break;
+      //   ...                      ...
+      // }                        }
+      Pred = ICmpInst::ICMP_SLT;
+      RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+      IncreasedRightValueByOne = true;
+    }
+
     bool FoundExpectedPred =
         (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
         (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
@@ -863,14 +907,20 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
         return None;
       }
 
-      IRBuilder<> B(Preheader->getTerminator());
-      RightValue = B.CreateSub(RightValue, One);
+      // We need to decrease the right value unless we have already increased
+      // it virtually when we replaced EQ with SLT.
+      if (!IncreasedRightValueByOne) {
+        IRBuilder<> B(Preheader->getTerminator());
+        RightValue = B.CreateSub(RightValue, One);
+      }
     } else {
       if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
                                        RightSCEV)) {
         FailureReason = "Induction variable start not bounded by lower limit";
         return None;
       }
+      assert(!IncreasedRightValueByOne &&
+             "Right value can be increased only for LatchBrExitIdx == 0!");
     }
   }
@@ -922,14 +972,18 @@ LoopConstrainer::calculateSubRanges() const {
   bool Increasing = MainLoopStructure.IndVarIncreasing;
 
-  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
-  // range of values the induction variable takes.
+  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+  // [Smallest, GreatestSeen] is the range of values the induction variable
+  // takes.
 
-  const SCEV *Smallest = nullptr, *Greatest = nullptr;
+  const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
 
+  const SCEV *One = SE.getOne(Ty);
   if (Increasing) {
     Smallest = Start;
     Greatest = End;
+    // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+    GreatestSeen = SE.getMinusSCEV(End, One);
   } else {
     // These two computations may sign-overflow.  Here is why that is okay:
     //
@@ -947,9 +1001,9 @@ LoopConstrainer::calculateSubRanges() const {
     // will be an empty range.  Returning an empty range is always safe.
     //
 
-    const SCEV *One = SE.getOne(Ty);
     Smallest = SE.getAddExpr(End, One);
     Greatest = SE.getAddExpr(Start, One);
+    GreatestSeen = Start;
   }
 
   auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
@@ -964,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const {
   Result.LowLimit = Clamp(Range.getBegin());
 
   bool ProvablyNoPostLoop =
-      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+      SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd());
   if (!ProvablyNoPostLoop)
     Result.HighLimit = Clamp(Range.getEnd());
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ee3de51b1360..4056cc5cb346 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -2168,11 +2168,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
   return false;
 }
 
-/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// same BB in the form
 /// bb:
 ///   %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-///   %s = select p, trueval, falseval
+///   %s = select %p, trueval, falseval
 ///
+/// or
+///
+/// bb:
+///   %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+///   %c = cmp %p, 0
+///   %s = select %c, trueval, falseval
+///
 /// And expand the select into a branch structure. This later enables
 /// jump-threading over bb in this pass.
 ///
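[Editor's note: a scalar model of the unfolding described in the comment above (hypothetical helpers; the real pass works on IR, not C++). Turning the select into explicit control flow preserves the value while exposing a branch that can then be threaded:

    #include <cassert>

    static int withSelect(int p, int tval, int fval) {
      return (p == 0) ? tval : fval;   // PHI feeding a cmp feeding a select
    }
    static int unfolded(int p, int tval, int fval) {
      if (p == 0)                      // after SplitBlockAndInsertIfThen:
        return tval;                   // a predecessor that knows p == 0
      return fval;                     // can jump straight to its arm
    }
    int main() {
      for (int p : {0, 1})
        assert(withSelect(p, 7, 9) == unfolded(p, 7, 9));
      return 0;
    }
]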
@@ -2186,44 +2194,54 @@
 bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
   if (LoopHeaders.count(BB))
     return false;
 
-  // Look for a Phi/Select pair in the same basic block.  The Phi feeds the
-  // condition of the Select and at least one of the incoming values is a
-  // constant.
   for (BasicBlock::iterator BI = BB->begin();
        PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
-    unsigned NumPHIValues = PN->getNumIncomingValues();
-    if (NumPHIValues == 0 || !PN->hasOneUse())
+    // Look for a Phi having at least one constant incoming value.
+    if (llvm::all_of(PN->incoming_values(),
+                     [](Value *V) { return !isa<ConstantInt>(V); }))
       continue;
-    SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
-    if (!SI || SI->getParent() != BB)
-      continue;
-
-    Value *Cond = SI->getCondition();
-    if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
-      continue;
-
-    bool HasConst = false;
-    for (unsigned i = 0; i != NumPHIValues; ++i) {
-      if (PN->getIncomingBlock(i) == BB)
+    auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+      // Check if SI is in BB and uses V as its condition.
+      if (SI->getParent() != BB)
         return false;
-      if (isa<ConstantInt>(PN->getIncomingValue(i)))
-        HasConst = true;
+      Value *Cond = SI->getCondition();
+      return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+    };
+
+    SelectInst *SI = nullptr;
+    for (Use &U : PN->uses()) {
+      if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+        // Look for an ICmp in BB that compares PN with a constant and is the
+        // condition of a Select.
+        if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+            isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+          if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+            if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+              SI = SelectI;
+              break;
+            }
+      } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+        // Look for a Select in BB that uses PN as its condition.
+        if (isUnfoldCandidate(SelectI, U.get())) {
+          SI = SelectI;
+          break;
+        }
+      }
     }
 
-    if (HasConst) {
-      // Expand the select.
-      TerminatorInst *Term =
-          SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
-      PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
-      NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
-      NewPN->addIncoming(SI->getFalseValue(), BB);
-      SI->replaceAllUsesWith(NewPN);
-      SI->eraseFromParent();
-      return true;
-    }
+    if (!SI)
+      continue;
+
+    // Expand the select.
+    TerminatorInst *Term =
+        SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+    PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+    NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+    NewPN->addIncoming(SI->getFalseValue(), BB);
+    SI->replaceAllUsesWith(NewPN);
+    SI->eraseFromParent();
+    return true;
   }
-
   return false;
 }
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 606136dc31a4..2e0d8e0374c0 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
 class LoopInterchangeLegality {
 public:
   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
-                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
+                          OptimizationRemarkEmitter *ORE)
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
-        PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+        PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
 
   /// Check if the loops can be interchanged.
   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -353,6 +355,8 @@ class LoopInterchangeLegality {
   LoopInfo *LI;
   DominatorTree *DT;
   bool PreserveLCSSA;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
 
   bool InnerLoopHasReduction;
 };
@@ -361,8 +365,9 @@ class LoopInterchangeLegality {
 /// loop.
 class LoopInterchangeProfitability {
 public:
-  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
-      : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+                               OptimizationRemarkEmitter *ORE)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
 
   /// Check if the loop interchange is profitable.
   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -376,6 +381,8 @@ class LoopInterchangeProfitability {
 
   /// Scev analysis.
   ScalarEvolution *SE;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
 };
 
 /// LoopInterchangeTransform interchanges the loop.
@@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass {
   DependenceInfo *DI;
   DominatorTree *DT;
   bool PreserveLCSSA;
+  /// Interface to emit optimization remarks.
+  OptimizationRemarkEmitter *ORE;
+
   LoopInterchange()
       : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
   }
@@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass {
     AU.addRequired<DependenceAnalysisWrapperPass>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
@@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass {
     DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
     DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
 
     // Build up a worklist of loop pairs to analyze.
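[Editor's note: the hunks below report each interchange decision through the newly threaded OptimizationRemarkEmitter. For orientation, this is the shape of transformation the "Loop interchanged with enclosing loop." remark describes; an illustrative C++ sketch, not code from the pass:

    enum { N = 1024 };
    static int A[N][N], B[N][N];

    // Before: the inner loop varies the row index, so consecutive
    // iterations touch a different cache line of the row-major arrays.
    void before() {
      for (int j = 0; j < N; ++j)
        for (int i = 0; i < N; ++i)
          A[i][j] += B[i][j];
    }
    // After interchange: the inner loop varies the column index, so
    // accesses are contiguous and cache reuse improves.
    void after() {
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
          A[i][j] += B[i][j];
    }
]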
@@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass {
     Loop *OuterLoop = LoopList[OuterLoopId];
 
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
-                                PreserveLCSSA);
+                                PreserveLCSSA, ORE);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
       return false;
     }
     DEBUG(dbgs() << "Loops are legal to interchange\n");
-    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       DEBUG(dbgs() << "Interchanging loops not profitable\n");
       return false;
     }
 
+    ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
+                                 InnerLoop->getStartLoc(),
+                                 InnerLoop->getHeader())
+              << "Loop interchanged with enclosing loop.");
+
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
                                  LoopNestExit, LIL.hasInnerLoopReduction());
     LIT.transform();
@@ -760,6 +777,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedPHIInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with induction or reduction PHI nodes can be"
+                 " interchanged currently.");
     return true;
   }
 
@@ -767,6 +790,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
                  << "Failed to interchange due to current limitation\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "MultiInductionInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with 1 induction variable can be "
+                 "interchanged currently.");
     return true;
   }
   if (Reductions.size() > 0)
@@ -777,6 +806,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedPHIOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with induction or reduction PHI nodes can be"
+                 " interchanged currently.");
     return true;
   }
 
@@ -785,18 +820,35 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!Reductions.empty()) {
     DEBUG(dbgs() << "Outer loops with reductions are not supported "
                  << "currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "ReductionsOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Outer loops with reductions cannot be interchanged "
+                 "currently.");
     return true;
   }
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
                  << "supported currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "MultiInductionOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with 1 induction variable can be "
+                 "interchanged currently.");
     return true;
   }
 
   // TODO: Triangular loops are not handled for now.
   if (!isLoopStructureUnderstood(InnerInductionVar)) {
     DEBUG(dbgs() << "Loop structure not understood by pass\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "UnsupportedStructureInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Inner loop structure not understood currently.");
     return true;
   }
 
@@ -805,12 +857,24 @@ bool LoopInterchangeLegality::currentLimitations() {
       getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoLCSSAPHIOuter",
+                                       OuterLoop->getStartLoc(),
+                                       OuterLoop->getHeader())
+              << "Only outer loops with LCSSA PHIs can be interchanged "
+                 "currently.");
     return true;
   }
 
   LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoLCSSAPHIOuterInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Only inner loops with LCSSA PHIs can be interchanged "
+                 "currently.");
     return true;
   }
 
@@ -835,6 +899,11 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!InnerIndexVarInc) {
     DEBUG(dbgs() << "Did not find an instruction to increment the induction "
                  << "variable.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoIncrementInInner",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "The inner loop does not increment the induction variable.");
     return true;
   }
 
@@ -852,6 +921,12 @@ bool LoopInterchangeLegality::currentLimitations() {
       if (!I.isIdenticalTo(InnerIndexVarInc)) {
         DEBUG(dbgs() << "Found unsupported instructions between induction "
                      << "variable increment and branch.\n");
+        ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                           "UnsupportedInsBetweenInduction",
+                                           InnerLoop->getStartLoc(),
+                                           InnerLoop->getHeader())
+                  << "Found unsupported instruction between induction variable "
+                     "increment and branch.");
         return true;
       }
 
@@ -862,6 +937,11 @@ bool LoopInterchangeLegality::currentLimitations() {
   // current limitation.
   if (!FoundInduction) {
     DEBUG(dbgs() << "Did not find the induction variable.\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NoInductionVariable",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Did not find the induction variable.");
     return true;
   }
   return false;
@@ -875,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
     DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
                  << " and OuterLoopId = " << OuterLoopId
                  << " due to dependence\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "Dependence",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Cannot interchange loops due to dependences.");
     return false;
   }
 
@@ -910,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
   // Check if the loops are tightly nested.
   if (!tightlyNested(OuterLoop, InnerLoop)) {
     DEBUG(dbgs() << "Loops not tightly nested\n");
+    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                       "NotTightlyNested",
+                                       InnerLoop->getStartLoc(),
+                                       InnerLoop->getHeader())
+              << "Cannot interchange loops because they are not tightly "
+                 "nested.");
     return false;
   }
 
@@ -1005,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
 
   // It is not profitable as per current cache profitability model. But check if
   // we can move this loop outside to improve parallelism.
-  bool ImprovesPar =
-      isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
-  return ImprovesPar;
+  if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+    return true;
+
+  ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+                                     "InterchangeNotProfitable",
+                                     InnerLoop->getStartLoc(),
+                                     InnerLoop->getHeader())
+            << "Interchanging loops is too costly (cost="
+            << ore::NV("Cost", Cost) << ", threshold="
+            << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
+            ") and it does not improve parallelism.");
+  return false;
 }
 
 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
@@ -1291,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
                     "Interchanges loops for cache reuse", false, false)
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 9397b87cdf56..90c5c243f464 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -68,6 +68,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
@@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
   // Because of PR962, we don't TRE dynamic allocas.
-  for (auto &BB : F) {
-    for (auto &I : BB) {
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
-        if (!AI->isStaticAlloca())
-          return false;
-      }
-    }
-  }
-
-  return true;
+  return llvm::all_of(instructions(F), [](Instruction &I) {
+    auto *AI = dyn_cast<AllocaInst>(&I);
+    return !AI || AI->isStaticAlloca();
+  });
 }
 
 namespace {
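[Editor's note: the canTRE rewrite above is a standard LLVM modernization: a hand-written nested loop with an early return becomes a single llvm::all_of over instructions(F). The same early-exit-preserving idiom in portable C++, with a hypothetical predicate, only to illustrate the equivalence:

    #include <algorithm>
    #include <vector>

    // std::all_of stops at the first element failing the predicate, just
    // as the replaced loop returned false at the first dynamic alloca.
    static bool allAllocasStatic(const std::vector<bool> &isStaticAlloca) {
      return std::all_of(isStaticAlloca.begin(), isStaticAlloca.end(),
                         [](bool isStatic) { return isStatic; });
    }
]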
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 5170c68e2915..d43ce7abb7cd 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -736,7 +737,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
   // remainder are connected to the original Loop's exit blocks. The remaining
   // work is to update the phi nodes in the original loop, and take in the
   // values from the cloned region. Also update the dominator info for
-  // OtherExits, since we have new edges into OtherExits.
+  // OtherExits and their immediate successors, since we have new edges into
+  // OtherExits.
+  SmallSet<BasicBlock *, 8> ImmediateSuccessorsOfExitBlocks;
   for (auto *BB : OtherExits) {
     for (auto &II : *BB) {
@@ -759,12 +762,35 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
             cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
       }
     }
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+    for (BasicBlock *SuccBB : successors(BB)) {
+      assert(!(any_of(OtherExits,
+                      [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
+               SuccBB == LatchExit) &&
+             "Breaks the definition of dedicated exits!");
+    }
+#endif
     // Update the dominator info because the immediate dominator is no longer the
     // header of the original Loop. BB has edges both from L and remainder code.
     // Since the preheader determines which loop is run (L or directly jump to
     // the remainder code), we set the immediate dominator as the preheader.
-    if (DT)
+    if (DT) {
       DT->changeImmediateDominator(BB, PreHeader);
+      // Also update the IDom for immediate successors of BB. If the current
+      // IDom is the header, update the IDom to be the preheader because that is
+      // the nearest common dominator of all predecessors of SuccBB. We need to
+      // check for IDom being the header because successors of exit blocks can
+      // have edges from outside the loop, and we should not incorrectly update
+      // the IDom in that case.
+      for (BasicBlock *SuccBB : successors(BB))
+        if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) {
+          if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) {
+            assert(!SuccBB->getSinglePredecessor() &&
+                   "BB should be the IDom then!");
+            DT->changeImmediateDominator(SuccBB, PreHeader);
+          }
+        }
+    }
   }
 
   // Loop structure should be the following:
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb82ee283d44..012b10c8a9b0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -574,11 +574,9 @@ class InnerLoopVectorizer {
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(Loop *NewLoop);
 
-  /// Emit a bypass check to see if the trip count would overflow, or we
-  /// wouldn't have enough iterations to execute one vector loop.
+  /// Emit a bypass check to see if the vector trip count is zero, including if
+  /// it overflows.
   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
-  /// Emit a bypass check to see if the vector trip count is nonzero.
-  void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
   /// Emit a bypass check to see if all of the SCEV assumptions we've
   /// had to make are correct.
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
@@ -3289,37 +3287,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   BasicBlock *BB = L->getLoopPreheader();
   IRBuilder<> Builder(BB->getTerminator());
 
-  // Generate code to check that the loop's trip count that we computed by
-  // adding one to the backedge-taken count will not overflow.
-  Value *CheckMinIters = Builder.CreateICmpULT(
-      Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+  // Generate code to check if the loop's trip count is less than VF * UF, or
+  // equal to it in case a scalar epilogue is required; this implies that the
+  // vector trip count is zero. This check also covers the case where adding one
+  // to the backedge-taken count overflowed leading to an incorrect trip count
+  // of zero. In this case we will also jump to the scalar loop.
+  auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+                                           : ICmpInst::ICMP_ULT;
+  Value *CheckMinIters = Builder.CreateICmp(
+      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
 
-  BasicBlock *NewBB =
-      BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
-  // Update dominator tree immediately if the generated block is a
-  // LoopBypassBlock because SCEV expansions to generate loop bypass
-  // checks may query it before the current function is finished.
-  DT->addNewBlock(NewBB, BB);
-  if (L->getParentLoop())
-    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
-  ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
-  LoopBypassBlocks.push_back(BB);
-}
-
-void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
-                                                     BasicBlock *Bypass) {
-  Value *TC = getOrCreateVectorTripCount(L);
-  BasicBlock *BB = L->getLoopPreheader();
-  IRBuilder<> Builder(BB->getTerminator());
-
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
-                                    "cmp.zero");
-
-  // Generate code to check that the loop's trip count that we computed by
-  // adding one to the backedge-taken count will not overflow.
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
   // LoopBypassBlock because SCEV expansions to generate loop bypass
@@ -3328,7 +3305,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
   if (L->getParentLoop())
     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   ReplaceInstWithInst(BB->getTerminator(),
-                      BranchInst::Create(Bypass, NewBB, Cmp));
+                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
   LoopBypassBlocks.push_back(BB);
 }
 
@@ -3477,14 +3454,13 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
 
-  // We need to test whether the backedge-taken count is uint##_max. Adding one
-  // to it will cause overflow and an incorrect loop trip count in the vector
-  // body. In case of overflow we want to directly jump to the scalar remainder
-  // loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
   // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, ScalarPH);
+  // jump to the scalar loop. This check also covers the case where the
+  // backedge-taken count is uint##_max: adding one to it will overflow leading
+  // to an incorrect trip count of zero. In this (rare) case we will also jump
+  // to the scalar loop.
+  emitMinimumIterationCountCheck(Lp, ScalarPH);
+
   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
   emitSCEVChecks(Lp, ScalarPH);
@@ -3527,7 +3503,7 @@ void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       // We know what the end value is.
       EndValue = CountRoundDown;
     } else {
-      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
       Type *StepType = II.getStep()->getType();
       Instruction::CastOps CastOp =
           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
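[Editor's note: the merged minimum-iteration check above folds the old overflow check and the old zero-trip check into a single compare. A numeric model of the branch condition, with VF * UF fixed at 8 purely for illustration:

    #include <cassert>
    #include <cstdint>

    // Count is backedge-taken-count + 1, so it wraps to 0 when the BTC is
    // UINT32_MAX; 0 is below VF * UF, so the scalar path is taken, which
    // is exactly the overflow behavior the new comment describes.
    static bool takeScalarPath(uint32_t count, bool requiresScalarEpilogue) {
      const uint32_t VFxUF = 8;
      return requiresScalarEpilogue ? count <= VFxUF   // ICMP_ULE
                                    : count <  VFxUF;  // ICMP_ULT
    }
    int main() {
      assert(takeScalarPath(0, false));   // overflowed trip count
      assert(takeScalarPath(8, true));    // epilogue needs >= 1 scalar iter
      assert(!takeScalarPath(8, false));  // exactly one full vector trip
      return 0;
    }
]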
@@ -4168,7 +4144,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
     // To do so, we need to generate the 'identity' vector and override
     // one of the elements with the incoming scalar reduction. We need
     // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
 
     // This is the vector-clone of the value that leaves the loop.
     Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4425043ad39a..dcbcab459a6b 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -434,7 +434,7 @@ class BoUpSLP {
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
-  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+  Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
 
   /// \returns the scalarization cost for this type. Scalarization in this
   /// context means the creation of vectors from a group of scalars.
@@ -857,7 +857,7 @@ class BoUpSLP {
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
-    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
 
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1212,7 +1212,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Check that all of the users of the scalars that we want to vectorize are
   // schedulable.
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+  BasicBlock *BB = VL0->getParent();
 
   if (!DT->isReachableFromEntry(BB)) {
     // Don't go into unreachable blocks. They may contain instructions with
@@ -1237,7 +1237,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
   BlockScheduling &BS = *BSRef.get();
 
-  if (!BS.tryScheduleBundle(VL, this)) {
+  if (!BS.tryScheduleBundle(VL, this, VL0)) {
     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL[0]) ||
             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
@@ -2427,8 +2427,8 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   return Vec;
 }
 
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
-  if (const TreeEntry *En = getTreeEntry(VL[0])) {
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
+  if (const TreeEntry *En = getTreeEntry(OpValue)) {
     if (En->isSame(VL) && En->VectorizedValue)
       return En->VectorizedValue;
   }
@@ -2553,7 +2553,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       Value *InVec = vectorizeTree(INVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
        return V;
 
      CastInst *CI = dyn_cast<CastInst>(VL0);
@@ -2575,7 +2575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *L = vectorizeTree(LHSV);
       Value *R = vectorizeTree(RHSV);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
 
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
@@ -2604,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *True = vectorizeTree(TrueVec);
       Value *False = vectorizeTree(FalseVec);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
 
       Value *V = Builder.CreateSelect(Cond, True, False);
@@ -2644,7 +2644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       Value *LHS = vectorizeTree(LHSVL);
       Value *RHS = vectorizeTree(RHSVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
 
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
@@ -2806,7 +2806,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *LHS = vectorizeTree(LHSVL);
       Value *RHS = vectorizeTree(RHSVL);
 
-      if (Value *V = alreadyVectorized(E->Scalars))
+      if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
 
       // Create a vector of LHS op1 RHS
@@ -3097,8 +3097,8 @@ void BoUpSLP::optimizeGatherSequence() {
 // Groups the instructions to a bundle (which is then a single scheduling entity)
 // and schedules instructions until the bundle gets ready.
 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
-                                                 BoUpSLP *SLP) {
-  if (isa<PHINode>(VL[0]))
+                                                 BoUpSLP *SLP, Value *OpValue) {
+  if (isa<PHINode>(OpValue))
     return true;
 
   // Initialize the instruction bundle.
@@ -3106,7 +3106,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   ScheduleData *PrevInBundle = nullptr;
   ScheduleData *Bundle = nullptr;
   bool ReSchedule = false;
-  DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+  DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
 
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
@@ -3177,7 +3177,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
     }
   }
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, VL[0]);
+    cancelScheduling(VL, OpValue);
     return false;
   }
   return true;
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 3e3eff39d637..f475878e2f2a 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -30,7 +30,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
   list(INSERT CMAKE_MODULE_PATH 0
     "${CMAKE_CURRENT_SOURCE_DIR}/../cmake"
     "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules"
-    "${LLVM_BINARY_DIR}/lib/cmake/llvm"
+    "${LLVM_LIBRARY_DIR}/cmake/llvm"
   )
 
   # Some of the runtimes will conditionally use the compiler-rt sanitizers
@@ -123,7 +123,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
       set(LLVM_RUNTIME_OUTPUT_INTDIR "${LLVM_TOOLS_BINARY_DIR}/${LLVM_RUNTIMES_TARGET}")
     endif()
   endif()
-
+
   # Between each sub-project we want to cache and clear the LIT properties
   set_property(GLOBAL PROPERTY LLVM_LIT_TESTSUITES)
   set_property(GLOBAL PROPERTY LLVM_LIT_PARAMS)
@@ -154,7 +154,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
   if(LLVM_INCLUDE_TESTS)
     # Add a global check rule now that all subdirectories have been traversed
     # and we know the total set of lit testsuites.
-
+
     add_lit_target(check-runtimes
       "Running all regression tests"
       ${RUNTIMES_LIT_TESTSUITES}
@@ -331,6 +337,7 @@ else() # if this is included from LLVM's CMake
       # Builtins were built separately above
       CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off
                  -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS}
+                 -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR}
                  -DCMAKE_C_COMPILER_TARGET=${target}
                  -DCMAKE_CXX_COMPILER_TARGET=${target}
                  -DCMAKE_ASM_COMPILER_TARGET=${target}
@@ -339,6 +340,7 @@ else() # if this is included from LLVM's CMake
                  -DCMAKE_ASM_COMPILER_WORKS=ON
                  -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON
                  ${${target}_extra_args}
+      TOOLCHAIN_TOOLS clang lld llvm-ar llvm-ranlib
       PASSTHROUGH_PREFIXES ${prefixes}
       EXTRA_TARGETS ${${target}_extra_targets}
                     ${${target}_test_targets}
@@ -376,6 +378,7 @@ else() # if this is included from LLVM's CMake
     # Builtins were built separately above
     CMAKE_ARGS -DCOMPILER_RT_BUILD_BUILTINS=Off
               -DLLVM_INCLUDE_TESTS=${LLVM_INCLUDE_TESTS}
+              -DLLVM_LIBRARY_DIR=${LLVM_LIBRARY_DIR}
     PASSTHROUGH_PREFIXES ${prefixes}
     EXTRA_TARGETS ${extra_targets} ${test_targets}
diff --git a/test/Analysis/CostModel/SystemZ/fp-arith.ll b/test/Analysis/CostModel/SystemZ/fp-arith.ll
index 08a7c291138f..5f92db1ababf 100644
--- a/test/Analysis/CostModel/SystemZ/fp-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-arith.ll
@@ -1,4 +1,7 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z13 %s
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z14 \
+; RUN:  | FileCheck -check-prefix=CHECK -check-prefix=CHECK-Z14 %s
 ;
 ; Note: The scalarized vector instructions cost is not including any
 ; extracts, due to the undef operands
@@ -21,13 +24,17 @@ define void @fadd() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fadd float undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fadd double undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fadd fp128 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fadd <2 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fadd <2 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fadd <2 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fadd <4 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fadd <4 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fadd <4 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fadd <8 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fadd <8 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fadd <8 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fadd <16 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fadd <16 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fadd <16 x double> undef, undef
 
   ret void;
@@ -49,13 +56,17 @@ define void @fsub() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fsub float undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fsub double undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fsub fp128 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fsub <2 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fsub <2 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fsub <2 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fsub <4 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fsub <4 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fsub <4 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fsub <8 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fsub <8 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fsub <8 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fsub <16 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fsub <16 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fsub <16 x double> undef, undef
 
   ret void;
@@ -77,13 +88,17 @@ define void @fmul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fmul float undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fmul double undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fmul fp128 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fmul <2 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fmul <2 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fmul <2 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fmul <4 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fmul <4 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fmul <4 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fmul <8 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fmul <8 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fmul <8 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fmul <16 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fmul <16 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fmul <16 x double> undef, undef
 
   ret void;
@@ -105,13 +120,17 @@ define void @fdiv() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res0 = fdiv float undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = fdiv double undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = fdiv fp128 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res3 = fdiv <2 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res3 = fdiv <2 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = fdiv <2 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 8 for instruction: %res5 = fdiv <4 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 1 for instruction: %res5 = fdiv <4 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res6 = fdiv <4 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 16 for instruction: %res7 = fdiv <8 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 2 for instruction: %res7 = fdiv <8 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %res8 = fdiv <8 x double> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef
+; CHECK-Z13: Cost Model: Found an estimated cost of 32 for instruction: %res9 = fdiv <16 x float> undef, undef
+; CHECK-Z14: Cost Model: Found an estimated cost of 4 for instruction: %res9 = fdiv <16 x float> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %res10 = fdiv <16 x double> undef, undef
 
   ret void;
diff --git a/test/Assembler/diimportedentity.ll b/test/Assembler/diimportedentity.ll
index bc85ca09f658..6a0e1eb931c1 100644
--- a/test/Assembler/diimportedentity.ll
+++ b/test/Assembler/diimportedentity.ll
@@ -18,9 +18,9 @@
 ; CHECK: !3 = !DICompositeType({{.*}})
 !3 = !DICompositeType(tag: DW_TAG_structure_type, name: "Class", size: 32, align: 32)
 
-; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, line: 7)
+; CHECK-NEXT: !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0, entity: !1, file: !2, line: 7)
 !4 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0,
-                       entity: !1, line: 7)
+                       entity: !1, file: !2, line: 7)
 
 ; CHECK-NEXT: !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0)
 !5 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !0)
diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll
index 31c3fda1b00a..3cf082472829 100644
--- a/test/Bitcode/DIGlobalVariableExpression.ll
+++ b/test/Bitcode/DIGlobalVariableExpression.ll
@@ -36,4 +36,4 @@
 !9 = !{!"clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)"}
 !10 = distinct !DIGlobalVariable(name: "c", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !DIExpression(DW_OP_constu, 23, DW_OP_stack_value))
 !11 = distinct !DIGlobalVariable(name: "h", scope: !1, file: !2, line: 2, type: !5, isLocal: false, isDefinition: true)
-!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 1, scope: !1, entity: !11)
+!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !2, line: 1, scope: !1, entity: !11)
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index cf6c30e7c26c..e9313dfba870 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -368,9 +368,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.x86_64_win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index 180dad258b68..82fc99055357 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -368,9 +368,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.x86_64_win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 370c7f51a2b7..2e70a380d10e 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -393,9 +393,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.x86_64_win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 4115cbd8fe64..7c84daa7d3c4 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -422,9 +422,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.x86_64_win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
index eef925564ecb..9e34d48c95f7 100644
--- a/test/Bitcode/compatibility-4.0.ll
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -422,9 +422,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.x86_64_win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index ebd727ba9aee..7df1535a6923 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -425,9 +425,9 @@ declare cc78 void @f.cc78()
 declare x86_64_sysvcc void @f.x86_64_sysvcc()
 ; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
 declare cc79 void @f.cc79()
-; CHECK: declare x86_64_win64cc void @f.cc79()
-declare x86_64_win64cc void @f.x86_64_win64cc()
-; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare win64cc void @f.cc79()
+declare win64cc void @f.win64cc()
+; CHECK: declare win64cc void @f.win64cc()
 declare cc80 void @f.cc80()
 ; CHECK: declare x86_vectorcallcc void @f.cc80()
 declare x86_vectorcallcc void @f.x86_vectorcallcc()
diff --git a/test/Bitcode/upgrade-importedentity.ll b/test/Bitcode/upgrade-importedentity.ll
new file mode 100644
index 000000000000..134ccf1f3eaf
--- /dev/null
+++ b/test/Bitcode/upgrade-importedentity.ll
@@ -0,0 +1,15 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)", emissionKind: FullDebug, imports: !3)
+!1 = !DIFile(filename: "using.ii", directory: "/")
+!3 = !{!4}
+!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !5, entity: !8, line: 301)
+; CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !5)
+!5 = !DINamespace(name: "M", scope: null)
+!8 = !DINamespace(name: "N", scope: null)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/Bitcode/upgrade-importedentity.ll.bc b/test/Bitcode/upgrade-importedentity.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..7fa833b5046272bdf22ea4047a65d545b32b5cef
GIT binary patch
literal 1216
[1216-byte base85-encoded bitcode payload; mangled beyond recovery in this import and omitted]

literal 0
HcmV?d00001

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b52b6018e026..124f0c72fd75 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -48,6 +48,7 @@ set(LLVM_TEST_DEPENDS
           llvm-cvtres
           llvm-diff
           llvm-dis
+          llvm-dlltool
           llvm-dsymutil
           llvm-dwarfdump
           llvm-dwp
@@ -58,6 +59,7 @@ set(LLVM_TEST_DEPENDS
           llvm-mc
           llvm-mcmarkup
          llvm-modextract
+          llvm-mt
           llvm-nm
           llvm-objdump
           llvm-opt-report
@@ -65,6 +67,7 @@ set(LLVM_TEST_DEPENDS
           llvm-profdata
           llvm-ranlib
           llvm-readobj
+          llvm-readelf
           llvm-rtdyld
           llvm-size
           llvm-split
diff --git a/test/CodeGen/AArch64/GlobalISel/select-fma.mir b/test/CodeGen/AArch64/GlobalISel/select-fma.mir
new file mode 100644
index 000000000000..3b2f3746b587
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-fma.mir
@@ -0,0 +1,41 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @FMADDSrrr_fpr() { ret void }
+...
+
+---
+# CHECK-LABEL: name: FMADDSrrr_fpr
+name:            FMADDSrrr_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32, preferred-register: '' }
+# CHECK-NEXT:  - { id: 1, class: fpr32, preferred-register: '' }
+# CHECK-NEXT:  - { id: 2, class: fpr32, preferred-register: '' }
+# CHECK-NEXT:  - { id: 3, class: fpr32, preferred-register: '' }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = COPY %w2
+# CHECK:    %3 = FMADDSrrr %0, %1, %2
+body:             |
+  bb.0:
+    liveins: %w0, %w1, %w2
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = COPY %w2
+    %3(s32) = G_FMA %0, %1, %2
+    %x0 = COPY %3
+...
+
diff --git a/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
new file mode 100644
index 000000000000..2546e7c90ce5
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+define win64cc void @pass_va(i32 %count, ...) nounwind {
+entry:
+; CHECK: sub sp, sp, #80
+; CHECK: add x8, sp, #24
+; CHECK: add x0, sp, #24
+; CHECK: stp x6, x7, [sp, #64]
+; CHECK: stp x4, x5, [sp, #48]
+; CHECK: stp x2, x3, [sp, #32]
+; CHECK: str x1, [sp, #24]
+; CHECK: stp x30, x8, [sp]
+; CHECK: bl other_func
+; CHECK: ldr x30, [sp], #80
+; CHECK: ret
+  %ap = alloca i8*, align 8
+  %ap1 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap1)
+  %ap2 = load i8*, i8** %ap, align 8
+  call void @other_func(i8* %ap2)
+  ret void
+}
+
+declare void @other_func(i8*) local_unnamed_addr
+
+declare void @llvm.va_start(i8*) nounwind
+declare void @llvm.va_copy(i8*, i8*) nounwind
+
+; CHECK-LABEL: f9:
+; CHECK: sub sp, sp, #16
+; CHECK: add x8, sp, #24
+; CHECK: add x0, sp, #24
+; CHECK: str x8, [sp, #8]
+; CHECK: add sp, sp, #16
+; CHECK: ret
+define win64cc i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind {
+entry:
+  %ap = alloca i8*, align 8
+  %ap1 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap1)
+  %ap2 = load i8*, i8** %ap, align 8
+  ret i8* %ap2
+}
+
+; CHECK-LABEL: f8:
+; CHECK: sub sp, sp, #16
+; CHECK: add x8, sp, #16
+; CHECK: add x0, sp, #16
+; CHECK: str x8, [sp, #8]
+; CHECK: add sp, sp, #16
+; CHECK: ret
+define win64cc i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind {
+entry:
+  %ap = alloca i8*, align 8
+  %ap1 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap1)
+  %ap2 = load i8*, i8** %ap, align 8
+  ret i8* %ap2
+}
+
+; CHECK-LABEL: f7:
+; CHECK: sub sp, sp, #16
+; CHECK: add x8, sp, #8
+; CHECK: add x0, sp, #8
+; CHECK: stp x8, x7, [sp], #16
+; CHECK: ret
+define win64cc i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) nounwind {
+entry:
+  %ap = alloca i8*, align 8
+  %ap1 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap1)
+  %ap2 = load i8*, i8** %ap, align 8
+  ret i8* %ap2
+}
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 0a7965571480..64a6b9b6b210 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -11,9 +11,8 @@ define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
 ; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
 ; First vararg
 ; CHECK: ldr {{w[0-9]+}}, [sp, #72]
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
 ; Second vararg
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8
 ; Third vararg
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
 ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index b2ea9ad3b4a1..b844aab5628c 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -280,10 +280,10 @@ entry:
 define i32 @caller42() #3 {
 entry:
 ; CHECK-LABEL: caller42
-; CHECK: str {{x[0-9]+}}, [sp, #48]
-; CHECK: str {{q[0-9]+}}, [sp, #32]
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: str {{x[0-9]+}}, [sp, #48]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #32]
+; CHECK-DAG: str {{x[0-9]+}}, [sp, #16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK: add x1, sp, #32
 ; CHECK: mov x2, sp
 ; Space for s1 is allocated at sp+32
@@ -318,10 +318,10 @@ entry:
 ; CHECK-LABEL: caller42_stack
 ; CHECK: sub sp, sp, #112
 ; CHECK: add x29, sp, #96
-; CHECK: stur {{x[0-9]+}}, [x29, #-16]
-; CHECK: stur {{q[0-9]+}}, [x29, #-32]
-; CHECK: str {{x[0-9]+}}, [sp, #48]
-; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK-DAG: stur {{x[0-9]+}}, [x29, #-16]
+; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32]
+; CHECK-DAG: str {{x[0-9]+}}, [sp, #48]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #32]
 ; Space for s1 is allocated at x29-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; CHECK: add x[[B:[0-9]+]], sp, #32
@@ -388,10 +388,10 @@ entry:
 define i32 @caller43() #3 {
 entry:
 ; CHECK-LABEL: caller43
-; CHECK: str {{q[0-9]+}}, [sp, #48]
-; CHECK: str {{q[0-9]+}}, [sp, #32]
-; CHECK: str {{q[0-9]+}}, [sp, #16]
-; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #48]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #32]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK: add x1, sp, #32
 ; CHECK: mov x2, sp
 ; Space for s1 is allocated at sp+32
@@ -430,10 +430,10 @@ entry:
 ; CHECK-LABEL: caller43_stack
 ; CHECK: sub sp, sp, #112
 ; CHECK: add x29, sp, #96
-; CHECK: stur {{q[0-9]+}}, [x29, #-16]
-; CHECK: stur {{q[0-9]+}}, [x29, #-32]
-; CHECK: str {{q[0-9]+}}, [sp, #48]
-; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-16]
+; CHECK-DAG: stur {{q[0-9]+}}, [x29, #-32]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #48]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #32]
 ; Space for s1 is allocated at x29-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; CHECK: add x[[B:[0-9]+]], sp, #32
diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
index a3b740df9b4e..fdb379871048 100644
--- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
+++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -1,10 +1,8 @@
 ; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s
 
 ; CHECK: foo
-; CHECK: str w[[REG0:[0-9]+]], [x19, #264]
-; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]]
-; CHECK: str w[[REG1]], [x19, #132]
-
+; CHECK-DAG: str w[[REG0:[0-9]+]], [x19, #132]
+; CHECK-DAG: str w[[REG0]], [x19, #264]
 define i32 @foo(i32 %a) nounwind {
  %retval = alloca i32, align 4
  %a.addr = alloca i32, align 4
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index 990782cb69a0..c98bda0d01a0 100644
--- a/test/CodeGen/AArch64/arm64-extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s
 ; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index f849df2a51ec..848b87fd2cfb 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -261,3 +261,13 @@ define void @test_inline_modifier_a(i8* %ptr) nounwind {
 ; CHECK: prfm pldl1keep, [x0]
   ret void
 }
+
+; PR33134
+define void @test_zero_address() {
+entry:
+; CHECK-LABEL: test_zero_address
+; CHECK: mov {{x[0-9]+}}, xzr
+; CHECK: ldr {{x[0-9]+}}, {{[x[0-9]+]}}
+  tail call i32 asm sideeffect "ldr $0, $1 \0A", "=r,*Q"(i32* null)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
index f3af01a73559..9b5d8a890fa6 100644
--- a/test/CodeGen/AArch64/arm64-platform-reg.ll
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
b/test/CodeGen/AArch64/arm64-platform-reg.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-windows -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18 ; x18 is reserved as a platform register on Darwin but not on other ; systems. Create loads of register pressure and make sure this is respected. diff --git a/test/CodeGen/AArch64/arm64-vext.ll b/test/CodeGen/AArch64/arm64-vext.ll index b315e4c409b0..c1edf1b2e9bf 100644 --- a/test/CodeGen/AArch64/arm64-vext.ll +++ b/test/CodeGen/AArch64/arm64-vext.ll @@ -116,7 +116,7 @@ define void @test_vext_p16() nounwind ssp { define void @test_vext_s32() nounwind ssp { ; CHECK-LABEL: test_vext_s32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xS32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -137,7 +137,7 @@ define void @test_vext_s32() nounwind ssp { define void @test_vext_u32() nounwind ssp { ; CHECK-LABEL: test_vext_u32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xU32x2 = alloca <2 x i32>, align 8 %__a = alloca <2 x i32>, align 8 %__b = alloca <2 x i32>, align 8 @@ -158,7 +158,7 @@ define void @test_vext_u32() nounwind ssp { define void @test_vext_f32() nounwind ssp { ; CHECK-LABEL: test_vext_f32: - ; CHECK: {{ext.8.*#4}} + ; CHECK: {{rev64.2s.*}} %xF32x2 = alloca <2 x float>, align 8 %__a = alloca <2 x float>, align 8 %__b = alloca <2 x float>, align 8 @@ -179,7 +179,7 @@ define void @test_vext_f32() nounwind ssp { define void @test_vext_s64() nounwind ssp { ; CHECK-LABEL: test_vext_s64: - ; CHECK_FIXME: {{ext.8.*#1}} + ; CHECK_FIXME: {{rev64.2s.*}} ; this just turns into a load of the second element %xS64x1 = alloca <1 x i64>, align 8 %__a = alloca <1 x i64>, align 8 diff --git a/test/CodeGen/AArch64/atomic-ops-lse.ll b/test/CodeGen/AArch64/atomic-ops-lse.ll index a85eb6b46aff..a0c418bff573 100644 --- a/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -681,3 +681,164 @@ define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ret i64 %old } +define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i8: + %old = atomicrmw sub i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldaddalb w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i8 %old +} + +define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i16: + %old = atomicrmw sub i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldaddalh w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i16 %old +} + +define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32: + %old = atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], 
[x[[ADDR]]] +; CHECK-NOT: dmb + + ret i32 %old +} + +define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64: + %old = atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret i64 %old +} + +define void @test_atomic_load_sub_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i32_noret: + atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg w[[NEG:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldaddal w[[NEG]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define void @test_atomic_load_sub_i64_noret(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_sub_i64_noret: + atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: neg x[[NEG:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldaddal x[[NEG]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + + ret void +} + +define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i8: + %old = atomicrmw and i8* @var8, i8 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 + +; CHECK: ldclralb w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i8 %old +} + +define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i16: + %old = atomicrmw and i16* @var16, i16 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 + +; CHECK: ldclralh w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i16 %old +} + +define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32: + %old = atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i32 %old +} + +define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i64: + %old = atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret i64 %old +} + +define void @test_atomic_load_and_i32_noret(i32 %offset) nounwind { +; CHECK-LABEL: test_atomic_load_and_i32_noret: + atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn w[[NOT:[0-9]+]], w[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 + +; CHECK: ldclral w[[NOT]], w[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} + +define void @test_atomic_load_and_i64_noret(i64 %offset) nounwind { +; 
CHECK-LABEL: test_atomic_load_and_i64_noret: + atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK-NOT: dmb +; CHECK: mvn x[[NOT:[0-9]+]], x[[OLD:[0-9]+]] +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 + +; CHECK: ldclral x[[NOT]], x[[NEW:[0-9]+]], [x[[ADDR]]] +; CHECK-NOT: dmb + ret void +} diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll index 20ba3fea8377..a2fa1db8a8ac 100644 --- a/test/CodeGen/AArch64/dag-combine-invaraints.ll +++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll @@ -9,7 +9,7 @@ main_: %i32T = alloca i32, align 4 %i32F = alloca i32, align 4 %i32X = alloca i32, align 4 - store i32 0, i32* %tmp + store i32 %argc, i32* %tmp store i32 15, i32* %i32T, align 4 store i32 5, i32* %i32F, align 4 %tmp6 = load i32, i32* %tmp, align 4 diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll index ac2153ad8ffe..5671a1070138 100644 --- a/test/CodeGen/AArch64/extern-weak.ll +++ b/test/CodeGen/AArch64/extern-weak.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s declare extern_weak i32 @var() diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.ll b/test/CodeGen/AArch64/falkor-hwpf-fix.ll new file mode 100644 index 000000000000..9f2af5adce71 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -mtriple aarch64 -mcpu=falkor -disable-post-ra | FileCheck %s + +; Check that strided load tag collisions are avoided on Falkor. 
+ +; CHECK-LABEL: hwpf1: +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE:[0-9]+]], #-16] +; CHECK: mov x[[BASE2:[0-9]+]], x[[BASE]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE2]], #-8] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE3:[0-9]+]]] +; CHECK: mov x[[BASE4:[0-9]+]], x[[BASE3]] +; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE4]], #8] + +define void @hwpf1(i32* %p, i32* %sp, i32* %sp2, i32* %sp3, i32* %sp4) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load1 = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %gep, i32 1 + %load2 = load i32, i32* %gep2 + + %add = add i32 %load1, %load2 + %storegep = getelementptr inbounds i32, i32* %sp, i32 %iv + store i32 %add, i32* %storegep + + %gep3 = getelementptr inbounds i32, i32* %gep, i32 2 + %load3 = load i32, i32* %gep3 + + %gep4 = getelementptr inbounds i32, i32* %gep, i32 3 + %load4 = load i32, i32* %gep4 + + %add2 = add i32 %load3, %load4 + %storegep2 = getelementptr inbounds i32, i32* %sp2, i32 %iv + store i32 %add2, i32* %storegep2 + + %gep5 = getelementptr inbounds i32, i32* %gep, i32 4 + %load5 = load i32, i32* %gep5 + + %gep6 = getelementptr inbounds i32, i32* %gep, i32 5 + %load6 = load i32, i32* %gep6 + + %add3 = add i32 %load5, %load6 + %storegep3 = getelementptr inbounds i32, i32* %sp3, i32 %iv + store i32 %add3, i32* %storegep3 + + %gep7 = getelementptr inbounds i32, i32* %gep, i32 6 + %load7 = load i32, i32* %gep7 + + %gep8 = getelementptr inbounds i32, i32* %gep, i32 7 + %load8 = load i32, i32* %gep8 + + %add4 = add i32 %load7, %load8 + %storegep4 = getelementptr inbounds i32, i32* %sp4, i32 %iv + store i32 %add4, i32* %storegep4 + + %inc = add i32 %iv, 8 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + diff --git a/test/CodeGen/AArch64/falkor-hwpf-fix.mir b/test/CodeGen/AArch64/falkor-hwpf-fix.mir new file mode 100644 index 000000000000..54c8b16a9b43 --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf-fix.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -mcpu=falkor -run-pass falkor-hwpf-fix-late -o - %s | FileCheck %s +--- | + @g = external global i32 + + define void @hwpf1() { ret void } + define void @hwpf2() { ret void } +... +--- +# Verify that the tag collision between the loads is resolved. +# CHECK-LABEL: name: hwpf1 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWui %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf1 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %w2 = LDRWui %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... +--- +# Verify that the tag collision between the loads is resolved and written back for post increment addressing. +# CHECK-LABEL: name: hwpf2 +# CHECK: %[[BASE:[a-z0-9]+]] = ORRXrs %xzr, %x1, 0 +# CHECK: LDRWpost %[[BASE]], 0 +# CHECK: %x1 = ORRXrs %xzr, %[[BASE]], 0 +# CHECK: LDRWui %x1, 1 +name: hwpf2 +tracksRegLiveness: true +body: | + bb.0: + liveins: %w0, %x1 + + %x1, %w2 = LDRWpost %x1, 0 :: ("aarch64-strided-access" load 4 from @g) + %w2 = LDRWui %x1, 1 + + %w0 = SUBWri %w0, 1, 0 + %wzr = SUBSWri %w0, 0, 0, implicit-def %nzcv + Bcc 9, %bb.0, implicit %nzcv + + bb.1: + RET_ReallyLR +... 
diff --git a/test/CodeGen/AArch64/falkor-hwpf.ll b/test/CodeGen/AArch64/falkor-hwpf.ll new file mode 100644 index 000000000000..bbe7febe397f --- /dev/null +++ b/test/CodeGen/AArch64/falkor-hwpf.ll @@ -0,0 +1,106 @@ +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s +; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF + +; Check that strided access metadata is added to loads in inner loops when compiling for Falkor. + +; CHECK-LABEL: @hwpf1( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2, !falkor.strided.access !0 + +; NOHWPF-LABEL: @hwpf1( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf1(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; Check that outer loop strided load isn't marked. +; CHECK-LABEL: @hwpf2( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf2( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf2(i32* %p) { +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ] + br label %loop2.header + +loop2.header: + br label %loop2 + +loop2: + %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] + %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ] + %gep = getelementptr inbounds i32, i32* %p, i32 %iv2 + %load = load i32, i32* %gep + %sum.inc = add i32 %sum, %load + %inc2 = add i32 %iv2, 1 + %exitcnd2 = icmp uge i32 %inc2, 1024 + br i1 %exitcnd2, label %exit2, label %loop2 + +exit2: + %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1 + %load2 = load i32, i32* %gep2 + br label %loop1.latch + +loop1.latch: + %inc1 = add i32 %iv1, 1 + %exitcnd1 = icmp uge i32 %inc1, 1024 + br i1 %exitcnd2, label %exit, label %loop1 + +exit: + ret void +} + + +; Check that non-strided load isn't marked. 
+; CHECK-LABEL: @hwpf3( +; CHECK: load i32, i32* %gep, !falkor.strided.access !0 +; CHECK: load i32, i32* %gep2{{$}} + +; NOHWPF-LABEL: @hwpf3( +; NOHWPF: load i32, i32* %gep{{$}} +; NOHWPF: load i32, i32* %gep2{{$}} +define void @hwpf3(i32* %p, i32* %p2) { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + + %gep = getelementptr inbounds i32, i32* %p, i32 %iv + %load = load i32, i32* %gep + + %gep2 = getelementptr inbounds i32, i32* %p2, i32 %load + %load2 = load i32, i32* %gep2 + + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll index 88e6f5dd01c9..386a6ecccf54 100644 --- a/test/CodeGen/AArch64/preferred-function-alignment.ll +++ b/test/CodeGen/AArch64/preferred-function-alignment.ll @@ -1,7 +1,6 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s -; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s @@ -12,6 +11,7 @@ ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s +; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s ; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll index bc28f477c810..bcad19e391d0 100644 --- a/test/CodeGen/AArch64/swifterror.ll +++ b/test/CodeGen/AArch64/swifterror.ll @@ -309,17 +309,17 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) 
{ ; CHECK-APPLE-LABEL: foo_vararg: ; CHECK-APPLE: orr w0, wzr, #0x10 ; CHECK-APPLE: malloc -; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1 -; CHECK-APPLE: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 -; CHECK-APPLE: strb [[ID]], [x0, #8] +; CHECK-APPLE-DAG: orr [[ID:w[0-9]+]], wzr, #0x1 +; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 +; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg ; CHECK-APPLE-DAG: orr {{x[0-9]+}}, [[ARGS]], #0x8 ; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #8 ; Second vararg -; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] -; CHECK-APPLE: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{x[0-9]+}}], #8 +; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 ; Third vararg ; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}] diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll new file mode 100644 index 000000000000..b760e4acd16a --- /dev/null +++ b/test/CodeGen/AArch64/win64_vararg.ll @@ -0,0 +1,95 @@ +; RUN: llc < %s -mtriple=aarch64-pc-win32 | FileCheck %s + +define void @pass_va(i32 %count, ...) nounwind { +entry: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: str x1, [sp, #24] +; CHECK: stp x30, x8, [sp] +; CHECK: bl other_func +; CHECK: ldr x30, [sp], #80 +; CHECK: ret + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + call void @other_func(i8* %ap2) + ret void +} + +declare void @other_func(i8*) local_unnamed_addr + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_copy(i8*, i8*) nounwind + +; CHECK-LABEL: f9: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #24 +; CHECK: add x0, sp, #24 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f8: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #16 +; CHECK: add x0, sp, #16 +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #16 +; CHECK: ret +define i8* @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: f7: +; CHECK: sub sp, sp, #16 +; CHECK: add x8, sp, #8 +; CHECK: add x0, sp, #8 +; CHECK: stp x8, x7, [sp], #16 +; CHECK: ret +define i8* @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, ...) nounwind { +entry: + %ap = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + call void @llvm.va_start(i8* %ap1) + %ap2 = load i8*, i8** %ap, align 8 + ret i8* %ap2 +} + +; CHECK-LABEL: copy1: +; CHECK: sub sp, sp, #80 +; CHECK: add x8, sp, #24 +; CHECK: stp x6, x7, [sp, #64] +; CHECK: stp x4, x5, [sp, #48] +; CHECK: stp x2, x3, [sp, #32] +; CHECK: stp x8, x1, [sp, #16] +; CHECK: str x8, [sp, #8] +; CHECK: add sp, sp, #80 +; CHECK: ret +define void @copy1(i64 %a0, ...) 
nounwind { +entry: + %ap = alloca i8*, align 8 + %cp = alloca i8*, align 8 + %ap1 = bitcast i8** %ap to i8* + %cp1 = bitcast i8** %cp to i8* + call void @llvm.va_start(i8* %ap1) + call void @llvm.va_copy(i8* %cp1, i8* %ap1) + ret void +} diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll new file mode 100644 index 000000000000..e9797eff712b --- /dev/null +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -0,0 +1,312 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=HSA %s + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 + +; HSA: define void @use_workitem_id_x() #1 { +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_y() #2 { +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workitem_id_z() #3 { +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_x() #4 { +define void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y() #5 { +define void @use_workgroup_id_y() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_z() #6 { +define void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_ptr() #7 { +define void @use_dispatch_ptr() #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_queue_ptr() #8 { +define void @use_queue_ptr() #1 { + %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() + store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_dispatch_id() #9 { +define void @use_dispatch_id() #1 { + %val = call i64 @llvm.amdgcn.dispatch.id() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +; HSA: define void @use_workgroup_id_y_workgroup_id_z() #10 { +define void @use_workgroup_id_y_workgroup_id_z() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_x() #1 { +define void @func_indirect_use_workitem_id_x() #1 { + call void 
@use_workitem_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workitem_id_x() #1 { +define void @kernel_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_y() #2 { +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workitem_id_z() #3 { +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_x() #4 { +define void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @kernel_indirect_use_workgroup_id_x() #4 { +define void @kernel_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_z() #6 { +define void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; HSA: define void @func_indirect_indirect_use_workgroup_id_y() #5 { +define void @func_indirect_indirect_use_workgroup_id_y() #1 { + call void @func_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @indirect_x2_use_workgroup_id_y() #5 { +define void @indirect_x2_use_workgroup_id_y() #1 { + call void @func_indirect_indirect_use_workgroup_id_y() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_ptr() #7 { +define void @func_indirect_use_dispatch_ptr() #1 { + call void @use_dispatch_ptr() + ret void +} + +; HSA: define void @func_indirect_use_queue_ptr() #8 { +define void @func_indirect_use_queue_ptr() #1 { + call void @use_queue_ptr() + ret void +} + +; HSA: define void @func_indirect_use_dispatch_id() #9 { +define void @func_indirect_use_dispatch_id() #1 { + call void @use_dispatch_id() + ret void +} + +; HSA: define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #11 { +define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { + call void @func_indirect_use_workgroup_id_y_workgroup_id_z() + ret void +} + +; HSA: define void @recursive_use_workitem_id_y() #2 { +define void @recursive_use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @call_recursive_use_workitem_id_y() #2 { +define void @call_recursive_use_workitem_id_y() #1 { + call void @recursive_use_workitem_id_y() + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 { +define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { +define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %stof + ret void +} + +; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 { +define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* + store 
volatile i32 0, i32 addrspace(4)* %stof + call void @func_indirect_use_queue_ptr() + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast() #8 { +define void @indirect_use_group_to_flat_addrspacecast() #1 { + call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_gfx9() #11 { +define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #8 { +define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { + call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* null) + ret void +} + +; HSA: define void @use_kernarg_segment_ptr() #14 { +define void @use_kernarg_segment_ptr() #1 { + %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 { +define void @func_indirect_use_kernarg_segment_ptr() #1 { + call void @use_kernarg_segment_ptr() + ret void +} + +; HSA: define void @use_implicitarg_ptr() #14 { +define void @use_implicitarg_ptr() #1 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @func_indirect_use_implicitarg_ptr() #14 { +define void @func_indirect_use_implicitarg_ptr() #1 { + call void @use_implicitarg_ptr() + ret void +} + +; HSA: declare void @external.func() #15 +declare void @external.func() #3 + +; HSA: define internal void @defined.func() #15 { +define internal void @defined.func() #3 { + ret void +} + +; HSA: define void @func_call_external() #15 { +define void @func_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define void @func_call_defined() #15 { +define void @func_call_defined() #3 { + call void @defined.func() + ret void +} + +; HSA: define void @func_call_asm() #15 { +define void @func_call_asm() #3 { + call void asm sideeffect "", ""() #3 + ret void +} + +; HSA: define amdgpu_kernel void @kern_call_external() #16 { +define amdgpu_kernel void @kern_call_external() #3 { + call void @external.func() + ret void +} + +; HSA: define amdgpu_kernel void @func_kern_defined() #16 { +define amdgpu_kernel void @func_kern_defined() #3 { + call void @defined.func() + ret void +} + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind "target-cpu"="fiji" } +attributes #2 = { nounwind "target-cpu"="gfx900" } +attributes #3 = { nounwind } + +; HSA: attributes #0 = { nounwind readnone speculatable } +; HSA: attributes #1 = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" } +; HSA: attributes #2 = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" } +; HSA: attributes #3 = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" } +; HSA: attributes #4 = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" } +; HSA: attributes #5 = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" } +; HSA: attributes #6 = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #7 = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" } +; HSA: attributes #8 = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" } +; HSA: attributes #9 = { nounwind 
"amdgpu-dispatch-id" "target-cpu"="fiji" } +; HSA: attributes #10 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } +; HSA: attributes #11 = { nounwind "target-cpu"="fiji" } +; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } +; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } +; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } +; HSA: attributes #15 = { nounwind } +; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" } diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index f7461b925ca1..3059a95a5098 100644 --- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -10,6 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { @@ -164,6 +165,15 @@ define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { ret void } +; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 { +define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 { + %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* + %val = load i32, i32 addrspace(2)* %bc + store i32 %val, i32 addrspace(1)* %ptr + ret void +} + ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* @@ -236,3 +246,4 @@ attributes #1 = { nounwind } ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" } ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" } +; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" } diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index 63a6f6a8d32c..a0694fb1e3c9 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -36,7 +36,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_2048 ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 7 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_2048() #3 { diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 3dda73bc336e..a5e97205de21 100644 --- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -118,7 +118,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; CHECK-LABEL: {{^}}exactly_10: ; CHECK: SGPRBlocks: 1 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 13 +; CHECK: NumSGPRsForWavesPerEU: 12 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float 
addrspace(1)* @var @@ -188,3 +188,15 @@ define amdgpu_kernel void @exactly_10() #9 { ret void } attributes #9 = {"amdgpu-waves-per-eu"="10,10"} + +; Exactly 256 workitems and exactly 2 waves. +; CHECK-LABEL: {{^}}empty_workitems_exactly_256_waves_exactly_2: +; CHECK: SGPRBlocks: 12 +; CHECK: VGPRBlocks: 21 +; CHECK: NumSGPRsForWavesPerEU: 102 +; CHECK: NumVGPRsForWavesPerEU: 85 +define amdgpu_kernel void @empty_workitems_exactly_256_waves_exactly_2() #10 { +entry: + ret void +} +attributes #10 = {"amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2,2"} diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 5383bbe71ae3..5ffa45595e70 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace } ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] -; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: -; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -465,6 +471,49 @@ entry: ret float %canonicalized } +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 +; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]], +; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-DENORM-NOT: 1.0 +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float 
@llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 +; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %v = load double, double addrspace(1)* %gep, align 8 + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 +; GCN: flat_load_ushort [[V:v[0-9]+]], +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %v = load half, half addrspace(1)* %gep, align 2 + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + declare float @llvm.canonicalize.f32(float) #0 declare double @llvm.canonicalize.f64(double) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0 declare double @llvm.maxnum.f64(double, double) #0 attributes #0 = { nounwind readnone } +attributes #1 = { "no-nans-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll index 9b1368493ba5..6b22cb0b7e28 100644 --- a/test/CodeGen/AMDGPU/function-args.ll +++ b/test/CodeGen/AMDGPU/function-args.ll @@ -34,6 +34,22 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 { ret void } +; GCN-LABEL: {{^}}i1_arg_i1_use: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1 +define void @i1_arg_i1_use(i1 %arg) #0 { +bb: + br i1 %arg, label %bb2, label %bb1 + +bb1: + store volatile i32 0, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + ; GCN-LABEL: {{^}}void_func_i8: ; GCN-NOT: v0 ; GCN: buffer_store_byte v0, off diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll index 972fbd66ef37..0b19fbe7d70c 100644 --- a/test/CodeGen/AMDGPU/hsa.ll +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -40,7 +40,7 @@ ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" -; HSA: .amdgpu_hsa_kernel simple +; HSA-LABEL: .amdgpu_hsa_kernel simple ; HSA: {{^}}simple: ; HSA: .amd_kernel_code_t ; HSA: enable_sgpr_private_segment_buffer = 1 @@ -65,3 +65,11 @@ entry: store i32 0, i32 addrspace(1)* %out ret void } + +; HSA-LABEL: .amdgpu_hsa_kernel simple_no_kernargs +; HSA: enable_sgpr_kernarg_segment_ptr = 0 +define amdgpu_kernel void @simple_no_kernargs() { +entry: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 9a27809f37bb..70e6b408ca29 
100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -49,6 +49,18 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x ret void } +; ALL-LABEL: {{^}}test_no_kernargs: +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: s_load_dword s{{[0-9]+}}, s[4:5] +define amdgpu_kernel void @test_no_kernargs() #1 { + %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* + %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(2)* %gep + store volatile i32 %value, i32 addrspace(1)* undef + ret void +} + declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index f0af876567b4..1c3cba8d3e4f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index ee58d359a935..a466671d8c55 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: ; CHECK: image_store diff --git a/test/CodeGen/AMDGPU/move-to-valu-worklist.ll b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll new file mode 100644 index 000000000000..539eed92d540 --- /dev/null +++ b/test/CodeGen/AMDGPU/move-to-valu-worklist.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s + +; In moveToVALU(), a move to the vector ALU is performed and all +; instructions in the use chain are visited. We do not want the same +; node to be pushed to the visit worklist more than once.
+ +; GCN-LABEL: {{^}}in_worklist_once: +; GCN: buffer_load_dword +; GCN: BB0_1: +; GCN: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @in_worklist_once() #0 { +bb: + %tmp = load i64, i64* undef +br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = phi i64 [ undef, %bb ], [ %tmp16, %bb1 ] + %tmp3 = phi i64 [ %tmp, %bb ], [ undef, %bb1 ] + %tmp11 = shl i64 %tmp2, 14 + %tmp13 = xor i64 %tmp11, %tmp2 + %tmp15 = and i64 %tmp3, %tmp13 + %tmp16 = xor i64 %tmp15, %tmp3 +br label %bb1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 3a0605fa182a..742c4f8af85d 100644 --- a/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,42 +5,42 @@ ; Test addressing modes when the scratch base is not a frame index. ; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, i8* inttoptr (i32 8 to i8*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, i16* inttoptr (i32 8 to i16*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, i8* inttoptr (i32 8 to i8*) ret void @@ -65,7 +65,7 @@ define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, i16* inttoptr (i32 8 to i16*) ret void @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) # } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: 
buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, i32* inttoptr (i32 8 to i32*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, i8* inttoptr (i32 4095 to i8*) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, i8* inttoptr (i32 4096 to i8*) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, i8* inttoptr (i32 4097 to i8*) ret void diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index 190d2b72ebaf..87f37144244e 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -8,7 +8,7 @@ ; CHECK-NEXT: OR_INT ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. define amdgpu_kernel void @_Z9chk1D_512v() #0 { diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll index 91116b0f65ea..e199d5b5df25 100644 --- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -5,7 +5,7 @@ ; then merge if-regions with the same bodies. ; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transfomation, however now that we are using local memory for +; to do its transformation, however now that we are using local memory for ; allocas, the transformation isn't happening. 
; XFAIL: * ; diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll index dcb089010e99..cf0c7944d4cd 100644 --- a/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -10,14 +10,14 @@ ; GCN-LABEL: {{^}}store_to_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; -should be used directly without any copies. ; OPTNONE-NOT: s_mov_b32 -; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void @@ -26,7 +26,7 @@ define amdgpu_kernel void @store_to_undef() #0 { ; GCN-LABEL: {{^}}store_to_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 124 to i32*) @@ -36,7 +36,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 { ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef @@ -46,7 +46,7 @@ define amdgpu_kernel void @load_from_undef() #0 { ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} +; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 124 to i32*) diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 770bfaddb23e..a52b80ba86e5 100644 --- a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -34,7 +34,7 @@ body: | bb.0: successors: %bb.2, %bb.1 - %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, 0, implicit %exec + %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, implicit %exec %vcc = COPY killed %7 S_CBRANCH_VCCZ %bb.2, implicit killed %vcc diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll index 6ed730ad60f4..5e0178072e5e 100644 --- a/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,5 +1,5 @@ ; RUN: llc 
-march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=gfx804 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s ; This used to fail due to a v_add_i32 instruction with an illegal immediate @@ -8,15 +8,16 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -25,9 +26,10 @@ define amdgpu_ps float @ps_main(i32 %idx) { } ; GCN-LABEL: {{^}}vs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-NOT: s_mov_b32 s0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -36,9 +38,9 @@ define amdgpu_vs float @vs_main(i32 %idx) { } ; GCN-LABEL: {{^}}cs_main: -; GCN: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -47,10 +49,15 @@ define amdgpu_cs float @cs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NOT: s_mov_b32 s0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, 
SCRATCH_RSRC_DWORD0 +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -59,10 +66,13 @@ define amdgpu_hs float @hs_main(i32 %idx) { } ; GCN-LABEL: {{^}}gs_main: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; SI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen + +; GFX9: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -71,10 +81,16 @@ define amdgpu_gs float @gs_main(i32 %idx) { } ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI-NOT: s_mov_b32 s6 +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9-NOT: s_mov_b32 s5 +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx @@ -86,10 +102,14 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; SI: s_mov_b32 [[SWO:s[0-9]+]], s6 -; GFX9: s_mov_b32 [[SWO:s[0-9]+]], s5 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen +; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 + +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen + +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen + ; GCN: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx diff --git a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index 4f5c582f8b58..ff1b2ad73ef0 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -332,7 
+332,7 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec -# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %exec, implicit %exec +# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, implicit-def %exec, implicit %exec # VI: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %3, 0, 6, 4, implicit-def %vcc, implicit %exec # VI: %{{[0-9]+}} = V_CMPX_EQ_I32_e64 23, killed %{{[0-9]+}}, implicit-def %exec, implicit %exec @@ -345,20 +345,21 @@ body: | # VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# VI: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 0, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec # GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec -# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, implicit-def %exec, implicit %exec + name: vopc_instructions @@ -415,28 +416,28 @@ body: | V_CMPX_EQ_I32_e32 123, killed %13, implicit-def %vcc, implicit-def %exec, implicit %exec %14 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, implicit %exec %15 = V_AND_B32_e64 %5, %3, implicit %exec - %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, 0, implicit-def %exec, implicit %exec + %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, implicit-def %exec, implicit %exec %16 = V_AND_B32_e64 %5, %3, implicit %exec %vcc = V_CMP_LT_I32_e64 %6, killed %16, implicit %exec %17 = 
V_AND_B32_e64 %5, %3, implicit %exec %19 = V_CMPX_EQ_I32_e64 23, killed %17, implicit-def %exec, implicit %exec %20 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, implicit %exec %21 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, implicit-def %exec, implicit %exec %23 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, 2, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, implicit %exec %24 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, implicit-def %exec, implicit %exec %25 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, implicit-def %exec, implicit %exec %26 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, 0, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, implicit-def %exec, implicit %exec %27 = V_AND_B32_e64 %5, %3, implicit %exec - %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, 2, implicit-def %exec, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, implicit-def %exec, implicit %exec %100 = V_MOV_B32_e32 %vcc_lo, implicit %exec diff --git a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir index 913b54332119..bd222adf6a68 100644 --- a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir +++ b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -8,7 +8,7 @@ # GCN: %{{[0-9]+}} = V_BCNT_U32_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_BFM_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec -# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec # GCN: %{{[0-9]+}} = V_READLANE_B32 killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec --- @@ -50,7 +50,7 @@ body: | %15 = V_BFM_B32_e64 %13, killed %14, implicit-def %vcc, implicit %exec %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec - %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, 0, implicit-def %vcc, implicit %exec + %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, implicit-def %vcc, implicit %exec %18 = V_LSHRREV_B32_e64 16, %17, implicit %exec %19 = V_READLANE_B32 killed %18, 0, implicit-def %vcc, implicit %exec diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll index 51771c9723e0..04ff4c87ea77 100644 --- a/test/CodeGen/AMDGPU/trap.ll +++ b/test/CodeGen/AMDGPU/trap.ll @@ -19,11 +19,11 @@ declare void @llvm.debugtrap() #0 ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_trap: ; HSA-TRAP: enable_trap_handler = 1 @@ -45,11 +45,11 @@ define amdgpu_kernel void @hsa_trap() { ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 
208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (): debugtrap handler not supported ; GCN-LABEL: {{^}}hsa_debugtrap: diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 6eb937e71b1b..54991d3d953c 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -81,7 +81,7 @@ body: | %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) %sgpr7 = S_MOV_B32 61440 %sgpr6 = S_MOV_B32 -1 - %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, implicit %exec S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc bb.2.if: diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index 135f02ac205a..feae5e9f3792 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -19,8 +19,9 @@ ; HSA: workitem_private_segment_byte_size = 1536 ; GCN-NOT: flat_scr +; MESA-NOT: s_mov_b32 s3 +; HSA-NOT: s_mov_b32 s7 -; GCNMESA-DAG: s_mov_b32 s16, s3 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-DAG: s_mov_b32 s14, -1 @@ -29,17 +30,32 @@ ; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 -; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCNMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_store_dword {{v[0-9]}}, off, s[12:15], s16 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} + +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} + + + +; HSA: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_store_dword {{v[0-9]}}, off, s[0:3], s7 offset:{{[0-9]+}} + +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} +; HSA: buffer_load_dword {{v[0-9]+}}, off, s[0:3], s7 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 
offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index ca2366a361fb..afbd06a00fae 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -13,7 +13,7 @@ ; GCN-LABEL: {{^}}main: -; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12 +; GCN-NOT: s_mov_b32 s12 ; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1 @@ -22,8 +22,8 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 6a1da0dfe85f..0e3ef479bc3c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -45,6 +45,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_br() { ret void } + define void @test_soft_fp_double() #0 { ret void } attributes #0 = { "target-features"="+vfp2,-neonfp" } @@ -1173,6 +1175,43 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + ; CHECK: bb.0 + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + ; CHECK: [[COND:%[0-9]+]] = COPY %r0 + + G_BRCOND %0(s1), %bb.1 + ; CHECK: TSTri [[COND]], 1, 14, _, implicit-def %cpsr + ; CHECK: Bcc %bb.1, 0, %cpsr + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.1: + ; CHECK: bb.1 + successors: %bb.2(0x80000000) + + G_BR %bb.2 + ; CHECK: B %bb.2 + + bb.2: + ; CHECK: bb.2 + + BX_RET 14, _ + ; CHECK: BX_RET 14, _ +... 
+--- name: test_soft_fp_double # CHECK-LABEL: name: test_soft_fp_double legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll index c778caacd0f4..c2e8c5abca4e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-divmod.ll @@ -87,3 +87,55 @@ define arm_aapcscc i32 @test_urem_i32(i32 %x, i32 %y) { %r = urem i32 %x, %y ret i32 %r } + +define arm_aapcscc i16 @test_srem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_srem_i16: +; CHECK-DAG: sxth r0, r0 +; CHECK-DAG: sxth r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i16 @test_urem_i16(i16 %x, i16 %y) { +; CHECK-LABEL: test_urem_i16: +; CHECK-DAG: uxth r0, r0 +; CHECK-DAG: uxth r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i16 %x, %y + ret i16 %r +} + +define arm_aapcscc i8 @test_srem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_i8: +; CHECK-DAG: sxtb r0, r0 +; CHECK-DAG: sxtb r1, r1 +; HWDIV: sdiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_idivmod +; SOFT-DEFAULT: blx __modsi3 + %r = srem i8 %x, %y + ret i8 %r +} + +define arm_aapcscc i8 @test_urem_i8(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_i8: +; CHECK-DAG: uxtb r0, r0 +; CHECK-DAG: uxtb r1, r1 +; HWDIV: udiv [[Q:r[0-9]+]], r0, r1 +; HWDIV: mul [[P:r[0-9]+]], [[Q]], r1 +; HWDIV: sub r0, r0, [[P]] +; SOFT-AEABI: blx __aeabi_uidivmod +; SOFT-DEFAULT: blx __umodsi3 + %r = urem i8 %x, %y + ret i8 %r +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll index 4c498ff6ca9b..419bcf71c106 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll @@ -420,3 +420,42 @@ entry: %r = select i1 %cond, i32* %a, i32* %b ret i32* %r } + +define arm_aapcscc void @test_br() { +; CHECK-LABEL: test_br +; CHECK: [[LABEL:.L[[:alnum:]_]+]]: +; CHECK: b [[LABEL]] +entry: + br label %infinite + +infinite: + br label %infinite +} + +declare arm_aapcscc void @brcond1() +declare arm_aapcscc void @brcond2() + +define arm_aapcscc void @test_brcond(i32 %n) { +; CHECK-LABEL: test_brcond +; CHECK: cmp r0 +; CHECK-NEXT: movgt [[RCMP:r[0-9]+]], #1 +; CHECK: tst [[RCMP]], #1 +; CHECK-NEXT: bne [[FALSE:.L[[:alnum:]_]+]] +; CHECK: blx brcond1 +; CHECK: [[FALSE]]: +; CHECK: blx brcond2 +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.true, label %if.false + +if.true: + call arm_aapcscc void @brcond1() + br label %if.end + +if.false: + call arm_aapcscc void @brcond2() + br label %if.end + +if.end: + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir index 9a0877846fc3..f436c3774c86 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -14,6 +14,12 @@ define void @test_srem_i32() { ret void } define void @test_urem_i32() { ret void } + + define void @test_srem_i16() { ret void } + define void @test_urem_i16() { ret void } + + define void @test_srem_i8() { ret void } + define void @test_urem_i8() { ret void } ... 
--- name: test_sdiv_i32 @@ -323,3 +329,171 @@ body: | %r0 = COPY %2(s32) BX_RET 14, _, implicit %r0 ... +--- +name: test_srem_i16 +# CHECK-LABEL: name: test_srem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s16) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... +--- +name: test_urem_i16 +# CHECK-LABEL: name: test_urem_i16 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s16) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s16) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s16) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s16) + %0(s16) = COPY %r0 + %1(s16) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s16) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s16) + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_srem_i8 +# CHECK-LABEL: name: test_srem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_SEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_SEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_SREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_idivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SREM + %2(s8) = G_SREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... +--- +name: test_urem_i8 +# CHECK-LABEL: name: test_urem_i8 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + ; CHECK-DAG: [[X:%[0-9]+]](s8) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s8) = COPY %r1 + ; CHECK-DAG: [[X32:%[0-9]+]](s32) = G_ZEXT [[X]](s8) + ; CHECK-DAG: [[Y32:%[0-9]+]](s32) = G_ZEXT [[Y]](s8) + %0(s8) = COPY %r0 + %1(s8) = COPY %r1 + ; HWDIV: [[Q32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; HWDIV: [[P32:%[0-9]+]](s32) = G_MUL [[Q32]], [[Y32]] + ; HWDIV: [[R32:%[0-9]+]](s32) = G_SUB [[X32]], [[P32]] + ; SOFT-NOT: G_UREM + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X32]] + ; SOFT-DAG: %r1 = COPY [[Y32]] + ; SOFT-AEABI: BLX $__aeabi_uidivmod, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-AEABI: [[R32:%[0-9]+]](s32) = COPY %r1 + ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM + ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UREM + %2(s8) = G_UREM %0, %1 + ; CHECK: %r0 = COPY [[R]] + %r0 = COPY %2(s8) + BX_RET 14, _, implicit %r0 +... diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 4575341dfc29..616f29d3b068 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -42,6 +42,8 @@ define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } + define void @test_brcond() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -863,6 +865,40 @@ body: | BX_RET 14, _, implicit %r0 ... 
--- +name: test_brcond +# CHECK-LABEL: name: test_brcond +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + G_BRCOND %2(s1), %bb.1 + ; G_BRCOND with s1 is legal, so we should find it unchanged in the output + ; CHECK: G_BRCOND {{%[0-9]+}}(s1), %bb.1 + G_BR %bb.2 + + bb.1: + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 + + bb.2: + %r0 = COPY %0(s32) + BX_RET 14, _, implicit %r0 + +... +--- name: test_fadd_s32 # CHECK-LABEL: name: test_fadd_s32 legalized: false diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index ffca431d96ea..638c6e620926 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -40,6 +40,8 @@ define void @test_select_s32() { ret void } + define void @test_br() { ret void } + define void @test_fadd_s32() #0 { ret void } define void @test_fadd_s64() #0 { ret void } @@ -828,6 +830,34 @@ body: | %r0 = COPY %3(s32) BX_RET 14, _, implicit %r0 +... +--- +name: test_br +# CHECK-LABEL: name: test_br +legalized: true +regBankSelected: false +# CHECK: regBankSelected: true +selected: false +registers: + - { id: 0, class: _ } +# CHECK: { id: 0, class: gprb, preferred-register: '' } +# Check that we map the condition of the G_BRCOND into the GPR. +# For the G_BR, there are no registers to map, but make sure we don't crash. +body: | + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: %r0 + + %0(s1) = COPY %r0 + G_BRCOND %0(s1), %bb.1 + G_BR %bb.2 + + bb.1: + BX_RET 14, _ + + bb.2: + BX_RET 14, _ + ... 
--- name: test_fadd_s32 diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index 23c4ccea4604..644a7fbf8d9a 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -26,6 +26,7 @@ entry: store i32 3855, i32* %xort store i32 4, i32* %temp %tmp = load i32, i32* %temp + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -35,6 +36,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %0 = atomicrmw add i32* %val1, i32 %tmp monotonic store i32 %0, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -44,6 +46,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %1 = atomicrmw sub i32* %val2, i32 30 monotonic store i32 %1, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: add ; CHECK: strex @@ -53,6 +56,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %2 = atomicrmw add i32* %val2, i32 1 monotonic store i32 %2, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: sub ; CHECK: strex @@ -62,6 +66,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %3 = atomicrmw sub i32* %val2, i32 1 monotonic store i32 %3, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: and ; CHECK: strex @@ -71,6 +76,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %4 = atomicrmw and i32* %andt, i32 4080 monotonic store i32 %4, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: or ; CHECK: strex @@ -80,6 +86,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %5 = atomicrmw or i32* %ort, i32 4080 monotonic store i32 %5, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: eor ; CHECK: strex @@ -89,6 +96,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %6 = atomicrmw xor i32* %xort, i32 4080 monotonic store i32 %6, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -98,6 +106,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %7 = atomicrmw min i32* %val2, i32 16 monotonic store i32 %7, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %neg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -108,6 +117,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %8 = atomicrmw min i32* %val2, i32 %neg monotonic store i32 %8, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -117,6 +127,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %9 = atomicrmw max i32* %val2, i32 1 monotonic store i32 %9, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -126,6 +137,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %10 = atomicrmw max i32* %val2, i32 0 monotonic store i32 %10, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -135,6 +147,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %11 = atomicrmw umin i32* %val2, i32 16 monotonic store i32 %11, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() %uneg = sub i32 0, 1 ; CHECK: ldrex ; CHECK: cmp @@ -145,6 +158,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic store i32 %12, i32* %old + call void asm sideeffect "", 
"~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -154,6 +168,7 @@ entry: ; CHECK-BAREMETAL-NOT: __sync %13 = atomicrmw umax i32* %val2, i32 1 monotonic store i32 %13, i32* %old + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll index d6f07f653576..e415b059692e 100644 --- a/test/CodeGen/AVR/branch-relaxation.ll +++ b/test/CodeGen/AVR/branch-relaxation.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=avr | FileCheck %s -; CHECKC-LABEL: relax_breq +; CHECK-LABEL: relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: brne LBB0_1 ; CHECK: rjmp LBB0_2 @@ -66,7 +66,7 @@ finished: ret i8 3 } -; CHECKC-LABEL: no_relax_breq +; CHECK-LABEL: no_relax_breq ; CHECK: cpi r{{[0-9]+}}, 0 ; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]] ; CHECK: nop diff --git a/test/CodeGen/BPF/select_ri.ll b/test/CodeGen/BPF/select_ri.ll new file mode 100644 index 000000000000..c4ac376502b8 --- /dev/null +++ b/test/CodeGen/BPF/select_ri.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s +; +; Source file: +; int b, c; +; int test() { +; int a = b; +; if (a) +; a = c; +; return a; +; } +@b = common local_unnamed_addr global i32 0, align 4 +@c = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readonly +define i32 @test() local_unnamed_addr #0 { +entry: + %0 = load i32, i32* @b, align 4 + %tobool = icmp eq i32 %0, 0 + %1 = load i32, i32* @c, align 4 + %. = select i1 %tobool, i32 0, i32 %1 +; CHECK: r1 = ll +; CHECK: r1 = *(u32 *)(r1 + 0) +; CHECK: if r1 == 0 goto + ret i32 %. +} + +attributes #0 = { norecurse nounwind readonly } diff --git a/test/CodeGen/BPF/setcc.ll b/test/CodeGen/BPF/setcc.ll index 294c49365670..7e20814da807 100644 --- a/test/CodeGen/BPF/setcc.ll +++ b/test/CodeGen/BPF/setcc.ll @@ -7,7 +7,7 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccweqand: -; CHECK: if r1 == r2 +; CHECK: if r1 == 0 define i16 @sccwneand(i16 %a, i16 %b) nounwind { %t1 = and i16 %a, %b @@ -16,7 +16,7 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind { ret i16 %t3 } ; CHECK-LABEL: sccwneand: -; CHECK: if r1 != r2 +; CHECK: if r1 != 0 define i16 @sccwne(i16 %a, i16 %b) nounwind { %t1 = icmp ne i16 %a, %b diff --git a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll index 9e4664ad69c9..48c5f8f4d247 100644 --- a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll +++ b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s ; Bug: PR31341 -; XFAIL: avr ;; Date: Jul 29, 2003. ;; From: test/Programs/MultiSource/Ptrdist-bc diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll index a9a33d72bca2..afa2e8a72ed1 100644 --- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll +++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - ; This caused ScheduleDAG to crash in EmitPhysRegCopy when searching ; the uses of a copy to a physical register without ignoring non-data ; dependence, PR10220. 
diff --git a/test/CodeGen/Generic/print-mul-exp.ll b/test/CodeGen/Generic/print-mul-exp.ll index 91c8147aaad9..1426fb59f669 100644 --- a/test/CodeGen/Generic/print-mul-exp.ll +++ b/test/CodeGen/Generic/print-mul-exp.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @a_mul_str = internal constant [13 x i8] c"a * %d = %d\0A\00" ; <[13 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-mul.ll b/test/CodeGen/Generic/print-mul.ll index 4b60d759278a..20fb1be6edef 100644 --- a/test/CodeGen/Generic/print-mul.ll +++ b/test/CodeGen/Generic/print-mul.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/print-shift.ll b/test/CodeGen/Generic/print-shift.ll index 56b3ec1df760..1fda55420b59 100644 --- a/test/CodeGen/Generic/print-shift.ll +++ b/test/CodeGen/Generic/print-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s +; XFAIL: avr @a_str = internal constant [8 x i8] c"a = %d\0A\00" ; <[8 x i8]*> [#uses=1] @b_str = internal constant [8 x i8] c"b = %d\0A\00" ; <[8 x i8]*> [#uses=1] diff --git a/test/CodeGen/Generic/v-split.ll b/test/CodeGen/Generic/v-split.ll index 91aece94fecd..f9a1cee440ca 100644 --- a/test/CodeGen/Generic/v-split.ll +++ b/test/CodeGen/Generic/v-split.ll @@ -1,8 +1,5 @@ ; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %f8 = type <8 x float> define void @test_f8(%f8 *%P, %f8* %Q, %f8 *%S) { diff --git a/test/CodeGen/Generic/vector-redux.ll b/test/CodeGen/Generic/vector-redux.ll index 64562d6d9490..8efdbf85b8c0 100644 --- a/test/CodeGen/Generic/vector-redux.ll +++ b/test/CodeGen/Generic/vector-redux.ll @@ -1,9 +1,6 @@ ; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; Bug: PR31898 -; XFAIL: avr - @a = global [1024 x i32] zeroinitializer, align 16 define i32 @reduce_add() { diff --git a/test/CodeGen/Generic/vector.ll b/test/CodeGen/Generic/vector.ll index 9c0cacdcd878..2d4dc501a53a 100644 --- a/test/CodeGen/Generic/vector.ll +++ b/test/CodeGen/Generic/vector.ll @@ -1,9 +1,6 @@ ; Test that vectors are scalarized/lowered correctly. 
; RUN: llc < %s -; Bug: PR31898 -; XFAIL: avr - %d8 = type <8 x double> %f1 = type <1 x float> %f2 = type <2 x float> diff --git a/test/CodeGen/Hexagon/intrinsics/system_user.ll b/test/CodeGen/Hexagon/intrinsics/system_user.ll index ac4c53e221d0..23473c92da91 100644 --- a/test/CodeGen/Hexagon/intrinsics/system_user.ll +++ b/test/CodeGen/Hexagon/intrinsics/system_user.ll @@ -1,13 +1,71 @@ -; RUN: llc -march=hexagon -O0 < %s | FileCheck %s -; RUN: llc -march=hexagon -O0 < %s | FileCheck -check-prefix=CHECK-CALL %s -; Hexagon Programmer's Reference Manual 11.9.1 SYSTEM/USER +; RUN: llc -march=hexagon < %s | FileCheck %s -; CHECK-CALL-NOT: call +target triple = "hexagon" -; Data cache prefetch -declare void @llvm.hexagon.prefetch(i8*) -define void @prefetch(i8* %a) { - call void @llvm.hexagon.prefetch(i8* %a) +; CHECK-LABEL: dc00: +; CHECK: dcfetch +define void @dc00(i8* nocapture readonly %p) local_unnamed_addr #0 { + tail call void @llvm.hexagon.prefetch(i8* %p) ret void } -; CHECK: dcfetch({{.*}}+#0) + +; CHECK-LABEL: dc01: +; CHECK: dccleana +define void @dc01(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleana(i8* %p) + ret void +} + +; CHECK-LABEL: dc02: +; CHECK: dccleaninva +define void @dc02(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dccleaninva(i8* %p) + ret void +} + +; CHECK-LABEL: dc03: +; CHECK: dcinva +define void @dc03(i8* nocapture readonly %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dcinva(i8* %p) + ret void +} + +; CHECK-LABEL: dc04: +; CHECK: dczeroa +define void @dc04(i8* nocapture %p) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y2.dczeroa(i8* %p) + ret void +} + +; CHECK-LABEL: dc05: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}) +define void @dc05(i8* nocapture readonly %p, i32 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y4.l2fetch(i8* %p, i32 %q) + ret void +} + +; CHECK-LABEL: dc06: +; CHECK: l2fetch(r{{[0-9]+}},r{{[0-9]+}}:{{[0-9]+}}) +define void @dc06(i8* nocapture readonly %p, i64 %q) local_unnamed_addr #0 { +entry: + tail call void @llvm.hexagon.Y5.l2fetch(i8* %p, i64 %q) + ret void +} + +declare void @llvm.hexagon.prefetch(i8* nocapture) #1 +declare void @llvm.hexagon.Y2.dccleana(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dccleaninva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dcinva(i8* nocapture readonly) #2 +declare void @llvm.hexagon.Y2.dczeroa(i8* nocapture) #3 +declare void @llvm.hexagon.Y4.l2fetch(i8* nocapture readonly, i32) #2 +declare void @llvm.hexagon.Y5.l2fetch(i8* nocapture readonly, i64) #2 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" } +attributes #1 = { inaccessiblemem_or_argmemonly nounwind } +attributes #2 = { nounwind } +attributes #3 = { argmemonly nounwind writeonly } diff --git a/test/CodeGen/Hexagon/switch-lut-explicit-section.ll b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll new file mode 100644 index 000000000000..6c67a0dab1a8 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-explicit-section.ll @@ -0,0 +1,32 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=FUNCTEXT %s +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of the lookup table in the explicit section from the attribute set. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}tcm.hexagon, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT-NOT: .text +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 section "tcm.hexagon" { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-function-section.ll b/test/CodeGen/Hexagon/switch-lut-function-section.ll new file mode 100644 index 000000000000..bb2b1e798c8a --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-function-section.ll @@ -0,0 +1,30 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true -function-sections < %s | FileCheck --check-prefix=FUNCTEXT %s + +;This test checks the placement of the lookup table in the function's text section. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;FUNCTEXT: .text +;FUNCTEXT: .section{{.*}}text.foo, +;FUNCTEXT-NOT: .section{{.*}}.rodata +;FUNCTEXT: .Lswitch.table: +;FUNCTEXT-NEXT: .word + +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] #0 + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll new file mode 100644 index 000000000000..57fdfbf33abc --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-multiple-functions.ll @@ -0,0 +1,42 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;If the lookup table is used by more than one function, we should ignore the +;flag and place it in .rodata. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +define i32 @goo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/switch-lut-text-section.ll b/test/CodeGen/Hexagon/switch-lut-text-section.ll new file mode 100644 index 000000000000..b4d3e898d103 --- /dev/null +++ b/test/CodeGen/Hexagon/switch-lut-text-section.ll @@ -0,0 +1,27 @@ +;RUN: llc -O2 -hexagon-emit-lut-text=true < %s | FileCheck --check-prefix=TEXT %s +;This test checks the placement of the lookup table in the text section. 
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +;TEXT: .text +;TEXT-NOT: .section{{.*}}.rodata +;TEXT: .Lswitch.table: +;TEXT-NEXT: .word +@switch.table = private unnamed_addr constant [9 x i32] [i32 9, i32 20, i32 14, i32 22, i32 12, i32 5, i32 98, i32 8, i32 11] + +; Function Attrs: norecurse nounwind readnone +define i32 @foo(i32 %x) local_unnamed_addr #0 { +entry: + %0 = icmp ult i32 %x, 9 + br i1 %0, label %switch.lookup, label %return + +switch.lookup: ; preds = %entry + %switch.gep = getelementptr inbounds [9 x i32], [9 x i32]* @switch.table, i32 0, i32 %x + %switch.load = load i32, i32* %switch.gep, align 4 + ret i32 %switch.load + +return: ; preds = %entry + ret i32 19 +} + +attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/Hexagon/v6vec-vprint.ll b/test/CodeGen/Hexagon/v6vec-vprint.ll index 224547c24b75..24daeac3fb5d 100644 --- a/test/CodeGen/Hexagon/v6vec-vprint.ll +++ b/test/CodeGen/Hexagon/v6vec-vprint.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print < %s | FileCheck %s ; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx -disable-hexagon-shuffle=0 -O2 -enable-hexagon-vector-print -trace-hex-vector-stores-only < %s | FileCheck --check-prefix=VSTPRINT %s ; generate .long XXXX which is a vector debug print instruction. 
; CHECK: .long 0x1dffe0 diff --git a/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll new file mode 100644 index 000000000000..32abb75f20f4 --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-load-v4i16.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=hexagon -O0 < %s | FileCheck %s + +; CHECK-LABEL: danny: +; CHECK-DAG: [[T0:r[0-9]+]] = memuh(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memuh(r0+#2) +; CHECK: [[T0]] |= asl([[T1]],#16) +; CHECK-DAG: [[T2:r[0-9]+]] = memuh(r0+#4) +; CHECK-DAG: [[T3:r[0-9]+]] = memuh(r0+#6) +; CHECK: [[T2]] |= asl([[T3]],#16) +; CHECK: combine([[T2]],[[T0]]) +define <4 x i16> @danny(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 2 + ret <4 x i16> %t0 +} + +; CHECK-LABEL: sammy: +; CHECK-DAG: [[T0:r[0-9]+]] = memw(r0+#0) +; CHECK-DAG: [[T1:r[0-9]+]] = memw(r0+#4) +; CHECK: combine([[T1]],[[T0]]) +define <4 x i16> @sammy(<4 x i16>* %p) { + %t0 = load <4 x i16>, <4 x i16>* %p, align 4 + ret <4 x i16> %t0 +} diff --git a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll b/test/CodeGen/Hexagon/vect/vect-v4i16.ll similarity index 100% rename from test/CodeGen/Hexagon/vect/vect-loadv4i16.ll rename to test/CodeGen/Hexagon/vect/vect-v4i16.ll diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir index f853b551e098..c71302d97e2e 100644 --- a/test/CodeGen/MIR/AArch64/target-memoperands.mir +++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir @@ -10,13 +10,17 @@ --- # CHECK-LABEL: name: target_memoperands # CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) +# CHECK: %2(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) # CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) +# CHECK: G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) name: target_memoperands body: | bb.0: %0:_(p0) = COPY %x0 %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) + %2:_(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4) G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) + G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4) RET_ReallyLR ... diff --git a/test/CodeGen/MIR/AMDGPU/fold-multiple.mir b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir new file mode 100644 index 000000000000..a5da33a997d3 --- /dev/null +++ b/test/CodeGen/MIR/AMDGPU/fold-multiple.mir @@ -0,0 +1,40 @@ +# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s +--- | + define amdgpu_kernel void @test() #0 { + ret void + } + + attributes #0 = { nounwind } + +... +--- + +# This used to crash / trigger an assertion, because re-scanning the use list +# after constant-folding the definition of %3 led to the definition of %2 +# being processed twice. + +# CHECK-LABEL: name: test +# CHECK: %2 = V_LSHLREV_B32_e32 2, killed %0, implicit %exec +# CHECK: %4 = V_AND_B32_e32 8, killed %2, implicit %exec + +name: test +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: sreg_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_128 } +body: | + bb.0 (%ir-block.0): + %0 = IMPLICIT_DEF + %1 = S_MOV_B32 2 + %2 = V_LSHLREV_B32_e64 %1, killed %0, implicit %exec + %3 = S_LSHL_B32 %1, killed %1, implicit-def dead %scc + %4 = V_AND_B32_e64 killed %2, killed %3, implicit %exec + %5 = IMPLICIT_DEF + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, implicit %exec + S_ENDPGM + +... 
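For reference, the folding that fold-multiple.mir pins down is the chain %1 = S_MOV_B32 2, then %3 = S_LSHL_B32 %1, %1 (constant-folded to the literal 8), which feeds %4 = V_AND_B32. A rough IR-level sketch of the same shape, hypothetical and not part of the patch:

define i32 @fold_shift_into_and(i32 %x) {
  ; %shl plays the role of %2, the variable shift that must survive;
  ; the mask 8 is the value the folded %3 = S_LSHL_B32 2, 2 becomes.
  %shl = shl i32 %x, 2
  %masked = and i32 %shl, 8
  ret i32 %masked
}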
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll index 4baf499848fd..3501861f5757 100644 --- a/test/CodeGen/MSP430/vararg.ll +++ b/test/CodeGen/MSP430/vararg.ll @@ -39,11 +39,11 @@ entry: ; CHECK-LABEL: va_copy: %vl.addr = alloca i8*, align 2 %vl2 = alloca i8*, align 2 -; CHECK: mov.w r12, 2(r1) +; CHECK-DAG: mov.w r12, 2(r1) store i8* %vl, i8** %vl.addr, align 2 %0 = bitcast i8** %vl2 to i8* %1 = bitcast i8** %vl.addr to i8* -; CHECK-NEXT: mov.w r12, 0(r1) +; CHECK-DAG: mov.w r12, 0(r1) call void @llvm.va_copy(i8* %0, i8* %1) ret void } diff --git a/test/CodeGen/Mips/2008-06-05-Carry.ll b/test/CodeGen/Mips/2008-06-05-Carry.ll index c61e1cdedea7..5e6092fc7848 100644 --- a/test/CodeGen/Mips/2008-06-05-Carry.ll +++ b/test/CodeGen/Mips/2008-06-05-Carry.ll @@ -2,20 +2,21 @@ define i64 @add64(i64 %u, i64 %v) nounwind { entry: +; CHECK-LABEL: add64: ; CHECK: addu -; CHECK: sltu +; CHECK-DAG: sltu +; CHECK-DAG: addu ; CHECK: addu -; CHECK: addu - %tmp2 = add i64 %u, %v + %tmp2 = add i64 %u, %v ret i64 %tmp2 } define i64 @sub64(i64 %u, i64 %v) nounwind { entry: -; CHECK: sub64 +; CHECK-LABEL: sub64 +; CHECK-DAG: sltu +; CHECK-DAG: subu ; CHECK: subu -; CHECK: sltu -; CHECK: addu ; CHECK: subu %tmp2 = sub i64 %u, %v ret i64 %tmp2 diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 5c0415759266..2aa824250d3b 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -59,9 +59,9 @@ entry: ; CHECK-LABEL: f123: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 -; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 ; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 34, 16 @@ -94,4 +94,4 @@ entry: ; MIPS32R2: ori $[[R0:[0-9]+]], $[[R0:[0-9]+]], 8 ; MIPS32R2-NOT: ins {{[[:space:]].*}} ; MIPS64R2N32: ori $[[R0:[0-9]+]], $[[R0:[0-9]+]], 8 -; MIPS64R2N32-NOT: ins {{[[:space:]].*}} \ No newline at end of file +; MIPS64R2N32-NOT: ins {{[[:space:]].*}} diff --git a/test/CodeGen/Mips/dsp-patterns.ll b/test/CodeGen/Mips/dsp-patterns.ll index 837c0d8bfc52..250d3eff37dc 100644 --- a/test/CodeGen/Mips/dsp-patterns.ll +++ b/test/CodeGen/Mips/dsp-patterns.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=mips -mattr=dsp < %s | FileCheck %s -check-prefix=R1 -; RUN: llc -march=mips -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dsp < %s | FileCheck %s -check-prefix=R1 +; RUN: llc -march=mips -mcpu=mips32r2 -mattr=dspr2 < %s | FileCheck %s -check-prefix=R2 ; R1-LABEL: test_lbux: ; R1: lbux ${{[0-9]+}} diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll index fcf129420234..b7cc6fc8ea75 100644 --- a/test/CodeGen/Mips/llcarry.ll +++ b/test/CodeGen/Mips/llcarry.ll @@ -14,9 +14,9 @@ entry: %add = add nsw i64 %1, %0 store i64 %add, i64* @k, align 8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} -; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ret void } @@ -28,8 +28,8 @@ entry: %sub = sub nsw i64 %0, %1 ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, 
${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 +; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} ; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} store i64 %sub, i64* @l, align 8 ret void @@ -41,8 +41,7 @@ entry: %add = add nsw i64 %0, 15 ; 16: addiu ${{[0-9]+}}, 15 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 -; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $24 ; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} store i64 %add, i64* @m, align 8 ret void diff --git a/test/CodeGen/Mips/llvm-ir/add.ll b/test/CodeGen/Mips/llvm-ir/add.ll index a5ecdda94ce2..63884eb03b8c 100644 --- a/test/CodeGen/Mips/llvm-ir/add.ll +++ b/test/CodeGen/Mips/llvm-ir/add.ll @@ -1,35 +1,35 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r5 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP32 +; RUN: -check-prefixes=ALL,R2-R6,GP32,GP32-CMOV ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=ALL,R2-R6,GP32 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64 +; RUN: -check-prefixes=ALL,NOT-R2-R6,GP64,GP64-NOT-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=ALL,R2-R6,GP64 +; RUN: -check-prefixes=ALL,R2-R6,GP64,GP64-R2-R6 ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -O2 -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM32 +; RUN: -check-prefixes=ALL,MMR3,MM32 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips -O2 | FileCheck %s \ ; RUN: -check-prefixes=ALL,MMR6,MM32 ; RUN: llc < %s -march=mips -mcpu=mips64r6 -target-abi n64 -mattr=+micromips -O2 | FileCheck %s \ -; RUN: -check-prefixes=ALL,MMR6,MM64 +; RUN: -check-prefixes=ALL,MM64 ; FIXME: This code sequence is inefficient as it should be 'subu $[[T0]], $zero, $[[T0]'. 
@@ -110,17 +110,17 @@ define signext i64 @add_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: add_i64: - ; GP32: addu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $3, $7 - ; GP32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP32: addu $2, $4, $[[T1]] + ; GP32-DAG: addu $[[T0:[0-9]+]], $4, $6 + ; GP32-DAG: addu $3, $5, $7 + ; GP32: sltu $[[T1:[0-9]+]], $3, $5 + ; GP32: addu $2, $[[T0]], $[[T1]] ; GP64: daddu $2, $4, $5 - ; MM32: addu16 $3, $5, $7 - ; MM32: sltu $[[T0:[0-9]+]], $3, $7 - ; MM32: addu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM32: addu $2, $4, $[[T1]] + ; MM32-DAG: addu16 $3, $5, $7 + ; MM32-DAG: addu16 $[[T0:[0-9]+]], $4, $6 + ; MM32: sltu $[[T1:[0-9]+]], $3, $5 + ; MM32: addu16 $2, $[[T0]], $[[T1]] ; MM64: daddu $2, $4, $5 @@ -132,49 +132,108 @@ define signext i128 @add_i128(i128 signext %a, i128 signext %b) { entry: ; ALL-LABEL: add_i128: - ; GP32: lw $[[T0:[0-9]+]], 28($sp) - ; GP32: addu $[[T1:[0-9]+]], $7, $[[T0]] - ; GP32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; GP32: lw $[[T3:[0-9]+]], 24($sp) - ; GP32: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; GP32: addu $[[T5:[0-9]+]], $6, $[[T4]] - ; GP32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; GP32: lw $[[T7:[0-9]+]], 20($sp) - ; GP32: addu $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; GP32: lw $[[T9:[0-9]+]], 16($sp) - ; GP32: addu $3, $5, $[[T8]] - ; GP32: sltu $[[T10:[0-9]+]], $3, $[[T7]] - ; GP32: addu $[[T11:[0-9]+]], $[[T10]], $[[T9]] - ; GP32: addu $2, $4, $[[T11]] - ; GP32: move $4, $[[T5]] - ; GP32: move $5, $[[T1]] + ; PRE4: move $[[R1:[0-9]+]], $5 + ; PRE4: move $[[R2:[0-9]+]], $4 + ; PRE4: lw $[[R3:[0-9]+]], 24($sp) + ; PRE4: addu $[[R4:[0-9]+]], $6, $[[R3]] + ; PRE4: lw $[[R5:[0-9]+]], 28($sp) + ; PRE4: addu $[[R6:[0-9]+]], $7, $[[R5]] + ; PRE4: sltu $[[R7:[0-9]+]], $[[R6]], $7 + ; PRE4: addu $[[R8:[0-9]+]], $[[R4]], $[[R7]] + ; PRE4: xor $[[R9:[0-9]+]], $[[R8]], $6 + ; PRE4: sltiu $[[R10:[0-9]+]], $[[R9]], 1 + ; PRE4: bnez $[[R10]], $BB5_2 + ; PRE4: sltu $[[R7]], $[[R8]], $6 + ; PRE4: lw $[[R12:[0-9]+]], 20($sp) + ; PRE4: addu $[[R13:[0-9]+]], $[[R1]], $[[R12]] + ; PRE4: lw $[[R14:[0-9]+]], 16($sp) + ; PRE4: addu $[[R15:[0-9]+]], $[[R13]], $[[R7]] + ; PRE4: addu $[[R16:[0-9]+]], $[[R2]], $[[R14]] + ; PRE4: sltu $[[R17:[0-9]+]], $[[R15]], $[[R13]] + ; PRE4: sltu $[[R18:[0-9]+]], $[[R13]], $[[R1]] + ; PRE4: addu $[[R19:[0-9]+]], $[[R16]], $[[R18]] + ; PRE4: addu $2, $[[R19]], $[[R17]] - ; GP64: daddu $3, $5, $7 - ; GP64: sltu $[[T0:[0-9]+]], $3, $7 - ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; GP64: daddu $2, $4, $[[T1]] + ; GP32-CMOV: lw $[[T0:[0-9]+]], 24($sp) + ; GP32-CMOV: addu $[[T1:[0-9]+]], $6, $[[T0]] + ; GP32-CMOV: lw $[[T2:[0-9]+]], 28($sp) + ; GP32-CMOV: addu $[[T3:[0-9]+]], $7, $[[T2]] + ; GP32-CMOV: sltu $[[T4:[0-9]+]], $[[T3]], $7 + ; GP32-CMOV: addu $[[T5:[0-9]+]], $[[T1]], $[[T4]] + ; GP32-CMOV: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: xor $[[T7:[0-9]+]], $[[T5]], $6 + ; GP32-CMOV: movz $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; GP32-CMOV: lw $[[T9:[0-9]+]], 20($sp) + ; GP32-CMOV: addu $[[T10:[0-9]+]], $5, $[[T4]] + ; GP32-CMOV: addu $[[T11:[0-9]+]], $[[T10]], $[[T8]] + ; GP32-CMOV: lw $[[T12:[0-9]+]], 16($sp) + ; GP32-CMOV: sltu $[[T13:[0-9]+]], $[[T11]], $[[T10]] + ; GP32-CMOV: addu $[[T14:[0-9]+]], $4, $[[T12]] + ; GP32-CMOV: sltu $[[T15:[0-9]+]], $[[T10]], $5 + ; GP32-CMOV: addu $[[T16:[0-9]+]], $[[T14]], $[[T15]] + ; GP32-CMOV: addu $[[T17:[0-9]+]], $[[T16]], $[[T13]] + ; GP32-CMOV: move $4, $[[T5]] + ; GP32-CMOV: move $5, $[[T3]] - ; MM32: lw $[[T0:[0-9]+]], 28($sp) - ; MM32: addu $[[T1:[0-9]+]], $7, $[[T0]] 
- ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] - ; MM32: lw $[[T3:[0-9]+]], 24($sp) - ; MM32: addu16 $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T5:[0-9]+]], $6, $[[T4]] - ; MM32: sltu $[[T6:[0-9]+]], $[[T5]], $[[T3]] - ; MM32: lw $[[T7:[0-9]+]], 20($sp) - ; MM32: addu16 $[[T8:[0-9]+]], $[[T6]], $[[T7]] - ; MM32: lw $[[T9:[0-9]+]], 16($sp) - ; MM32: addu16 $[[T10:[0-9]+]], $5, $[[T8]] - ; MM32: sltu $[[T11:[0-9]+]], $[[T10]], $[[T7]] - ; MM32: addu $[[T12:[0-9]+]], $[[T11]], $[[T9]] - ; MM32: addu16 $[[T13:[0-9]+]], $4, $[[T12]] - ; MM32: move $4, $[[T5]] - ; MM32: move $5, $[[T1]] + ; GP64: daddu $[[T0:[0-9]+]], $4, $6 + ; GP64: daddu $[[T1:[0-9]+]], $5, $7 + ; GP64: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; GP64-NOT-R2-R6: dsll $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T4:[0-9]+]], $[[T3]], 32 + ; GP64-R2-R6: dext $[[T4:[0-9]+]], $[[T2]], 0, 32 + ; GP64: daddu $2, $[[T0]], $[[T4]] + + ; MMR3: move $[[T1:[0-9]+]], $5 + ; MMR3-DAG: lw $[[T2:[0-9]+]], 32($sp) + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3-DAG: lw $[[T4:[0-9]+]], 36($sp) + ; MMR3: addu16 $[[T5:[0-9]+]], $7, $[[T4]] + ; MMR3: sltu $[[T6:[0-9]+]], $[[T5]], $7 + ; MMR3: addu16 $[[T7:[0-9]+]], $[[T3]], $[[T6]] + ; MMR3: sltu $[[T8:[0-9]+]], $[[T7]], $6 + ; MMR3: xor $[[T9:[0-9]+]], $[[T7]], $6 + ; MMR3: movz $[[T8]], $[[T6]], $[[T9]] + ; MMR3: lw $[[T10:[0-9]+]], 28($sp) + ; MMR3: addu16 $[[T11:[0-9]+]], $[[T1]], $[[T10]] + ; MMR3: addu16 $[[T12:[0-9]+]], $[[T11]], $[[T8]] + ; MMR3: lw $[[T13:[0-9]+]], 24($sp) + ; MMR3: sltu $[[T14:[0-9]+]], $[[T12]], $[[T11]] + ; MMR3: addu16 $[[T15:[0-9]+]], $4, $[[T13]] + ; MMR3: sltu $[[T16:[0-9]+]], $[[T11]], $[[T1]] + ; MMR3: addu16 $[[T17:[0-9]+]], $[[T15]], $[[T16]] + ; MMR3: addu16 $2, $2, $[[T14]] + + ; MMR6: move $[[T1:[0-9]+]], $5 + ; MMR6: move $[[T2:[0-9]+]], $4 + ; MMR6: lw $[[T3:[0-9]+]], 32($sp) + ; MMR6: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR6: lw $[[T5:[0-9]+]], 36($sp) + ; MMR6: addu16 $[[T6:[0-9]+]], $7, $[[T5]] + ; MMR6: sltu $[[T7:[0-9]+]], $[[T6]], $7 + ; MMR6: addu16 $[[T8:[0-9]+]], $[[T4]], $7 + ; MMR6: sltu $[[T9:[0-9]+]], $[[T8]], $6 + ; MMR6: xor $[[T10:[0-9]+]], $[[T4]], $6 + ; MMR6: sltiu $[[T11:[0-9]+]], $[[T10]], 1 + ; MMR6: seleqz $[[T12:[0-9]+]], $[[T9]], $[[T11]] + ; MMR6: selnez $[[T13:[0-9]+]], $[[T7]], $[[T11]] + ; MMR6: lw $[[T14:[0-9]+]], 24($sp) + ; MMR6: or $[[T15:[0-9]+]], $[[T13]], $[[T12]] + ; MMR6: addu16 $[[T16:[0-9]+]], $[[T2]], $[[T14]] + ; MMR6: lw $[[T17:[0-9]+]], 28($sp) + ; MMR6: addu16 $[[T18:[0-9]+]], $[[T1]], $[[T17]] + ; MMR6: addu16 $[[T19:[0-9]+]], $[[T18]], $[[T15]] + ; MMR6: sltu $[[T20:[0-9]+]], $[[T18]], $[[T1]] + ; MMR6: sltu $[[T21:[0-9]+]], $[[T17]], $[[T18]] + ; MMR6: addu16 $2, $[[T16]], $[[T20]] + ; MMR6: addu16 $2, $[[T20]], $[[T21]] + + ; MM64: daddu $[[T0:[0-9]+]], $4, $6 ; MM64: daddu $3, $5, $7 - ; MM64: sltu $[[T0:[0-9]+]], $3, $7 - ; MM64: daddu $[[T1:[0-9]+]], $[[T0]], $6 - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $3, $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $[[T0]], $[[T3]] %r = add i128 %a, %b ret i128 %r @@ -249,17 +308,16 @@ define signext i32 @add_i32_4(i32 signext %a) { define signext i64 @add_i64_4(i64 signext %a) { ; ALL-LABEL: add_i64_4: - ; GP32: addiu $[[T0:[0-9]+]], $5, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $2, $4, $[[T1]] + ; GP32: addiu $3, $5, 4 + ; GP32: sltu $[[T0:[0-9]+]], $3, $5 + ; GP32: addu $2, $4, $[[T0]] + + ; MM32: 
addiur2 $[[T1:[0-9]+]], $5, 4 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; MM32: addu16 $2, $4, $[[T2]] ; GP64: daddiu $2, $4, 4 - ; MM32: addiu $[[T0:[0-9]+]], $5, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 4 @@ -270,38 +328,67 @@ define signext i64 @add_i64_4(i64 signext %a) { define signext i128 @add_i128_4(i128 signext %a) { ; ALL-LABEL: add_i128_4: - ; GP32: addiu $[[T0:[0-9]+]], $7, 4 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 4 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T1]], $[[T2]], $zero - ; GP32: addu $[[T3:[0-9]+]], $5, $[[T1]] - ; GP32: sltu $[[T1]], $[[T3]], $zero - ; GP32: addu $[[T1]], $4, $[[T1]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 4 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] - ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 4 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] + ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 4 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] - ; MM32: addiu $[[T0:[0-9]+]], $7, 4 - ; MM32: li16 $[[T1:[0-9]+]], 4 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T1]], 0 - ; MM32: sltu $[[T3:[0-9]+]], $[[T2]], $[[T1]] - ; MM32: addu16 $[[T3]], $5, $[[T3]] - ; MM32: sltu $[[T1]], $[[T3]], $[[T1]] - ; MM32: addu16 $[[T1]], $4, $[[T1]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; GP64: daddiu $[[T0:[0-9]+]], $5, 4 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: addiur2 $[[T0:[0-9]+]], $7, 4 + ; MMR3: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T0]], $7 + ; MMR3: addu16 $[[T3:[0-9]+]], $6, $[[T2]] + ; MMR3: sltu $[[T4:[0-9]+]], $[[T3]], $6 + ; MMR3: movz $[[T4]], $[[T2]], $[[T1]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T4]] + ; MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: addiur2 $[[T1:[0-9]+]], $7, 4 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $7 + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 4 - ; MM64: 
daddiu $[[T1:[0-9]+]], $zero, 4 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, $[[T3]] %r = add i128 4, %a ret i128 %r @@ -380,16 +467,15 @@ define signext i64 @add_i64_3(i64 signext %a) { ; ALL-LABEL: add_i64_3: ; GP32: addiu $[[T0:[0-9]+]], $5, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] + ; GP32: sltu $[[T1:[0-9]+]], $[[T0]], $5 ; GP32: addu $2, $4, $[[T1]] ; GP64: daddiu $2, $4, 3 - ; MM32: addiu $[[T0:[0-9]+]], $5, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T2:[0-9]+]], $[[T0]], $[[T1]] - ; MM32: addu $2, $4, $[[T2]] + ; MM32: move $[[T1:[0-9]+]], $5 + ; MM32: addius5 $[[T1]], 3 + ; MM32: sltu $[[T2:[0-9]+]], $[[T1]], $5 + ; MM32: addu16 $2, $4, $[[T2]] ; MM64: daddiu $2, $4, 3 @@ -400,38 +486,70 @@ define signext i64 @add_i64_3(i64 signext %a) { define signext i128 @add_i128_3(i128 signext %a) { ; ALL-LABEL: add_i128_3: - ; GP32: addiu $[[T0:[0-9]+]], $7, 3 - ; GP32: addiu $[[T1:[0-9]+]], $zero, 3 - ; GP32: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP32: addu $[[T2:[0-9]+]], $6, $[[T1]] - ; GP32: sltu $[[T3:[0-9]+]], $[[T2]], $zero - ; GP32: addu $[[T4:[0-9]+]], $5, $[[T3]] - ; GP32: sltu $[[T5:[0-9]+]], $[[T4]], $zero - ; GP32: addu $[[T5]], $4, $[[T5]] - ; GP32: move $4, $[[T2]] - ; GP32: move $5, $[[T0]] + ; PRE4: move $[[T0:[0-9]+]], $5 + ; PRE4: addiu $[[T1:[0-9]+]], $7, 3 + ; PRE4: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; PRE4: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; PRE4: bnez $[[T3]], $BB[[BB0:[0-9_]+]] + ; PRE4: addu $[[T4:[0-9]+]], $6, $[[T2]] + ; PRE4: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; PRE4; $BB[[BB0:[0-9]+]]: + ; PRE4: addu $[[T6:[0-9]+]], $[[T0]], $[[T5]] + ; PRE4: sltu $[[T7:[0-9]+]], $[[T6]], $[[T0]] + ; PRE4: addu $[[T8:[0-9]+]], $4, $[[T7]] + ; PRE4: move $4, $[[T4]] - ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 - ; GP64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; GP64: sltu $[[T1]], $[[T0]], $[[T1]] - ; GP64: daddu $2, $4, $[[T1]] + ; GP32-CMOV: addiu $[[T0:[0-9]+]], $7, 3 + ; GP32-CMOV: sltu $[[T1:[0-9]+]], $[[T0]], $7 + ; GP32-CMOV: addu $[[T2:[0-9]+]], $6, $[[T1]] + ; GP32-CMOV: sltu $[[T3:[0-9]+]], $[[T2]], $6 + ; GP32-CMOV: movz $[[T3]], $[[T1]], $[[T1]] + ; GP32-CMOV: addu $[[T4:[0-9]+]], $5, $[[T3]] + ; GP32-CMOV: sltu $[[T5:[0-9]+]], $[[T4]], $5 + ; GP32-CMOV: addu $[[T7:[0-9]+]], $4, $[[T5]] + ; GP32-CMOV: move $4, $[[T2]] + ; GP32-CMOV: move $5, $[[T0]] - ; MM32: addiu $[[T0:[0-9]+]], $7, 3 - ; MM32: li16 $[[T1:[0-9]+]], 3 - ; MM32: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM32: addu16 $[[T2:[0-9]+]], $6, $[[T1]] - ; MM32: li16 $[[T3:[0-9]+]], 0 - ; MM32: sltu $[[T4:[0-9]+]], $[[T2]], $[[T3]] - ; MM32: addu16 $[[T4]], $5, $[[T4]] - ; MM32: sltu $[[T5:[0-9]+]], $[[T4]], $[[T3]] - ; MM32: addu16 $[[T5]], $4, $[[T5]] - ; MM32: move $4, $[[T2]] - ; MM32: move $5, $[[T0]] + ; GP64: daddiu $[[T0:[0-9]+]], $5, 3 + ; GP64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + + ; GP64-NOT-R2-R6: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; GP64-NOT-R2-R6: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; GP64-R2-R6: dext $[[T3:[0-9]+]], $[[T1]], 0, 32 + + ; GP64: daddu $2, $4, $[[T3]] + + ; MMR3: move $[[T1:[0-9]+]], $7 + ; MMR3: addius5 $[[T1]], 3 + ; MMR3: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR3: sltu $[[T3:[0-9]+]], $[[T1]], $7 + ; MMR3: addu16 $[[T4:[0-9]+]], $6, $[[T3]] + ; MMR3: sltu $[[T5:[0-9]+]], $[[T4]], $6 + ; MMR3: movz $[[T5]], $[[T3]], $[[T2]] + ; MMR3: addu16 $[[T6:[0-9]+]], $5, $[[T5]] + ; 
MMR3: sltu $[[T7:[0-9]+]], $[[T6]], $5 + ; MMR3: addu16 $2, $4, $[[T7]] + + ; MMR6: move $[[T1:[0-9]+]], $7 + ; MMR6: addius5 $[[T1]], 3 + ; MMR6: sltu $[[T2:[0-9]+]], $[[T1]], $7 + ; MMR6: xori $[[T3:[0-9]+]], $[[T2]], 1 + ; MMR6: selnez $[[T4:[0-9]+]], $[[T2]], $[[T3]] + ; MMR6: addu16 $[[T5:[0-9]+]], $6, $[[T2]] + ; MMR6: sltu $[[T6:[0-9]+]], $[[T5]], $6 + ; MMR6: seleqz $[[T7:[0-9]+]], $[[T6]], $[[T3]] + ; MMR6: or $[[T8:[0-9]+]], $[[T4]], $[[T7]] + ; MMR6: addu16 $[[T9:[0-9]+]], $5, $[[T8]] + ; MMR6: sltu $[[T10:[0-9]+]], $[[T9]], $5 + ; MMR6: addu16 $[[T11:[0-9]+]], $4, $[[T10]] + ; MMR6: move $4, $[[T5]] + ; MMR6: move $5, $[[T1]] ; MM64: daddiu $[[T0:[0-9]+]], $5, 3 - ; MM64: daddiu $[[T1:[0-9]+]], $zero, 3 - ; MM64: sltu $[[T1]], $[[T0]], $[[T1]] - ; MM64: daddu $2, $4, $[[T1]] + ; MM64: sltu $[[T1:[0-9]+]], $[[T0]], $5 + ; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 + ; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 + ; MM64: daddu $2, $4, $[[T3]] %r = add i128 3, %a ret i128 %r diff --git a/test/CodeGen/Mips/llvm-ir/sub.ll b/test/CodeGen/Mips/llvm-ir/sub.ll index a730063c552f..655addb10a64 100644 --- a/test/CodeGen/Mips/llvm-ir/sub.ll +++ b/test/CodeGen/Mips/llvm-ir/sub.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM,PRE4 ; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ ; RUN: -check-prefixes=NOT-R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \ @@ -11,25 +11,25 @@ ; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ ; RUN: -check-prefixes=R2-R6,GP32,GP32-NOT-MM,NOT-MM ; RUN: llc < %s -march=mips -mcpu=mips32r3 -mattr=+micromips -verify-machineinstrs | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR3 ; RUN: llc < %s -march=mips -mcpu=mips32r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP32-MM,GP32,MM +; RUN: -check-prefixes=GP32-MM,GP32,MM32,MMR6 ; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ -; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=NOT-R2-R6,GP64,NOT-MM,GP64-NOT-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r3 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r5 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ -; RUN: -check-prefixes=R2-R6,GP64,NOT-MM +; RUN: -check-prefixes=R2-R6,GP64,NOT-MM,GP64-R2 ; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -mattr=+micromips | FileCheck %s \ -; RUN: -check-prefixes=GP64,MM +; RUN: -check-prefixes=GP64,MM64 define signext i1 @sub_i1(i1 signext %a, i1 signext %b) { entry: @@ -100,10 +100,15 @@ define signext i64 @sub_i64(i64 signext %a, i64 signext %b) { entry: ; ALL-LABEL: sub_i64: - ; GP32-NOT-MM subu $3, $5, $7 - ; GP32: sltu $[[T0:[0-9]+]], $5, $7 - ; GP32: 
addu $[[T1:[0-9]+]], $[[T0]], $6
-  ; GP32: subu $2, $4, $[[T1]]
+  ; GP32-NOT-MM: sltu $[[T0:[0-9]+]], $5, $7
+  ; GP32-NOT-MM: subu $2, $4, $6
+  ; GP32-NOT-MM: subu $2, $2, $[[T0]]
+  ; GP32-NOT-MM: subu $3, $5, $7
+
+  ; MM32: sltu $[[T0:[0-9]+]], $5, $7
+  ; MM32: subu16 $3, $4, $6
+  ; MM32: subu16 $2, $3, $[[T0]]
+  ; MM32: subu16 $3, $5, $7
 
   ; GP64: dsubu $2, $4, $5
 
@@ -115,42 +120,109 @@ define signext i128 @sub_i128(i128 signext %a, i128 signext %b) {
 entry:
   ; ALL-LABEL: sub_i128:
 
-  ; GP32-NOT-MM: lw $[[T0:[0-9]+]], 20($sp)
-  ; GP32-NOT-MM: sltu $[[T1:[0-9]+]], $5, $[[T0]]
-  ; GP32-NOT-MM: lw $[[T2:[0-9]+]], 16($sp)
-  ; GP32-NOT-MM: addu $[[T3:[0-9]+]], $[[T1]], $[[T2]]
-  ; GP32-NOT-MM: lw $[[T4:[0-9]+]], 24($sp)
-  ; GP32-NOT-MM: lw $[[T5:[0-9]+]], 28($sp)
-  ; GP32-NOT-MM: subu $[[T6:[0-9]+]], $7, $[[T5]]
-  ; GP32-NOT-MM: subu $2, $4, $[[T3]]
-  ; GP32-NOT-MM: sltu $[[T8:[0-9]+]], $6, $[[T4]]
-  ; GP32-NOT-MM: addu $[[T9:[0-9]+]], $[[T8]], $[[T0]]
-  ; GP32-NOT-MM: subu $3, $5, $[[T9]]
-  ; GP32-NOT-MM: sltu $[[T10:[0-9]+]], $7, $[[T5]]
-  ; GP32-NOT-MM: addu $[[T11:[0-9]+]], $[[T10]], $[[T4]]
-  ; GP32-NOT-MM: subu $4, $6, $[[T11]]
-  ; GP32-NOT-MM: move $5, $[[T6]]
+; PRE4: lw $[[T0:[0-9]+]], 24($sp)
+; PRE4: lw $[[T1:[0-9]+]], 28($sp)
+; PRE4: sltu $[[T2:[0-9]+]], $7, $[[T1]]
+; PRE4: xor $[[T3:[0-9]+]], $6, $[[T0]]
+; PRE4: sltiu $[[T4:[0-9]+]], $[[T3]], 1
+; PRE4: bnez $[[T4]]
+; PRE4: move $[[T5:[0-9]+]], $[[T2]]
+; PRE4: sltu $[[T5]], $6, $[[T0]]
 
-  ; GP32-MM: lw $[[T0:[0-9]+]], 20($sp)
-  ; GP32-MM: sltu $[[T1:[0-9]+]], $[[T2:[0-9]+]], $[[T0]]
-  ; GP32-MM: lw $[[T3:[0-9]+]], 16($sp)
-  ; GP32-MM: addu $[[T3]], $[[T1]], $[[T3]]
-  ; GP32-MM: lw $[[T4:[0-9]+]], 24($sp)
-  ; GP32-MM: lw $[[T5:[0-9]+]], 28($sp)
-  ; GP32-MM: subu $[[T1]], $7, $[[T5]]
-  ; GP32-MM: subu16 $[[T3]], $[[T6:[0-9]+]], $[[T3]]
-  ; GP32-MM: sltu $[[T6]], $6, $[[T4]]
-  ; GP32-MM: addu16 $[[T0]], $[[T6]], $[[T0]]
-  ; GP32-MM: subu16 $[[T0]], $5, $[[T0]]
-  ; GP32-MM: sltu $[[T6]], $7, $[[T5]]
-  ; GP32-MM: addu $[[T6]], $[[T6]], $[[T4]]
-  ; GP32-MM: subu16 $[[T6]], $6, $[[T6]]
-  ; GP32-MM: move $[[T2]], $[[T1]]
+; PRE4: lw $[[T6:[0-9]+]], 20($sp)
+; PRE4: subu $[[T7:[0-9]+]], $5, $[[T6]]
+; PRE4: subu $[[T8:[0-9]+]], $[[T7]], $[[T5]]
+; PRE4: sltu $[[T9:[0-9]+]], $[[T7]], $[[T5]]
+; PRE4: sltu $[[T10:[0-9]+]], $5, $[[T6]]
+; PRE4: lw $[[T11:[0-9]+]], 16($sp)
+; PRE4: subu $[[T12:[0-9]+]], $4, $[[T11]]
+; PRE4: subu $[[T13:[0-9]+]], $[[T12]], $[[T10]]
+; PRE4: subu $[[T14:[0-9]+]], $[[T13]], $[[T9]]
+; PRE4: subu $[[T15:[0-9]+]], $6, $[[T0]]
+; PRE4: subu $[[T16:[0-9]+]], $[[T15]], $[[T2]]
+; PRE4: subu $5, $7, $[[T1]]
 
-  ; GP64: dsubu $3, $5, $7
-  ; GP64: sltu $[[T0:[0-9]+]], $5, $7
-  ; GP64: daddu $[[T1:[0-9]+]], $[[T0]], $6
-  ; GP64: dsubu $2, $4, $[[T1]]
+; MMR3: lw $[[T1:[0-9]+]], 48($sp)
+; MMR3: sltu $[[T2:[0-9]+]], $6, $[[T1]]
+; MMR3: xor $[[T3:[0-9]+]], $6, $[[T1]]
+; MMR3: lw $[[T4:[0-9]+]], 52($sp)
+; MMR3: sltu $[[T5:[0-9]+]], $7, $[[T4]]
+; MMR3: movz $[[T6:[0-9]+]], $[[T5]], $[[T3]]
+; MMR3: lw $[[T7:[0-9]+]], 44($sp)
+; MMR3: subu16 $[[T8:[0-9]+]], $5, $[[T7]]
+; MMR3: subu16 $[[T9:[0-9]+]], $[[T8]], $[[T6]]
+; MMR3: sltu $[[T10:[0-9]+]], $[[T8]], $[[T2]]
+; MMR3: sltu $[[T11:[0-9]+]], $5, $[[T7]]
+; MMR3: lw $[[T12:[0-9]+]], 40($sp)
+; MMR3: lw $[[T13:[0-9]+]], 12($sp)
+; MMR3: subu16 $[[T14:[0-9]+]], $[[T13]], $[[T12]]
+; MMR3: subu16 $[[T15:[0-9]+]], $[[T14]], $[[T11]]
+; MMR3: subu16 $[[T16:[0-9]+]], $[[T15]], $[[T10]]
+; MMR3: subu16 $[[T17:[0-9]+]], $6, $[[T1]]
+; MMR3: subu16 $[[T18:[0-9]+]], $[[T17]], $7
+; MMR3: lw $[[T19:[0-9]+]], 8($sp) +; MMR3: lw $[[T20:[0-9]+]], 0($sp) +; MMR3: subu16 $5, $[[T19]], $[[T20]] + +; MMR6: move $[[T0:[0-9]+]], $7 +; MMR6: sw $[[T0]], 8($sp) +; MMR6: move $[[T1:[0-9]+]], $5 +; MMR6: sw $4, 12($sp) +; MMR6: lw $[[T2:[0-9]+]], 48($sp) +; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]] +; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]] +; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1 +; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]] +; MMR6: lw $[[T7:[0-9]+]], 52($sp) +; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]] +; MMR6: selnez $[[T9:[0-9]+]], $[[T8]], $[[T5]] +; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]] +; MMR6: lw $[[T11:[0-9]+]], 44($sp) +; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]] +; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]] +; MMR6: lw $[[T18:[0-9]+]], 40($sp) +; MMR6: lw $[[T19:[0-9]+]], 12($sp) +; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]] +; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]] +; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]] +; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]] +; MMR6: subu16 $4, $[[T23]], $5 +; MMR6: lw $[[T24:[0-9]+]], 8($sp) +; MMR6: lw $[[T25:[0-9]+]], 0($sp) +; MMR6: subu16 $5, $[[T24]], $[[T25]] +; MMR6: lw $3, 4($sp) + +; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero +; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1. +; These should be combined away. + +; GP64-NOT-R2: dsubu $1, $4, $6 +; GP64-NOT-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-NOT-R2: dsll $[[T1:[0-9]+]], $[[T0]], 32 +; GP64-NOT-R2: dsrl $[[T2:[0-9]+]], $[[T1]], 32 +; GP64-NOT-R2: dsubu $2, $1, $[[T2]] +; GP64-NOT-R2: dsubu $3, $5, $7 + +; FIXME: Likewise for the sltu, dext here. + +; GP64-R2: dsubu $1, $4, $6 +; GP64-R2: sltu $[[T0:[0-9]+]], $5, $7 +; GP64-R2: dext $[[T1:[0-9]+]], $[[T0]], 0, 32 +; GP64-R2: dsubu $2, $1, $[[T1]] +; GP64-R2: dsubu $3, $5, $7 + +; FIXME: Again, redundant sign extension. Also, microMIPSR6 has the +; dext instruction which should be used here. 
+ +; MM64: dsubu $[[T0:[0-9]+]], $4, $6 +; MM64: sltu $[[T1:[0-9]+]], $5, $7 +; MM64: dsll $[[T2:[0-9]+]], $[[T1]], 32 +; MM64: dsrl $[[T3:[0-9]+]], $[[T2]], 32 +; MM64: dsubu $2, $[[T0]], $[[T3]] +; MM64: dsubu $3, $5, $7 +; MM64: jr $ra %r = sub i128 %a, %b ret i128 %r diff --git a/test/CodeGen/Mips/long-calls.ll b/test/CodeGen/Mips/long-calls.ll new file mode 100644 index 000000000000..8a95e9b9307d --- /dev/null +++ b/test/CodeGen/Mips/long-calls.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=mips -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips -mattr=+long-calls,-noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s + +; RUN: llc -march=mips64 -target-abi n32 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n32 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON32 %s + +; RUN: llc -march=mips64 -target-abi n64 -mattr=-long-calls %s -o - \ +; RUN: | FileCheck -check-prefix=OFF %s +; RUN: llc -march=mips64 -target-abi n64 -mattr=+long-calls,+noabicalls %s -o - \ +; RUN: | FileCheck -check-prefix=ON64 %s + +declare void @callee() +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) + +@val = internal unnamed_addr global [20 x i32] zeroinitializer, align 4 + +define void @caller() { + +; Use `jal` instruction with R_MIPS_26 relocation. +; OFF: jal callee +; OFF: jal memset + +; Save the `callee` and `memset` addresses in $25 register +; and use `jalr` for the jumps. +; ON32: lui $1, %hi(callee) +; ON32: addiu $25, $1, %lo(callee) +; ON32: jalr $25 + +; ON32: addiu $1, $zero, %lo(memset) +; ON32: lui $2, %hi(memset) +; ON32: addu $25, $2, $1 +; ON32: jalr $25 + +; ON64: lui $1, %highest(callee) +; ON64: daddiu $1, $1, %higher(callee) +; ON64: daddiu $1, $1, %hi(callee) +; ON64: daddiu $25, $1, %lo(callee) +; ON64: jalr $25 + +; ON64: daddiu $1, $zero, %higher(memset) +; ON64: lui $2, %highest(memset) +; ON64: lui $2, %hi(memset) +; ON64: daddiu $2, $zero, %lo(memset) +; ON64: daddu $25, $1, $2 +; ON64: jalr $25 + + call void @callee() + call void @llvm.memset.p0i8.i32(i8* bitcast ([20 x i32]* @val to i8*), i8 0, i32 80, i32 4, i1 false) + ret void +} diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll index 7baba005a072..3e1a2e8b9708 100644 --- a/test/CodeGen/Mips/madd-msub.ll +++ b/test/CodeGen/Mips/madd-msub.ll @@ -25,11 +25,11 @@ ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 -; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $[[T3]] -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T4:[0-9]+]], $6, 31 +; 32R6-DAG: addu $[[T5:[0-9]+]], $[[T3]], $[[T4]] +; 32R6-DAG: addu $2, $[[T5]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -71,7 +71,7 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $6 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $6 +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $[[T0]] ; FIXME: There's a redundant move here. 
We should remove it ; 32R6-DAG: muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $2, $[[T3]], $[[T2]] @@ -109,10 +109,10 @@ entry: ; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} ; 32R6-DAG: addu $[[T1:[0-9]+]], $[[T0]], $7 -; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $7 -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T2]], $6 -; 32R6-DAG: muh $[[T5:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: addu $2, $[[T5]], $[[T4]] +; 32R6-DAG: sltu $[[T2:[0-9]+]], $[[T1]], $1 +; 32R6-DAG: muh $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $6 +; 32R6-DAG: addu $2, $[[T4]], $[[T2]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -134,6 +134,17 @@ entry: ret i64 %add } +; ALL-LABEL: madd4 +; ALL-NOT: madd ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @madd4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %add = add nsw i32 %c, %mul + + ret i32 %add +} + ; ALL-LABEL: msub1: ; 32-DAG: sra $[[T0:[0-9]+]], $6, 31 @@ -148,13 +159,13 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T3:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T4:[0-9]+]], $[[T3]], $[[T0]] -; 32R6-DAG: sra $[[T5:[0-9]+]], $6, 31 -; 32R6-DAG: subu $2, $[[T5]], $[[T4]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sra $[[T3:[0-9]+]], $6, 31 +; 32R6-DAG: subu $[[T4:[0-9]+]], $[[T3]], $[[T2]] +; 32R6-DAG: subu $2, $[[T4]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -194,13 +205,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muhu $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} - -; 32R6-DAG: sltu $[[T2:[0-9]+]], $6, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: negu $2, $[[T3]] -; 32R6-DAG: subu $3, $6, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $6, $[[T0]] +; 32R6-DAG: muhu $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: negu $[[T3:[0-9]+]], $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $6, $[[T0]] ; 64-DAG: d[[m:m]]ult $5, $4 ; 64-DAG: [[m]]flo $[[T0:[0-9]+]] @@ -234,12 +244,12 @@ entry: ; DSP-DAG: mfhi $2, $[[AC]] ; DSP-DAG: mflo $3, $[[AC]] -; 32R6-DAG: muh $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: mul $[[T1:[0-9]+]], ${{[45]}}, ${{[45]}} -; 32R6-DAG: sltu $[[T2:[0-9]+]], $7, $[[T1]] -; 32R6-DAG: addu $[[T3:[0-9]+]], $[[T2]], $[[T0]] -; 32R6-DAG: subu $2, $6, $[[T3]] -; 32R6-DAG: subu $3, $7, $[[T1]] +; 32R6-DAG: mul $[[T0:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: sltu $[[T1:[0-9]+]], $7, $[[T0]] +; 32R6-DAG: muh $[[T2:[0-9]+]], ${{[45]}}, ${{[45]}} +; 32R6-DAG: subu $[[T3:[0-9]+]], $6, $[[T2]] +; 32R6-DAG: subu $2, $[[T3]], $[[T1]] +; 32R6-DAG: subu $3, $7, $[[T0]] ; 64-DAG: sll $[[T0:[0-9]+]], $4, 0 ; 64-DAG: sll $[[T1:[0-9]+]], $5, 0 @@ -260,3 +270,14 @@ entry: %sub = sub nsw i64 %c, %mul ret i64 %sub } + +; ALL-LABEL: msub4 +; ALL-NOT: msub ${{[0-9]+}}, ${{[0-9]+}} + +define i32 @msub4(i32 %a, i32 %b, i32 %c) { +entry: + %mul = mul nsw i32 %a, %b + %sub = sub nsw i32 %c, %mul + + ret i32 %sub +} diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll index ac69dc913c18..b3ed8bdd3b9a 
100644 --- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll +++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll @@ -1,21 +1,21 @@ ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r5 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR5,MIPS32-O32,MIPS32R5-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N32,MIPS64R5-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N64,MIPS64R5-N64 ; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r6 \ -; RUN: -mattr=+fp64,+msa < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS32,MIPSR6,MIPSR6-O32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n32 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N32,MIPSR6-N32 ; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \ -; RUN: -mattr=+fp64,+msa -target-abi n64 < %s | FileCheck %s \ +; RUN: -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \ ; RUN: --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N64,MIPSR6-N64 diff --git a/test/CodeGen/PowerPC/PR33671.ll b/test/CodeGen/PowerPC/PR33671.ll new file mode 100644 index 000000000000..0edd2e8daff4 --- /dev/null +++ b/test/CodeGen/PowerPC/PR33671.ll @@ -0,0 +1,32 @@ +; Function Attrs: norecurse nounwind +; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 < %s | FileCheck %s +define void @test1(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 4 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 4 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + ret void +; CHECK-LABEL: test1 +; CHECK: lxv [[LD:[0-9]+]], 16(3) +; CHECK: stxv [[LD]], 16(4) +} + +; Function Attrs: norecurse nounwind +define void @test2(i32* nocapture readonly %arr, i32* nocapture %arrTo) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %arrTo, i64 1 + %0 = bitcast i32* %arrayidx to <4 x i32>* + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2 + %1 = bitcast i32* %arrayidx1 to <4 x i32>* + %2 = load <4 x i32>, <4 x i32>* %1, align 16 + store <4 x i32> %2, <4 x i32>* %0, align 16 + ret void +; CHECK-LABEL: test2 +; CHECK: addi 3, 3, 8 +; CHECK: lxvx [[LD:[0-9]+]], 0, 3 +; CHECK: addi 3, 4, 4 +; CHECK: stxvx [[LD]], 0, 3 +} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index 60bec4d18f12..3ad432872c0e 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1018,13 +1018,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}} -; P9BE-DAG: lxv +; P9BE-DAG: lxvx {{v[0-9]+}} +; P9BE-DAG: lxvx 
; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}} -; P9LE-DAG: lxv +; P9LE-DAG: lxvx {{v[0-9]+}} +; P9LE-DAG: lxvx ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 @@ -1584,16 +1584,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspsxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -2177,12 +2177,14 @@ entry: ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3) -; P9BE-DAG: lxv +; P9BE-DAG: addi r3, r3, -12 +; P9BE-DAG: lxvx {{v[0-9]+}}, 0, r3 +; P9BE-DAG: lxvx ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9LE-DAG: addi r3, r3, -12 +; P9LE-DAG: lxvx {{v[0-9]+}}, 0, r3 ; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr @@ -2742,16 +2744,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui -; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9BE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspuxws v2, v2 -; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) -; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) +; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -3466,9 +3468,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4370,9 +4372,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull -; P9BE: lxv v2 +; P9BE: lxvx v2 ; P9BE: blr -; P9LE: lxv v2 +; P9LE: lxvx v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 90dd1d84fc23..6d19d7f0d629 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; FIXME: li [[R1:r[0-9]+]], 1 ; FIXME: li [[R2:r[0-9]+]], 0 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] -; CHECK-P9: lxv [[V1:v[0-9]+]] +; CHECK-P9: lxvx [[V1:v[0-9]+]] ; CHECK-P9: vadduqm 
v2, v2, [[V1]] ; CHECK-P9: blr @@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxv v2 -; CHECK-P9-DAG: lxv v3 +; CHECK-P9-DAG: lxvx v2 +; CHECK-P9-DAG: lxvx v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index e7640cab6aef..d573441f2cc9 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -33,11 +33,11 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] define void @bar1() { entry: @@ -56,9 +56,9 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxv [[REG5]] +; CHECK-P9: stxvx [[REG5]] diff --git a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll index 0c29b6adad77..1ca679f474c3 100644 --- a/test/CodeGen/PowerPC/vsx-p9.ll +++ b/test/CodeGen/PowerPC/vsx-p9.ll @@ -36,8 +36,8 @@ entry: %1 = load <16 x i8>, <16 x i8>* @ucb, align 16 %add.i = add <16 x i8> %1, %0 tail call void (...) @sink(<16 x i8> %add.i) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -45,8 +45,8 @@ entry: %3 = load <16 x i8>, <16 x i8>* @scb, align 16 %add.i22 = add <16 x i8> %3, %2 tail call void (...) @sink(<16 x i8> %add.i22) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddubm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -54,8 +54,8 @@ entry: %5 = load <8 x i16>, <8 x i16>* @usb, align 16 %add.i21 = add <8 x i16> %5, %4 tail call void (...) @sink(<8 x i16> %add.i21) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -63,8 +63,8 @@ entry: %7 = load <8 x i16>, <8 x i16>* @ssb, align 16 %add.i20 = add <8 x i16> %7, %6 tail call void (...) @sink(<8 x i16> %add.i20) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduhm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -72,8 +72,8 @@ entry: %9 = load <4 x i32>, <4 x i32>* @uib, align 16 %add.i19 = add <4 x i32> %9, %8 tail call void (...) @sink(<4 x i32> %add.i19) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -81,8 +81,8 @@ entry: %11 = load <4 x i32>, <4 x i32>* @sib, align 16 %add.i18 = add <4 x i32> %11, %10 tail call void (...) @sink(<4 x i32> %add.i18) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduwm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -90,8 +90,8 @@ entry: %13 = load <2 x i64>, <2 x i64>* @ullb, align 16 %add.i17 = add <2 x i64> %13, %12 tail call void (...) 
@sink(<2 x i64> %add.i17) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -99,8 +99,8 @@ entry: %15 = load <2 x i64>, <2 x i64>* @sllb, align 16 %add.i16 = add <2 x i64> %15, %14 tail call void (...) @sink(<2 x i64> %add.i16) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vaddudm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -108,8 +108,8 @@ entry: %17 = load <1 x i128>, <1 x i128>* @uxb, align 16 %add.i15 = add <1 x i128> %17, %16 tail call void (...) @sink(<1 x i128> %add.i15) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -117,8 +117,8 @@ entry: %19 = load <1 x i128>, <1 x i128>* @sxb, align 16 %add.i14 = add <1 x i128> %19, %18 tail call void (...) @sink(<1 x i128> %add.i14) -; CHECK: lxv 34, 0(3) -; CHECK: lxv 35, 0(4) +; CHECK: lxvx 34, 0, 3 +; CHECK: lxvx 35, 0, 4 ; CHECK: vadduqm 2, 3, 2 ; CHECK: stxv 34, ; CHECK: bl sink @@ -126,8 +126,8 @@ entry: %21 = load <4 x float>, <4 x float>* @vfb, align 16 %add.i13 = fadd <4 x float> %20, %21 tail call void (...) @sink(<4 x float> %add.i13) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvaddsp 34, 0, 1 ; CHECK: stxv 34, ; CHECK: bl sink @@ -135,8 +135,8 @@ entry: %23 = load <2 x double>, <2 x double>* @vdb, align 16 %add.i12 = fadd <2 x double> %22, %23 tail call void (...) @sink(<2 x double> %add.i12) -; CHECK: lxv 0, 0(3) -; CHECK: lxv 1, 0(4) +; CHECK: lxvx 0, 0, 3 +; CHECK: lxvx 1, 0, 4 ; CHECK: xvadddp 0, 0, 1 ; CHECK: stxv 0, ; CHECK: bl sink diff --git a/test/CodeGen/SPARC/soft-mul-div.ll b/test/CodeGen/SPARC/soft-mul-div.ll new file mode 100644 index 000000000000..7c453dd35be7 --- /dev/null +++ b/test/CodeGen/SPARC/soft-mul-div.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=sparc -mcpu=v7 -O0 < %s | FileCheck %s + +define i32 @test_mul32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_mul32 + ; CHECK: call .umul + %m = mul i32 %a, %b + ret i32 %m +} + +define i16 @test_mul16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_mul16 + ; CHECK: call .umul + %m = mul i16 %a, %b + ret i16 %m +} + +define i8 @test_mul8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_mul8 + ; CHECK: call .umul + %m = mul i8 %a, %b + ret i8 %m +} + +define i32 @test_sdiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_sdiv32 + ; CHECK: call .div + %d = sdiv i32 %a, %b + ret i32 %d +} + +define i16 @test_sdiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_sdiv16 + ; CHECK: call .div + %d = sdiv i16 %a, %b + ret i16 %d +} + +define i8 @test_sdiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_sdiv8 + ; CHECK: call .div + %d = sdiv i8 %a, %b + ret i8 %d +} + +define i32 @test_udiv32(i32 %a, i32 %b) #0 { + ; CHECK-LABEL: test_udiv32 + ; CHECK: call .udiv + %d = udiv i32 %a, %b + ret i32 %d +} + +define i16 @test_udiv16(i16 %a, i16 %b) #0 { + ; CHECK-LABEL: test_udiv16 + ; CHECK: call .udiv + %d = udiv i16 %a, %b + ret i16 %d +} + +define i8 @test_udiv8(i8 %a, i8 %b) #0 { + ; CHECK-LABEL: test_udiv8 + ; CHECK: call .udiv + %d = udiv i8 %a, %b + ret i8 %d +} + diff --git a/test/CodeGen/SystemZ/branch-11.ll b/test/CodeGen/SystemZ/branch-11.ll new file mode 100644 index 000000000000..ce7b3ef267b4 --- /dev/null +++ b/test/CodeGen/SystemZ/branch-11.ll @@ -0,0 +1,56 @@ +; Test indirect jumps on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define i32 @f1(i32 %x, i32 %y, i32 %op) { +; CHECK-LABEL: f1: +; CHECK: ahi %r4, -1 +; CHECK: clibh %r4, 5, 0(%r14) +; CHECK: llgfr [[OP64:%r[0-5]]], %r4 +; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3 +; CHECK: larl [[BASE:%r[1-5]]] +; CHECK: bi 0([[BASE]],[[INDEX]]) +entry: + switch i32 %op, label %exit [ + i32 1, label %b.add + i32 2, label %b.sub + i32 3, label %b.and + i32 4, label %b.or + i32 5, label %b.xor + i32 6, label %b.mul + ] + +b.add: + %add = add i32 %x, %y + br label %exit + +b.sub: + %sub = sub i32 %x, %y + br label %exit + +b.and: + %and = and i32 %x, %y + br label %exit + +b.or: + %or = or i32 %x, %y + br label %exit + +b.xor: + %xor = xor i32 %x, %y + br label %exit + +b.mul: + %mul = mul i32 %x, %y + br label %exit + +exit: + %res = phi i32 [ %x, %entry ], + [ %add, %b.add ], + [ %sub, %b.sub ], + [ %and, %b.and ], + [ %or, %b.or ], + [ %xor, %b.xor ], + [ %mul, %b.mul ] + ret i32 %res +} diff --git a/test/CodeGen/SystemZ/fp-abs-03.ll b/test/CodeGen/SystemZ/fp-abs-03.ll new file mode 100644 index 000000000000..cab6c116bc08 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-03.ll @@ -0,0 +1,43 @@ +; Test floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call float @llvm.fabs.f32(float %f) + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lpdfr %f0, %f0 +; CHECK: br %r14 + %res = call double @llvm.fabs.f64(double %f) + ret double %res +} + +; Test f128. With the loads and stores, a pure absolute would probably +; be better implemented using an NI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. +declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflpxb [[POSREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[POSREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %abs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-abs-04.ll b/test/CodeGen/SystemZ/fp-abs-04.ll new file mode 100644 index 000000000000..606bce3de36e --- /dev/null +++ b/test/CodeGen/SystemZ/fp-abs-04.ll @@ -0,0 +1,46 @@ +; Test negated floating-point absolute on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +declare float @llvm.fabs.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call float @llvm.fabs.f32(float %f) + %res = fsub float -0.0, %abs + ret float %res +} + +; Test f64. +declare double @llvm.fabs.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lndfr %f0, %f0 +; CHECK: br %r14 + %abs = call double @llvm.fabs.f64(double %f) + %res = fsub double -0.0, %abs + ret double %res +} + +; Test f128. With the loads and stores, a pure negative-absolute would +; probably be better implemented using an OI on the upper byte. Do some +; extra processing so that using FPRs is unequivocally better. 
+declare fp128 @llvm.fabs.f128(fp128 %f) +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflnxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %abs = call fp128 @llvm.fabs.f128(fp128 %orig) + %negabs = fsub fp128 0xL00000000000000008000000000000000, %abs + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %negabs, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-add-01.ll b/test/CodeGen/SystemZ/fp-add-01.ll index 5b0ed0513a37..219607d628d7 100644 --- a/test/CodeGen/SystemZ/fp-add-01.ll +++ b/test/CodeGen/SystemZ/fp-add-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point addition. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: aeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: aeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-add-04.ll b/test/CodeGen/SystemZ/fp-add-04.ll new file mode 100644 index 000000000000..186f37ca5182 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-add-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point addition on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fadd fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll index 075c7aa3dd84..146b16bc695f 100644 --- a/test/CodeGen/SystemZ/fp-cmp-01.ll +++ b/test/CodeGen/SystemZ/fp-cmp-01.ll @@ -1,7 +1,10 @@ ; Test 32-bit floating-point comparison. The tests assume a z10 implementation ; of select, using conditional branches rather than LOCGR. 
; -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @foo() @@ -9,8 +12,9 @@ declare float @foo() define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { ; CHECK-LABEL: f1: ; CHECK: cebr %f0, %f2 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f1, %f2 %res = select i1 %cond, i64 %a, i64 %b @@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, float %f1, float %f2) { define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) { ; CHECK-LABEL: f2: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f2 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, float %f1, float *%ptr) { define i64 @f3(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f3: ; CHECK: ceb %f0, 4092(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f4: ; CHECK: aghi %r4, 4096 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, float %f1, float *%base) { ; CHECK-LABEL: f5: ; CHECK: aghi %r4, -4 ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r5, 2 ; CHECK: ceb %f0, 400(%r1,%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %ptr1 = getelementptr float, float *%base, i64 %index %ptr2 = getelementptr float, float *%ptr1, i64 100 @@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: ceb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 @@ -153,8 +162,9 @@ define float @f7(float *%ptr0) { define i64 @f8(i64 %a, i64 %b, float %f) { ; CHECK-LABEL: f8: ; CHECK: ltebr %f0, %f0 -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %cond = fcmp oeq float %f, 0.0 %res = select i1 %cond, i64 %a, i64 %b @@ -166,8 +176,9 @@ define i64 @f8(i64 %a, i64 %b, 
float %f) { define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f9: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: ber %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: ber %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrne %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oeq float %f1, %f2 @@ -179,8 +190,9 @@ define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f10: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp one float %f1, %f2 @@ -192,8 +204,9 @@ define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f11: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp olt float %f1, %f2 @@ -205,8 +218,9 @@ define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f12: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ole float %f1, %f2 @@ -218,8 +232,9 @@ define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f13: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnle %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp oge float %f1, %f2 @@ -231,8 +246,9 @@ define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f14: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: blr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: blr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ogt float %f1, %f2 @@ -244,8 +260,9 @@ define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f15: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrlh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ueq float %f1, %f2 @@ -257,8 +274,9 @@ define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f16: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bner %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bner %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgre %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp une float %f1, %f2 @@ -270,8 +288,9 @@ define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f17: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnler %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnler %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrle %r2, %r3 ; CHECK: br 
%r14 %f1 = load float , float *%ptr %cond = fcmp ult float %f1, %f2 @@ -283,8 +302,9 @@ define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f18: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnlr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnlr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrl %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ule float %f1, %f2 @@ -296,8 +316,9 @@ define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f19: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnhr %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnhr %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrh %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp uge float %f1, %f2 @@ -309,8 +330,9 @@ define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) { define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) { ; CHECK-LABEL: f20: ; CHECK: ceb %f0, 0(%r4) -; CHECK-NEXT: bnher %r14 -; CHECK: lgr %r2, %r3 +; CHECK-SCALAR-NEXT: bnher %r14 +; CHECK-SCALAR: lgr %r2, %r3 +; CHECK-VECTOR-NEXT: locgrhe %r2, %r3 ; CHECK: br %r14 %f1 = load float , float *%ptr %cond = fcmp ugt float %f1, %f2 diff --git a/test/CodeGen/SystemZ/fp-cmp-06.ll b/test/CodeGen/SystemZ/fp-cmp-06.ll new file mode 100644 index 000000000000..e146b51e4fb2 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-cmp-06.ll @@ -0,0 +1,33 @@ +; Test f128 comparisons on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; There is no memory form of 128-bit comparison. +define i64 @f1(i64 %a, i64 %b, fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r5) +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %cond = fcmp oeq fp128 %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Check comparison with zero -- it is not worthwhile to copy to +; FP pairs just so we can use LTXBR, so simply load up a zero. +define i64 @f2(i64 %a, i64 %b, fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r4) +; CHECK-DAG: vzero [[REG2:%v[0-9]+]] +; CHECK: wfcxb [[REG1]], [[REG2]] +; CHECK-NEXT: locgrne %r2, %r3 +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %cond = fcmp oeq fp128 %f, 0xL00000000000000000000000000000000 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} diff --git a/test/CodeGen/SystemZ/fp-const-11.ll b/test/CodeGen/SystemZ/fp-const-11.ll new file mode 100644 index 000000000000..8523f2786c34 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-const-11.ll @@ -0,0 +1,40 @@ +; Test loads of f128 floating-point constants on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefix=CONST + +; Test loading zero. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000000000000000000000, fp128 *%x + ret void +} + +; Test loading of negative floating-point zero. 
+define void @f2(fp128 *%x) { +; CHECK-LABEL: f2: +; CHECK: vzero [[REG:%v[0-9]+]] +; CHECK: wflnxb [[REG]], [[REG]] +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 + store fp128 0xL00000000000000008000000000000000, fp128 *%x + ret void +} + +; Test loading of a 128-bit floating-point constant. This value would +; actually fit within the 32-bit format, but we don't have extending +; loads into vector registers. +define void @f3(fp128 *%x) { +; CHECK-LABEL: f3: +; CHECK: larl [[REGISTER:%r[1-5]+]], {{.*}} +; CHECK: vl [[REG:%v[0-9]+]], 0([[REGISTER]]) +; CHECK: vst [[REG]], 0(%r2) +; CHECK: br %r14 +; CONST: .quad 4611404543484231680 +; CONST: .quad 0 + store fp128 0xL00000000000000003fff000002000000, fp128 *%x + ret void +} diff --git a/test/CodeGen/SystemZ/fp-conv-15.ll b/test/CodeGen/SystemZ/fp-conv-15.ll new file mode 100644 index 000000000000..61100016c426 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-15.ll @@ -0,0 +1,50 @@ +; Test f128 floating-point truncations/extensions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f128->f64. +define double @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to double + ret double %res +} + +; Test f128->f32. +define float @f2(fp128 *%ptr) { +; CHECK-LABEL: f2: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wflrx %f0, [[REG]], 0, 3 +; CHECK: ledbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %val = load fp128, fp128 *%ptr + %res = fptrunc fp128 %val to float + ret float %res +} + +; Test f64->f128. +define void @f3(fp128 *%dst, double %val) { +; CHECK-LABEL: f3: +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext double %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + +; Test f32->f128. +define void @f4(fp128 *%dst, float %val) { +; CHECK-LABEL: f4: +; CHECK: ldebr %f0, %f0 +; CHECK: wflld [[RES:%v[0-9]+]], %f0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %res = fpext float %val to fp128 + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-conv-16.ll b/test/CodeGen/SystemZ/fp-conv-16.ll new file mode 100644 index 000000000000..4f9bb865694b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-conv-16.ll @@ -0,0 +1,99 @@ +; Test f128 floating-point conversion to/from integers on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test signed i32->f128. +define void @f1(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK: cxfbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed i64->f128. +define void @f2(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK: cxgbr %f0, %r2 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = sitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i32->f128. +define void @f3(i32 %i, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK: cxlfbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i32 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test unsigned i64->f128. 
+define void @f4(i64 %i, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK: cxlgbr %f0, 0, %r2, 0 +; CHECK: vmrhg %v0, %v0, %v2 +; CHECK: vst %v0, 0(%r3) +; CHECK: br %r14 + %conv = uitofp i64 %i to fp128 + store fp128 %conv, fp128 *%dst + ret void +} + +; Test signed f128->i32. +define i32 @f5(fp128 *%src) { +; CHECK-LABEL: f5: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cfxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i32 + ret i32 %conv +} + +; Test signed f128->i64. +define i64 @f6(fp128 *%src) { +; CHECK-LABEL: f6: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: cgxbr %r2, 5, %f0 +; CHECK: br %r14 + %f = load fp128, fp128 *%src + %conv = fptosi fp128 %f to i64 + ret i64 %conv +} + +; Test unsigned f128->i32. +define i32 @f7(fp128 *%src) { +; CHECK-LABEL: f7: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clfxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i32 + ret i32 %conv +} + +; Test unsigned f128->i64. +define i64 @f8(fp128 *%src) { +; CHECK-LABEL: f8: +; CHECK: vl %v0, 0(%r2) +; CHECK: vrepg %v2, %v0, 1 +; CHECK: clgxbr %r2, 5, %f0, 0 +; CHECK: br %r14 + %f = load fp128 , fp128 *%src + %conv = fptoui fp128 %f to i64 + ret i64 %conv +} diff --git a/test/CodeGen/SystemZ/fp-copysign-02.ll b/test/CodeGen/SystemZ/fp-copysign-02.ll new file mode 100644 index 000000000000..657c0e18767b --- /dev/null +++ b/test/CodeGen/SystemZ/fp-copysign-02.ll @@ -0,0 +1,81 @@ +; Test f128 copysign operations on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @copysignf(float, float) readnone +declare double @copysign(double, double) readnone +; FIXME: not really the correct prototype for SystemZ. +declare fp128 @copysignl(fp128, fp128) readnone + +; Test f32 copies in which the sign comes from an f128. +define float @f1(float %a, fp128 *%bptr) { +; CHECK-LABEL: f1: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to float + %res = call float @copysignf(float %a, float %b) readnone + ret float %res +} + +; Test f64 copies in which the sign comes from an f128. +define double @f2(double %a, fp128 *%bptr) { +; CHECK-LABEL: f2: +; CHECK: vl %v[[REG:[0-9]+]], 0(%r2) +; CHECK: cpsdr %f0, %f[[REG]], %f0 +; CHECK: br %r14 + %bl = load volatile fp128, fp128 *%bptr + %b = fptrunc fp128 %bl to double + %res = call double @copysign(double %a, double %b) readnone + ret double %res +} + +; Test f128 copies in which the sign comes from an f32. +define void @f7(fp128 *%cptr, fp128 *%aptr, float %bf) { +; CHECK-LABEL: f7: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmlh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext float %bf to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f64. 
+define void @f8(fp128 *%cptr, fp128 *%aptr, double %bd) { +; CHECK-LABEL: f8: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: tmhh +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = fpext double %bd to fp128 + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} + +; As above, but the sign comes from an f128. +define void @f9(fp128 *%cptr, fp128 *%aptr, fp128 *%bptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG1:%v[0-7]+]], 0(%r3) +; CHECK: vl [[REG2:%v[0-7]+]], 0(%r4) +; CHECK: tm +; CHECK: wflnxb [[REG1]], [[REG1]] +; CHECK: wflpxb [[REG1]], [[REG1]] +; CHECK: vst [[REG1]], 0(%r2) +; CHECK: br %r14 + %a = load volatile fp128, fp128 *%aptr + %b = load volatile fp128, fp128 *%bptr + %c = call fp128 @copysignl(fp128 %a, fp128 %b) readnone + store fp128 %c, fp128 *%cptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-div-01.ll b/test/CodeGen/SystemZ/fp-div-01.ll index 0791e8db93f8..ee514dc474e9 100644 --- a/test/CodeGen/SystemZ/fp-div-01.ll +++ b/test/CodeGen/SystemZ/fp-div-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point division. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: deb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: deb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-div-04.ll b/test/CodeGen/SystemZ/fp-div-04.ll new file mode 100644 index 000000000000..54e87f46c84a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-div-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point division on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfdxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fdiv fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/fp-move-13.ll b/test/CodeGen/SystemZ/fp-move-13.ll new file mode 100644 index 000000000000..d6c53eaceeef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-move-13.ll @@ -0,0 +1,46 @@ +; Test f128 moves on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; VR-to-VR moves. Since f128s are passed by reference, +; we need to force a copy by other means. +define void @f1(fp128 *%x) { +; CHECK-LABEL: f1: +; CHECK: vlr +; CHECK: vleig +; CHECK: br %r14 + %val = load volatile fp128 , fp128 *%x + %t1 = bitcast fp128 %val to <2 x i64> + %t2 = insertelement <2 x i64> %t1, i64 0, i32 0 + %res = bitcast <2 x i64> %t2 to fp128 + store volatile fp128 %res, fp128 *%x + store volatile fp128 %val, fp128 *%x + ret void +} + +; Test 128-bit moves from GPRs to VRs. i128 isn't a legitimate type, +; so this goes through memory. 
+define void @f2(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f2: +; CHECK: lg +; CHECK: lg +; CHECK: stg +; CHECK: stg +; CHECK: br %r14 + %val = load i128 , i128 *%b + %res = bitcast i128 %val to fp128 + store fp128 %res, fp128 *%a + ret void +} + +; Test 128-bit moves from VRs to GPRs, with the same restriction as f2. +define void @f3(fp128 *%a, i128 *%b) { +; CHECK-LABEL: f3: +; CHECK: vl +; CHECK: vst + %val = load fp128 , fp128 *%a + %res = bitcast fp128 %val to i128 + store i128 %res, i128 *%b + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-01.ll b/test/CodeGen/SystemZ/fp-mul-01.ll index 3b72d25e0b5c..126567b218ab 100644 --- a/test/CodeGen/SystemZ/fp-mul-01.ll +++ b/test/CodeGen/SystemZ/fp-mul-01.ll @@ -1,6 +1,8 @@ ; Test multiplication of two f32s, producing an f32 result. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: meeb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: meeb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-mul-06.ll b/test/CodeGen/SystemZ/fp-mul-06.ll index 896fafecbdaf..581e44eeaa2f 100644 --- a/test/CodeGen/SystemZ/fp-mul-06.ll +++ b/test/CodeGen/SystemZ/fp-mul-06.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: maebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: maebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmasb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) ret float %res @@ -14,7 +18,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) @@ -24,7 +29,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: maeb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -39,7 +45,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: maeb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -54,7 +61,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: maeb %f2, %f0, 0(%r2) -; 
CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , float *%ptr @@ -66,7 +74,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -78,7 +87,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -92,7 +102,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: maeb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-08.ll b/test/CodeGen/SystemZ/fp-mul-08.ll index 5e5538bfacc9..5b1f9b96c089 100644 --- a/test/CodeGen/SystemZ/fp-mul-08.ll +++ b/test/CodeGen/SystemZ/fp-mul-08.ll @@ -1,11 +1,15 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s declare float @llvm.fma.f32(float %f1, float %f2, float %f3) define float @f1(float %f1, float %f2, float %acc) { ; CHECK-LABEL: f1: -; CHECK: msebr %f4, %f0, %f2 -; CHECK: ler %f0, %f4 +; CHECK-SCALAR: msebr %f4, %f0, %f2 +; CHECK-SCALAR: ler %f0, %f4 +; CHECK-VECTOR: wfmssb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %negacc = fsub float -0.0, %acc %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) @@ -15,7 +19,8 @@ define float @f1(float %f1, float %f2, float %acc) { define float @f2(float %f1, float *%ptr, float %acc) { ; CHECK-LABEL: f2: ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %f2 = load float , float *%ptr %negacc = fsub float -0.0, %acc @@ -26,7 +31,8 @@ define float @f2(float %f1, float *%ptr, float %acc) { define float @f3(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f3: ; CHECK: mseb %f2, %f0, 4092(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %f2 = load float , float *%ptr @@ -42,7 +48,8 @@ define float @f4(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1024 %f2 = load float , float *%ptr @@ -58,7 +65,8 @@ define float @f5(float %f1, float *%base, float %acc) { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: mseb %f2, %f0, 0(%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 -1 %f2 = load float , 
float *%ptr @@ -71,7 +79,8 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 0(%r1,%r2) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 %index %f2 = load float , float *%ptr @@ -84,7 +93,8 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1023 %ptr = getelementptr float, float *%base, i64 %index2 @@ -99,7 +109,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) ; CHECK: mseb %f2, %f0, 0(%r1) -; CHECK: ler %f0, %f2 +; CHECK-SCALAR: ler %f0, %f2 +; CHECK-VECTOR: ldr %f0, %f2 ; CHECK: br %r14 %index2 = add i64 %index, 1024 %ptr = getelementptr float, float *%base, i64 %index2 diff --git a/test/CodeGen/SystemZ/fp-mul-10.ll b/test/CodeGen/SystemZ/fp-mul-10.ll new file mode 100644 index 000000000000..c23a6a202ad5 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-10.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare double @llvm.fma.f64(double %f1, double %f2, double %f3) +declare float @llvm.fma.f32(float %f1, float %f2, float %f3) + +define double @f1(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f1: +; CHECK: wfnmadb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define double @f2(double %f1, double %f2, double %acc) { +; CHECK-LABEL: f2: +; CHECK: wfnmsdb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub double -0.0, %acc + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) + %negres = fsub double -0.0, %res + ret double %negres +} + +define float @f3(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f3: +; CHECK: wfnmasb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc) + %negres = fsub float -0.0, %res + ret float %negres +} + +define float @f4(float %f1, float %f2, float %acc) { +; CHECK-LABEL: f4: +; CHECK: wfnmssb %f0, %f0, %f2, %f4 +; CHECK: br %r14 + %negacc = fsub float -0.0, %acc + %res = call float @llvm.fma.f32 (float %f1, float %f2, float %negacc) + %negres = fsub float -0.0, %res + ret float %negres +} + diff --git a/test/CodeGen/SystemZ/fp-mul-11.ll b/test/CodeGen/SystemZ/fp-mul-11.ll new file mode 100644 index 000000000000..ef45bf184a4c --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-11.ll @@ -0,0 +1,32 @@ +; Test 128-bit floating-point multiplication on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fmul fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} + +define void @f2(double %f1, double %f2, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: wflld [[REG1:%v[0-9]+]], %f0 +; CHECK-DAG: wflld [[REG2:%v[0-9]+]], %f2 +; CHECK: wfmxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1x = fpext double %f1 to fp128 + %f2x = fpext double %f2 to fp128 + %res = fmul fp128 %f1x, %f2x + store fp128 %res, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-mul-12.ll b/test/CodeGen/SystemZ/fp-mul-12.ll new file mode 100644 index 000000000000..331f9a30c274 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-mul-12.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.fma.f128(fp128 %f1, fp128 %f2, fp128 %f3) + +define void @f1(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f2(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f2: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %neg) + store fp128 %res, fp128 *%dst + ret void +} + +define void @f3(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmaxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, fp128 %f2, fp128 %f3) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + +define void @f4(fp128 *%ptr1, fp128 *%ptr2, fp128 *%ptr3, fp128 *%dst) { +; CHECK-LABEL: f4: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: vl [[REG3:%v[0-9]+]], 0(%r4) +; CHECK: wfnmsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], [[REG3]] +; CHECK: vst [[RES]], 0(%r5) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %f3 = load fp128, fp128 *%ptr3 + %neg = fsub fp128 0xL00000000000000008000000000000000, %f3 + %res = call fp128 @llvm.fma.f128 (fp128 %f1, 
fp128 %f2, fp128 %neg) + %negres = fsub fp128 0xL00000000000000008000000000000000, %res + store fp128 %negres, fp128 *%dst + ret void +} + diff --git a/test/CodeGen/SystemZ/fp-neg-02.ll b/test/CodeGen/SystemZ/fp-neg-02.ll new file mode 100644 index 000000000000..38fb3a58d404 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-neg-02.ll @@ -0,0 +1,41 @@ +; Test floating-point negation on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test f32. +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub float -0.0, %f + ret float %res +} + +; Test f64. +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: lcdfr %f0, %f0 +; CHECK: br %r14 + %res = fsub double -0.0, %f + ret double %res +} + +; Test f128. With the loads and stores, a pure negation would probably +; be better implemented using an XI on the upper byte. Do some extra +; processing so that using FPRs is unequivocally better. +define void @f3(fp128 *%ptr, fp128 *%ptr2) { +; CHECK-LABEL: f3: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK-DAG: wflcxb [[NEGREG1:%v[0-9]+]], [[REG1]] +; CHECK: wfdxb [[RES:%v[0-9]+]], [[NEGREG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %orig = load fp128 , fp128 *%ptr + %negzero = fpext float -0.0 to fp128 + %neg = fsub fp128 0xL00000000000000008000000000000000, %orig + %op2 = load fp128 , fp128 *%ptr2 + %res = fdiv fp128 %neg, %op2 + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-round-03.ll b/test/CodeGen/SystemZ/fp-round-03.ll new file mode 100644 index 000000000000..762e793701d1 --- /dev/null +++ b/test/CodeGen/SystemZ/fp-round-03.ll @@ -0,0 +1,207 @@ +; Test rounding functions for z14 and above. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test rint for f32. +declare float @llvm.rint.f32(float %f) +define float @f1(float %f) { +; CHECK-LABEL: f1: +; CHECK: fiebra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call float @llvm.rint.f32(float %f) + ret float %res +} + +; Test rint for f64. +declare double @llvm.rint.f64(double %f) +define double @f2(double %f) { +; CHECK-LABEL: f2: +; CHECK: fidbra %f0, 0, %f0, 0 +; CHECK: br %r14 + %res = call double @llvm.rint.f64(double %f) + ret double %res +} + +; Test rint for f128. +declare fp128 @llvm.rint.f128(fp128 %f) +define void @f3(fp128 *%ptr) { +; CHECK-LABEL: f3: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 0, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.rint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test nearbyint for f32. +declare float @llvm.nearbyint.f32(float %f) +define float @f4(float %f) { +; CHECK-LABEL: f4: +; CHECK: fiebra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.nearbyint.f32(float %f) + ret float %res +} + +; Test nearbyint for f64. +declare double @llvm.nearbyint.f64(double %f) +define double @f5(double %f) { +; CHECK-LABEL: f5: +; CHECK: fidbra %f0, 0, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.nearbyint.f64(double %f) + ret double %res +} + +; Test nearbyint for f128. 
+declare fp128 @llvm.nearbyint.f128(fp128 %f) +define void @f6(fp128 *%ptr) { +; CHECK-LABEL: f6: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 0 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.nearbyint.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test floor for f32. +declare float @llvm.floor.f32(float %f) +define float @f7(float %f) { +; CHECK-LABEL: f7: +; CHECK: fiebra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.floor.f32(float %f) + ret float %res +} + +; Test floor for f64. +declare double @llvm.floor.f64(double %f) +define double @f8(double %f) { +; CHECK-LABEL: f8: +; CHECK: fidbra %f0, 7, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.floor.f64(double %f) + ret double %res +} + +; Test floor for f128. +declare fp128 @llvm.floor.f128(fp128 %f) +define void @f9(fp128 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 7 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.floor.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test ceil for f32. +declare float @llvm.ceil.f32(float %f) +define float @f10(float %f) { +; CHECK-LABEL: f10: +; CHECK: fiebra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.ceil.f32(float %f) + ret float %res +} + +; Test ceil for f64. +declare double @llvm.ceil.f64(double %f) +define double @f11(double %f) { +; CHECK-LABEL: f11: +; CHECK: fidbra %f0, 6, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.ceil.f64(double %f) + ret double %res +} + +; Test ceil for f128. +declare fp128 @llvm.ceil.f128(fp128 %f) +define void @f12(fp128 *%ptr) { +; CHECK-LABEL: f12: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 6 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.ceil.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test trunc for f32. +declare float @llvm.trunc.f32(float %f) +define float @f13(float %f) { +; CHECK-LABEL: f13: +; CHECK: fiebra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.trunc.f32(float %f) + ret float %res +} + +; Test trunc for f64. +declare double @llvm.trunc.f64(double %f) +define double @f14(double %f) { +; CHECK-LABEL: f14: +; CHECK: fidbra %f0, 5, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.trunc.f64(double %f) + ret double %res +} + +; Test trunc for f128. +declare fp128 @llvm.trunc.f128(fp128 %f) +define void @f15(fp128 *%ptr) { +; CHECK-LABEL: f15: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 5 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.trunc.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} + +; Test round for f32. +declare float @llvm.round.f32(float %f) +define float @f16(float %f) { +; CHECK-LABEL: f16: +; CHECK: fiebra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call float @llvm.round.f32(float %f) + ret float %res +} + +; Test round for f64. +declare double @llvm.round.f64(double %f) +define double @f17(double %f) { +; CHECK-LABEL: f17: +; CHECK: fidbra %f0, 1, %f0, 4 +; CHECK: br %r14 + %res = call double @llvm.round.f64(double %f) + ret double %res +} + +; Test round for f128. 
+declare fp128 @llvm.round.f128(fp128 %f) +define void @f18(fp128 *%ptr) { +; CHECK-LABEL: f18: +; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 1 +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %src = load fp128 , fp128 *%ptr + %res = call fp128 @llvm.round.f128(fp128 %src) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll index 3680207e7f20..85a46bc2d7fc 100644 --- a/test/CodeGen/SystemZ/fp-sqrt-01.ll +++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit square root. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @llvm.sqrt.f32(float) declare float @sqrtf(float) @@ -77,7 +79,7 @@ define float @f6(float *%base, i64 %index) { ; to use SQEB if possible. define void @f7(float *%ptr) { ; CHECK-LABEL: f7: -; CHECK: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) +; CHECK-SCALAR: sqeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ; CHECK: br %r14 %val0 = load volatile float , float *%ptr %val1 = load volatile float , float *%ptr @@ -160,7 +162,7 @@ define float @f8(float %dummy, float %val) { ; CHECK: sqebr %f0, %f2 ; CHECK: cebr %f0, %f0 ; CHECK: bnor %r14 -; CHECK: ler %f0, %f2 +; CHECK: {{ler|ldr}} %f0, %f2 ; CHECK: jg sqrtf@PLT %res = tail call float @sqrtf(float %val) ret float %res diff --git a/test/CodeGen/SystemZ/fp-sqrt-04.ll b/test/CodeGen/SystemZ/fp-sqrt-04.ll new file mode 100644 index 000000000000..e0fb2569b39a --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sqrt-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point square root on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare fp128 @llvm.sqrt.f128(fp128 %f) + +define void @f1(fp128 *%ptr) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG:%v[0-9]+]], 0(%r2) +; CHECK: wfsqxb [[RES:%v[0-9]+]], [[REG]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f = load fp128, fp128 *%ptr + %res = call fp128 @llvm.sqrt.f128(fp128 %f) + store fp128 %res, fp128 *%ptr + ret void +} diff --git a/test/CodeGen/SystemZ/fp-sub-01.ll b/test/CodeGen/SystemZ/fp-sub-01.ll index f4185ca3108d..41f72e1810e9 100644 --- a/test/CodeGen/SystemZ/fp-sub-01.ll +++ b/test/CodeGen/SystemZ/fp-sub-01.ll @@ -1,6 +1,8 @@ ; Test 32-bit floating-point subtraction. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ +; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare float @foo() @@ -76,7 +78,7 @@ define float @f6(float %f1, float *%base, i64 %index) { define float @f7(float *%ptr0) { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT -; CHECK: seb %f0, 16{{[04]}}(%r15) +; CHECK-SCALAR: seb %f0, 16{{[04]}}(%r15) ; CHECK: br %r14 %ptr1 = getelementptr float, float *%ptr0, i64 2 %ptr2 = getelementptr float, float *%ptr0, i64 4 diff --git a/test/CodeGen/SystemZ/fp-sub-04.ll b/test/CodeGen/SystemZ/fp-sub-04.ll new file mode 100644 index 000000000000..5f88132664ef --- /dev/null +++ b/test/CodeGen/SystemZ/fp-sub-04.ll @@ -0,0 +1,17 @@ +; Test 128-bit floating-point subtraction on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +; CHECK-LABEL: f1: +; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) +; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) +; CHECK: wfsxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]] +; CHECK: vst [[RES]], 0(%r2) +; CHECK: br %r14 + %f1 = load fp128, fp128 *%ptr1 + %f2 = load fp128, fp128 *%ptr2 + %sum = fsub fp128 %f1, %f2 + store fp128 %sum, fp128 *%ptr1 + ret void +} diff --git a/test/CodeGen/SystemZ/int-add-17.ll b/test/CodeGen/SystemZ/int-add-17.ll new file mode 100644 index 000000000000..fd245871c652 --- /dev/null +++ b/test/CodeGen/SystemZ/int-add-17.ll @@ -0,0 +1,95 @@ +; Test additions between an i64 and a sign-extended i16 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check AGH with no displacement. +define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the aligned AGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: agh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the high end of the negative aligned AGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: agh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the low end of the AGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: agh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: agh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + +; Check that AGH allows an index. +define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: agh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %add = add i64 %a, %bext + ret i64 %add +} + diff --git a/test/CodeGen/SystemZ/int-mul-09.ll b/test/CodeGen/SystemZ/int-mul-09.ll new file mode 100644 index 000000000000..3e384e72db5d --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-09.ll @@ -0,0 +1,95 @@ +; Test multiplications between an i64 and a sign-extended i16 on z14. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check MGH with no displacement. +define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the aligned MGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: mgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the high end of the negative aligned MGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: mgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the low end of the MGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: mgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %a, i16 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r3, -524290 +; CHECK: mgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262145 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + +; Check that MGH allows an index. +define i64 @f7(i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: mgh %r2, 524284({{%r4,%r3|%r3,%r4}}) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524284 + %ptr = inttoptr i64 %add2 to i16 * + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-mul-10.ll b/test/CodeGen/SystemZ/int-mul-10.ll new file mode 100644 index 000000000000..a4d80af36a3c --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-10.ll @@ -0,0 +1,165 @@ +; Test signed high-part i64->i128 multiplications on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check sign-extended multiplication in which only the high part is used. +define i64 @f1(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which only part of the high half +; is used. 
+define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk [[REG:%r[0-9]+]], %r3, %r4 +; CHECK: srlg %r2, [[REG]], 3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 67 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which the result is split into +; high and low halves. +define i64 @f3(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f3: +; CHECK-NOT: {{%r[234]}} +; CHECK: mgrk %r2, %r3, %r4 +; CHECK: ogr %r2, %r3 +; CHECK: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + %low = trunc i128 %mulx to i64 + %or = or i64 %high, %low + ret i64 %or +} + +; Check MG with no displacement. +define i64 @f4(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f4: +; CHECK-NOT: {{%r[234]}} +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %b = load i64 , i64 *%src + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the aligned MG range. +define i64 @f5(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f5: +; CHECK: mg %r2, 524280(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65535 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword up, which requires separate address logic. +; Other sequences besides this one would be OK. +define i64 @f6(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r4, 524288 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the negative aligned MG range. +define i64 @f7(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f7: +; CHECK: mg %r2, -8(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -1 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the low end of the MG range. +define i64 @f8(i64 %dummy, i64 %a, i64 *%src) { +; CHECK-LABEL: f8: +; CHECK: mg %r2, -524288(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65536 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f9(i64 *%dest, i64 %a, i64 *%src) { +; CHECK-LABEL: f9: +; CHECK: agfi %r4, -524296 +; CHECK: mg %r2, 0(%r4) +; CHECK: br %r14 + %ptr = getelementptr i64, i64 *%src, i64 -65537 + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check that MG allows an index. 
+define i64 @f10(i64 *%dest, i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f10: +; CHECK: mg %r2, 524287(%r5,%r4) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to i64 * + %b = load i64 , i64 *%ptr + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + diff --git a/test/CodeGen/SystemZ/int-mul-11.ll b/test/CodeGen/SystemZ/int-mul-11.ll new file mode 100644 index 000000000000..f26251982518 --- /dev/null +++ b/test/CodeGen/SystemZ/int-mul-11.ll @@ -0,0 +1,32 @@ +; Test three-operand multiplication instructions on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Check MSRKC. +define i32 @f1(i32 %dummy, i32 %a, i32 %b) { +; CHECK-LABEL: f1: +; CHECK: msrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i32 %a, %b + ret i32 %mul +} + +; Check MSGRKC. +define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK: msgrkc %r2, %r3, %r4 +; CHECK: br %r14 + %mul = mul i64 %a, %b + ret i64 %mul +} + +; Verify that we still use MSGFR for i32->i64 multiplies. +define i64 @f3(i64 %a, i32 %b) { +; CHECK-LABEL: f3: +; CHECK: msgfr %r2, %r3 +; CHECK: br %r14 + %bext = sext i32 %b to i64 + %mul = mul i64 %a, %bext + ret i64 %mul +} + diff --git a/test/CodeGen/SystemZ/int-sub-10.ll b/test/CodeGen/SystemZ/int-sub-10.ll new file mode 100644 index 000000000000..bf6638575e55 --- /dev/null +++ b/test/CodeGen/SystemZ/int-sub-10.ll @@ -0,0 +1,95 @@ +; Test subtractions of a sign-extended i16 from an i64 on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i64 @foo() + +; Check SGH with no displacement. +define i64 @f1(i64 %a, i16 *%src) { +; CHECK-LABEL: f1: +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %b = load i16, i16 *%src + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the aligned SGH range. +define i64 @f2(i64 %a, i16 *%src) { +; CHECK-LABEL: f2: +; CHECK: sgh %r2, 524286(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262143 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f3(i64 %a, i16 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r3, 524288 +; CHECK: sgh %r2, 0(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the high end of the negative aligned SGH range. +define i64 @f4(i64 %a, i16 *%src) { +; CHECK-LABEL: f4: +; CHECK: sgh %r2, -2(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -1 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the low end of the SGH range. +define i64 @f5(i64 %a, i16 *%src) { +; CHECK-LABEL: f5: +; CHECK: sgh %r2, -524288(%r3) +; CHECK: br %r14 + %ptr = getelementptr i16, i16 *%src, i64 -262144 + %b = load i16, i16 *%ptr + %bext = sext i16 %b to i64 + %sub = sub i64 %a, %bext + ret i64 %sub +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define i64 @f6(i64 %a, i16 *%src) {
+; CHECK-LABEL: f6:
+; CHECK: agfi %r3, -524290
+; CHECK: sgh %r2, 0(%r3)
+; CHECK: br %r14
+  %ptr = getelementptr i16, i16 *%src, i64 -262145
+  %b = load i16, i16 *%ptr
+  %bext = sext i16 %b to i64
+  %sub = sub i64 %a, %bext
+  ret i64 %sub
+}
+
+; Check that SGH allows an index.
+define i64 @f7(i64 %a, i64 %src, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: sgh %r2, 524284({{%r4,%r3|%r3,%r4}})
+; CHECK: br %r14
+  %add1 = add i64 %src, %index
+  %add2 = add i64 %add1, 524284
+  %ptr = inttoptr i64 %add2 to i16 *
+  %b = load i16, i16 *%ptr
+  %bext = sext i16 %b to i64
+  %sub = sub i64 %a, %bext
+  ret i64 %sub
+}
+
diff --git a/test/CodeGen/SystemZ/tdc-07.ll b/test/CodeGen/SystemZ/tdc-07.ll
new file mode 100644
index 000000000000..6651410e7c66
--- /dev/null
+++ b/test/CodeGen/SystemZ/tdc-07.ll
@@ -0,0 +1,18 @@
+; Test the Test Data Class instruction on z14
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare i32 @llvm.s390.tdc.f128(fp128, i64)
+
+; Check using as i32 - f128
+define i32 @f3(fp128 %x) {
+; CHECK-LABEL: f3
+; CHECK: vl %v0, 0(%r2)
+; CHECK: vrepg %v2, %v0, 1
+; CHECK: tcxb %f0, 123
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+  %res = call i32 @llvm.s390.tdc.f128(fp128 %x, i64 123)
+  ret i32 %res
+}
+
diff --git a/test/CodeGen/SystemZ/vec-abs-06.ll b/test/CodeGen/SystemZ/vec-abs-06.ll
new file mode 100644
index 000000000000..8eee1d9d2507
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-abs-06.ll
@@ -0,0 +1,47 @@
+; Test f32 and v4f32 absolute on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare float @llvm.fabs.f32(float)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+
+; Test a plain absolute.
+define <4 x float> @f1(<4 x float> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vflpsb %v24, %v24
+; CHECK: br %r14
+  %ret = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val)
+  ret <4 x float> %ret
+}
+
+; Test a negative absolute.
+define <4 x float> @f2(<4 x float> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vflnsb %v24, %v24
+; CHECK: br %r14
+  %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %val)
+  %ret = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %abs
+  ret <4 x float> %ret
+}
+
+; Test an f32 absolute that uses vector registers.
+define float @f3(<4 x float> %val) {
+; CHECK-LABEL: f3:
+; CHECK: wflpsb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %ret = call float @llvm.fabs.f32(float %scalar)
+  ret float %ret
+}
+
+; Test an f32 negative absolute that uses vector registers.
+define float @f4(<4 x float> %val) {
+; CHECK-LABEL: f4:
+; CHECK: wflnsb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %abs = call float @llvm.fabs.f32(float %scalar)
+  %ret = fsub float -0.0, %abs
+  ret float %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-add-02.ll b/test/CodeGen/SystemZ/vec-add-02.ll
new file mode 100644
index 000000000000..97a9b84a063c
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-add-02.ll
@@ -0,0 +1,24 @@
+; Test vector addition on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v4f32 addition.
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vfasb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = fadd <4 x float> %val1, %val2
+  ret <4 x float> %ret
+}
+
+; Test an f32 addition that uses vector registers.
+define float @f2(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: wfasb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <4 x float> %val1, i32 0
+  %scalar2 = extractelement <4 x float> %val2, i32 0
+  %ret = fadd float %scalar1, %scalar2
+  ret float %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-and-04.ll b/test/CodeGen/SystemZ/vec-and-04.ll
new file mode 100644
index 000000000000..e9355beb4296
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-and-04.ll
@@ -0,0 +1,47 @@
+; Test vector NAND on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v16i8 NAND.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vnn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <16 x i8> %val1, %val2
+  %not = xor <16 x i8> %ret, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  ret <16 x i8> %not
+}
+
+; Test a v8i16 NAND.
+define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: vnn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <8 x i16> %val1, %val2
+  %not = xor <8 x i16> %ret, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  ret <8 x i16> %not
+}
+
+; Test a v4i32 NAND.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vnn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <4 x i32> %val1, %val2
+  %not = xor <4 x i32> %ret, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %not
+}
+
+; Test a v2i64 NAND.
+define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vnn %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = and <2 x i64> %val1, %val2
+  %not = xor <2 x i64> %ret, <i64 -1, i64 -1>
+  ret <2 x i64> %not
+}
diff --git a/test/CodeGen/SystemZ/vec-cmp-07.ll b/test/CodeGen/SystemZ/vec-cmp-07.ll
new file mode 100644
index 000000000000..f272ba4bd755
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-cmp-07.ll
@@ -0,0 +1,349 @@
+; Test f32 and v4f32 comparisons on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test oeq.
+define <4 x i32> @f1(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vfcesb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oeq <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test one.
+define <4 x i32> @f2(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26
+; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28
+; CHECK: vo %v24, [[REG1]], [[REG2]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp one <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ogt.
+define <4 x i32> @f3(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: vfchsb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test oge.
+define <4 x i32> @f4(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: vfchesb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oge <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ole.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vfchesb %v24, %v28, %v26
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ole <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test olt.
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: vfchsb %v24, %v28, %v26
+; CHECK-NEXT: br %r14
+  %cmp = fcmp olt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ueq.
+define <4 x i32> @f7(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f7:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26
+; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v26, %v28
+; CHECK: vno %v24, [[REG1]], [[REG2]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ueq <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test une.
+define <4 x i32> @f8(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vfcesb [[REG:%v[0-9]+]], %v26, %v28
+; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp une <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ugt.
+define <4 x i32> @f9(<4 x i32> %dummy, <4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v28, %v26
+; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ugt <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test uge.
+define <4 x i32> @f10(<4 x i32> %dummy, <4 x float> %val1,
+                      <4 x float> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v28, %v26
+; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uge <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ule.
+define <4 x i32> @f11(<4 x i32> %dummy, <4 x float> %val1,
+                      <4 x float> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v28
+; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ule <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ult.
+define <4 x i32> @f12(<4 x i32> %dummy, <4 x float> %val1,
+                      <4 x float> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v28
+; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ult <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test ord.
+define <4 x i32> @f13(<4 x i32> %dummy, <4 x float> %val1,
+                      <4 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26
+; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28
+; CHECK: vo %v24, [[REG1]], [[REG2]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ord <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test uno.
+define <4 x i32> @f14(<4 x i32> %dummy, <4 x float> %val1,
+                      <4 x float> %val2) {
+; CHECK-LABEL: f14:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v28, %v26
+; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v26, %v28
+; CHECK: vno %v24, [[REG1]], [[REG2]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uno <4 x float> %val1, %val2
+  %ret = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test oeq selects.
+define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f15:
+; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oeq <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test one selects.
+define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f16:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24
+; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26
+; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp one <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ogt selects.
+define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f17:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test oge selects.
+define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f18:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp oge <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ole selects.
+define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f19:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ole <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test olt selects.
+define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f20:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp olt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ueq selects.
+define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f21:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24
+; CHECK-DAG: vfchsb [[REG2:%v[0-9]+]], %v24, %v26
+; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ueq <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test une selects.
+define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f22:
+; CHECK: vfcesb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp une <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ugt selects.
+define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f23:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v26, %v24
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ugt <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test uge selects.
+define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f24:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v26, %v24
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uge <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ule selects.
+define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f25:
+; CHECK: vfchsb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ule <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ult selects.
+define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f26:
+; CHECK: vfchesb [[REG:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ult <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test ord selects.
+define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f27:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24
+; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26
+; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp ord <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test uno selects.
+define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2,
+                        <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: f28:
+; CHECK-DAG: vfchsb [[REG1:%v[0-9]+]], %v26, %v24
+; CHECK-DAG: vfchesb [[REG2:%v[0-9]+]], %v24, %v26
+; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]]
+; CHECK-NEXT: br %r14
+  %cmp = fcmp uno <4 x float> %val1, %val2
+  %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %ret
+}
+
+; Test an f32 comparison that uses vector registers.
+define i64 @f29(i64 %a, i64 %b, float %f1, <4 x float> %vec) {
+; CHECK-LABEL: f29:
+; CHECK: wfcsb %f0, %v24
+; CHECK-NEXT: locgrne %r2, %r3
+; CHECK: br %r14
+  %f2 = extractelement <4 x float> %vec, i32 0
+  %cond = fcmp oeq float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/vec-ctpop-02.ll b/test/CodeGen/SystemZ/vec-ctpop-02.ll
new file mode 100644
index 000000000000..ee50e88d0430
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-ctpop-02.ll
@@ -0,0 +1,45 @@
+; Test vector population-count instruction on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+
+define <16 x i8> @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: vpopctb %v24, %v24
+; CHECK: br %r14
+
+  %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+  ret <16 x i8> %popcnt
+}
+
+define <8 x i16> @f2(<8 x i16> %a) {
+; CHECK-LABEL: f2:
+; CHECK: vpopcth %v24, %v24
+; CHECK: br %r14
+
+  %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+  ret <8 x i16> %popcnt
+}
+
+define <4 x i32> @f3(<4 x i32> %a) {
+; CHECK-LABEL: f3:
+; CHECK: vpopctf %v24, %v24
+; CHECK: br %r14
+
+  %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+  ret <4 x i32> %popcnt
+}
+
+define <2 x i64> @f4(<2 x i64> %a) {
+; CHECK-LABEL: f4:
+; CHECK: vpopctg %v24, %v24
+; CHECK: br %r14
+
+  %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+  ret <2 x i64> %popcnt
+}
+
diff --git a/test/CodeGen/SystemZ/vec-div-02.ll b/test/CodeGen/SystemZ/vec-div-02.ll
new file mode 100644
index 000000000000..74e3b5148ad5
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-div-02.ll
@@ -0,0 +1,24 @@
+; Test vector division on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v4f32 division.
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vfdsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = fdiv <4 x float> %val1, %val2
+  ret <4 x float> %ret
+}
+
+; Test an f32 division that uses vector registers.
+define float @f2(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: wfdsb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <4 x float> %val1, i32 0
+  %scalar2 = extractelement <4 x float> %val2, i32 0
+  %ret = fdiv float %scalar1, %scalar2
+  ret float %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-intrinsics.ll b/test/CodeGen/SystemZ/vec-intrinsics-01.ll
similarity index 100%
rename from test/CodeGen/SystemZ/vec-intrinsics.ll
rename to test/CodeGen/SystemZ/vec-intrinsics-01.ll
diff --git a/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/test/CodeGen/SystemZ/vec-intrinsics-02.ll
new file mode 100644
index 000000000000..84c6a0784031
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-intrinsics-02.ll
@@ -0,0 +1,441 @@
+; Test vector intrinsics added with z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare <2 x i64> @llvm.s390.vbperm(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.s390.vmslg(<2 x i64>, <2 x i64>, <16 x i8>, i32)
+declare <16 x i8> @llvm.s390.vlrl(i32, i8 *)
+declare void @llvm.s390.vstrl(<16 x i8>, i32, i8 *)
+
+declare {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float>, <4 x float>)
+declare {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float>, <4 x float>)
+declare {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float>, <4 x float>)
+declare {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float>, i32)
+declare <4 x float> @llvm.s390.vfisb(<4 x float>, i32, i32)
+
+declare <2 x double> @llvm.s390.vfmaxdb(<2 x double>, <2 x double>, i32)
+declare <2 x double> @llvm.s390.vfmindb(<2 x double>, <2 x double>, i32)
+declare <4 x float> @llvm.s390.vfmaxsb(<4 x float>, <4 x float>, i32)
+declare <4 x float> @llvm.s390.vfminsb(<4 x float>, <4 x float>, i32)
+
+; VBPERM.
+define <2 x i64> @test_vbperm(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vbperm:
+; CHECK: vbperm %v24, %v24, %v26
+; CHECK: br %r14
+  %res = call <2 x i64> @llvm.s390.vbperm(<16 x i8> %a, <16 x i8> %b)
+  ret <2 x i64> %res
+}
+
+; VMSLG with no shifts.
+define <16 x i8> @test_vmslg1(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmslg1:
+; CHECK: vmslg %v24, %v24, %v26, %v28, 0
+; CHECK: br %r14
+  %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 0)
+  ret <16 x i8> %res
+}
+
+; VMSLG with both shifts.
+define <16 x i8> @test_vmslg2(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) {
+; CHECK-LABEL: test_vmslg2:
+; CHECK: vmslg %v24, %v24, %v26, %v28, 12
+; CHECK: br %r14
+  %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 12)
+  ret <16 x i8> %res
+}
+
+; VLRLR with the lowest in-range displacement.
+define <16 x i8> @test_vlrlr1(i8 *%ptr, i32 %length) {
+; CHECK-LABEL: test_vlrlr1:
+; CHECK: vlrlr %v24, %r3, 0(%r2)
+; CHECK: br %r14
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VLRLR with the highest in-range displacement.
+define <16 x i8> @test_vlrlr2(i8 *%base, i32 %length) {
+; CHECK-LABEL: test_vlrlr2:
+; CHECK: vlrlr %v24, %r3, 4095(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4095
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VLRLR with an out-of-range displacement.
+define <16 x i8> @test_vlrlr3(i8 *%base, i32 %length) {
+; CHECK-LABEL: test_vlrlr3:
+; CHECK: vlrlr %v24, %r3, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4096
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; Check that VLRLR doesn't allow an index.
+define <16 x i8> @test_vlrlr4(i8 *%base, i64 %index, i32 %length) {
+; CHECK-LABEL: test_vlrlr4:
+; CHECK: vlrlr %v24, %r4, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 %index
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 %length, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VLRL with the lowest in-range displacement.
+define <16 x i8> @test_vlrl1(i8 *%ptr) {
+; CHECK-LABEL: test_vlrl1:
+; CHECK: vlrl %v24, 0(%r2), 0
+; CHECK: br %r14
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VLRL with the highest in-range displacement.
+define <16 x i8> @test_vlrl2(i8 *%base) {
+; CHECK-LABEL: test_vlrl2:
+; CHECK: vlrl %v24, 4095(%r2), 0
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4095
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VLRL with an out-of-range displacement.
+define <16 x i8> @test_vlrl3(i8 *%base) {
+; CHECK-LABEL: test_vlrl3:
+; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4096
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; Check that VLRL doesn't allow an index.
+define <16 x i8> @test_vlrl4(i8 *%base, i64 %index) {
+; CHECK-LABEL: test_vlrl4:
+; CHECK: vlrl %v24, 0({{%r[1-5]}}), 0
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 %index
+  %res = call <16 x i8> @llvm.s390.vlrl(i32 0, i8 *%ptr)
+  ret <16 x i8> %res
+}
+
+; VSTRLR with the lowest in-range displacement.
+define void @test_vstrlr1(<16 x i8> %vec, i8 *%ptr, i32 %length) {
+; CHECK-LABEL: test_vstrlr1:
+; CHECK: vstrlr %v24, %r3, 0(%r2)
+; CHECK: br %r14
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr)
+  ret void
+}
+
+; VSTRLR with the highest in-range displacement.
+define void @test_vstrlr2(<16 x i8> %vec, i8 *%base, i32 %length) {
+; CHECK-LABEL: test_vstrlr2:
+; CHECK: vstrlr %v24, %r3, 4095(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4095
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr)
+  ret void
+}
+
+; VSTRLR with an out-of-range displacement.
+define void @test_vstrlr3(<16 x i8> %vec, i8 *%base, i32 %length) {
+; CHECK-LABEL: test_vstrlr3:
+; CHECK: vstrlr %v24, %r3, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4096
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr)
+  ret void
+}
+
+; Check that VSTRLR doesn't allow an index.
+define void @test_vstrlr4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) {
+; CHECK-LABEL: test_vstrlr4:
+; CHECK: vstrlr %v24, %r4, 0({{%r[1-5]}})
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 %index
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 %length, i8 *%ptr)
+  ret void
+}
+
+; VSTRL with the lowest in-range displacement.
+define void @test_vstrl1(<16 x i8> %vec, i8 *%ptr) {
+; CHECK-LABEL: test_vstrl1:
+; CHECK: vstrl %v24, 0(%r2), 8
+; CHECK: br %r14
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr)
+  ret void
+}
+
+; VSTRL with the highest in-range displacement.
+define void @test_vstrl2(<16 x i8> %vec, i8 *%base) {
+; CHECK-LABEL: test_vstrl2:
+; CHECK: vstrl %v24, 4095(%r2), 8
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4095
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr)
+  ret void
+}
+
+; VSTRL with an out-of-range displacement.
+define void @test_vstrl3(<16 x i8> %vec, i8 *%base) {
+; CHECK-LABEL: test_vstrl3:
+; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 4096
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr)
+  ret void
+}
+
+; Check that VSTRL doesn't allow an index.
+define void @test_vstrl4(<16 x i8> %vec, i8 *%base, i64 %index) {
+; CHECK-LABEL: test_vstrl4:
+; CHECK: vstrl %v24, 0({{%r[1-5]}}), 8
+; CHECK: br %r14
+  %ptr = getelementptr i8, i8 *%base, i64 %index
+  call void @llvm.s390.vstrl(<16 x i8> %vec, i32 8, i8 *%ptr)
+  ret void
+}
+
+; VFCESBS with no processing of the result.
+define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfcesbs:
+; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  ret i32 %res
+}
+
+; VFCESBS, returning 1 if any elements are equal (CC != 3).
+define i32 @test_vfcesbs_any_bool(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfcesbs_any_bool:
+; CHECK: vfcesbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm %r2
+; CHECK: afi %r2, -536870912
+; CHECK: srl %r2, 31
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp ne i32 %res, 3
+  %ext = zext i1 %cmp to i32
+  ret i32 %ext
+}
+
+; VFCESBS, storing to %ptr if any elements are equal.
+define <4 x i32> @test_vfcesbs_any_store(<4 x float> %a, <4 x float> %b,
+                                         i32 *%ptr) {
+; CHECK-LABEL: test_vfcesbs_any_store:
+; CHECK-NOT: %r
+; CHECK: vfcesbs %v24, %v24, %v26
+; CHECK-NEXT: {{bor|bnler}} %r14
+; CHECK: mvhi 0(%r2), 0
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfcesbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 0
+  %cc = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp ule i32 %cc, 2
+  br i1 %cmp, label %store, label %exit
+
+store:
+  store i32 0, i32 *%ptr
+  br label %exit
+
+exit:
+  ret <4 x i32> %res
+}
+
+; VFCHSBS with no processing of the result.
+define i32 @test_vfchsbs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfchsbs:
+; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  ret i32 %res
+}
+
+; VFCHSBS, returning 1 if not all elements are higher.
+define i32 @test_vfchsbs_notall_bool(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfchsbs_notall_bool:
+; CHECK: vfchsbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: risblg %r2, [[REG]], 31, 159, 36
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp sge i32 %res, 1
+  %ext = zext i1 %cmp to i32
+  ret i32 %ext
+}
+
+; VFCHSBS, storing to %ptr if not all elements are higher.
+define <4 x i32> @test_vfchsbs_notall_store(<4 x float> %a, <4 x float> %b,
+                                            i32 *%ptr) {
+; CHECK-LABEL: test_vfchsbs_notall_store:
+; CHECK-NOT: %r
+; CHECK: vfchsbs %v24, %v24, %v26
+; CHECK-NEXT: {{bher|ber}} %r14
+; CHECK: mvhi 0(%r2), 0
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchsbs(<4 x float> %a,
+                                                   <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 0
+  %cc = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp ugt i32 %cc, 0
+  br i1 %cmp, label %store, label %exit
+
+store:
+  store i32 0, i32 *%ptr
+  br label %exit
+
+exit:
+  ret <4 x i32> %res
+}
+
+; VFCHESBS with no processing of the result.
+define i32 @test_vfchesbs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfchesbs:
+; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a,
+                                                    <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  ret i32 %res
+}
+
+; VFCHESBS, returning 1 if neither element is higher or equal.
+define i32 @test_vfchesbs_none_bool(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfchesbs_none_bool:
+; CHECK: vfchesbs {{%v[0-9]+}}, %v24, %v26
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: risblg %r2, [[REG]], 31, 159, 35
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a,
+                                                    <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp eq i32 %res, 3
+  %ext = zext i1 %cmp to i32
+  ret i32 %ext
+}
+
+; VFCHESBS, storing to %ptr if neither element is higher or equal.
+define <4 x i32> @test_vfchesbs_none_store(<4 x float> %a, <4 x float> %b,
+                                           i32 *%ptr) {
+; CHECK-LABEL: test_vfchesbs_none_store:
+; CHECK-NOT: %r
+; CHECK: vfchesbs %v24, %v24, %v26
+; CHECK-NEXT: {{bnor|bler}} %r14
+; CHECK: mvhi 0(%r2), 0
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vfchesbs(<4 x float> %a,
+                                                    <4 x float> %b)
+  %res = extractvalue {<4 x i32>, i32} %call, 0
+  %cc = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp uge i32 %cc, 3
+  br i1 %cmp, label %store, label %exit
+
+store:
+  store i32 0, i32 *%ptr
+  br label %exit
+
+exit:
+  ret <4 x i32> %res
+}
+
+; VFTCISB with the lowest useful class selector and no processing of the result.
+define i32 @test_vftcisb(<4 x float> %a) {
+; CHECK-LABEL: test_vftcisb:
+; CHECK: vftcisb {{%v[0-9]+}}, %v24, 1
+; CHECK: ipm %r2
+; CHECK: srl %r2, 28
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 1)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  ret i32 %res
+}
+
+; VFTCISB with the highest useful class selector, returning 1 if all elements
+; have the right class (CC == 0).
+define i32 @test_vftcisb_all_bool(<4 x float> %a) {
+; CHECK-LABEL: test_vftcisb_all_bool:
+; CHECK: vftcisb {{%v[0-9]+}}, %v24, 4094
+; CHECK: afi %r2, -268435456
+; CHECK: srl %r2, 31
+; CHECK: br %r14
+  %call = call {<4 x i32>, i32} @llvm.s390.vftcisb(<4 x float> %a, i32 4094)
+  %res = extractvalue {<4 x i32>, i32} %call, 1
+  %cmp = icmp eq i32 %res, 0
+  %ext = zext i1 %cmp to i32
+  ret i32 %ext
+}
+
+; VFISB with a rounding mode not usable via standard intrinsics.
+define <4 x float> @test_vfisb_0_4(<4 x float> %a) {
+; CHECK-LABEL: test_vfisb_0_4:
+; CHECK: vfisb %v24, %v24, 0, 4
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 0, i32 4)
+  ret <4 x float> %res
+}
+
+; VFISB with IEEE-inexact exception suppressed.
+define <4 x float> @test_vfisb_4_0(<4 x float> %a) {
+; CHECK-LABEL: test_vfisb_4_0:
+; CHECK: vfisb %v24, %v24, 4, 0
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.s390.vfisb(<4 x float> %a, i32 4, i32 0)
+  ret <4 x float> %res
+}
+
+; VFMAXDB.
+define <2 x double> @test_vfmaxdb(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_vfmaxdb:
+; CHECK: vfmaxdb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %res = call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %a, <2 x double> %b, i32 4)
+  ret <2 x double> %res
+}
+
+; VFMINDB.
+define <2 x double> @test_vfmindb(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_vfmindb:
+; CHECK: vfmindb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %res = call <2 x double> @llvm.s390.vfmindb(<2 x double> %a, <2 x double> %b, i32 4)
+  ret <2 x double> %res
+}
+
+; VFMAXSB.
+define <4 x float> @test_vfmaxsb(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfmaxsb:
+; CHECK: vfmaxsb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %a, <4 x float> %b, i32 4)
+  ret <4 x float> %res
+}
+
+; VFMINSB.
+define <4 x float> @test_vfminsb(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_vfminsb:
+; CHECK: vfminsb %v24, %v24, %v26, 4
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.s390.vfminsb(<4 x float> %a, <4 x float> %b, i32 4)
+  ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/SystemZ/vec-max-05.ll b/test/CodeGen/SystemZ/vec-max-05.ll
new file mode 100644
index 000000000000..591d3bf36f16
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-max-05.ll
@@ -0,0 +1,175 @@
+; Test vector maximum on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare double @fmax(double, double)
+declare double @llvm.maxnum.f64(double, double)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
+
+declare float @fmaxf(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+
+declare fp128 @fmaxl(fp128, fp128)
+declare fp128 @llvm.maxnum.f128(fp128, fp128)
+
+; Test the fmax library function.
+define double @f1(double %dummy, double %val1, double %val2) {
+; CHECK-LABEL: f1:
+; CHECK: wfmaxdb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call double @fmax(double %val1, double %val2) readnone
+  ret double %ret
+}
+
+; Test the f64 maxnum intrinsic.
+define double @f2(double %dummy, double %val1, double %val2) {
+; CHECK-LABEL: f2:
+; CHECK: wfmaxdb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call double @llvm.maxnum.f64(double %val1, double %val2)
+  ret double %ret
+}
+
+; Test a f64 constant compare/select resulting in maxnum.
+define double @f3(double %dummy, double %val) {
+; CHECK-LABEL: f3:
+; CHECK: lzdr [[REG:%f[0-9]+]]
+; CHECK: wfmaxdb %f0, %f2, [[REG]], 4
+; CHECK: br %r14
+  %cmp = fcmp ogt double %val, 0.0
+  %ret = select i1 %cmp, double %val, double 0.0
+  ret double %ret
+}
+
+; Test a f64 constant compare/select resulting in maxnan.
+define double @f4(double %dummy, double %val) {
+; CHECK-LABEL: f4:
+; CHECK: lzdr [[REG:%f[0-9]+]]
+; CHECK: wfmaxdb %f0, %f2, [[REG]], 1
+; CHECK: br %r14
+  %cmp = fcmp ugt double %val, 0.0
+  %ret = select i1 %cmp, double %val, double 0.0
+  ret double %ret
+}
+
+; Test the v2f64 maxnum intrinsic.
+define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
+                        <2 x double> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vfmaxdb %v24, %v26, %v28, 4
+; CHECK: br %r14
+  %ret = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %val1, <2 x double> %val2)
+  ret <2 x double> %ret
+}
+
+; Test the fmaxf library function.
+define float @f11(float %dummy, float %val1, float %val2) {
+; CHECK-LABEL: f11:
+; CHECK: wfmaxsb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call float @fmaxf(float %val1, float %val2) readnone
+  ret float %ret
+}
+
+; Test the f32 maxnum intrinsic.
+define float @f12(float %dummy, float %val1, float %val2) {
+; CHECK-LABEL: f12:
+; CHECK: wfmaxsb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call float @llvm.maxnum.f32(float %val1, float %val2)
+  ret float %ret
+}
+
+; Test a f32 constant compare/select resulting in maxnum.
+define float @f13(float %dummy, float %val) {
+; CHECK-LABEL: f13:
+; CHECK: lzer [[REG:%f[0-9]+]]
+; CHECK: wfmaxsb %f0, %f2, [[REG]], 4
+; CHECK: br %r14
+  %cmp = fcmp ogt float %val, 0.0
+  %ret = select i1 %cmp, float %val, float 0.0
+  ret float %ret
+}
+
+; Test a f32 constant compare/select resulting in maxnan.
+define float @f14(float %dummy, float %val) {
+; CHECK-LABEL: f14:
+; CHECK: lzer [[REG:%f[0-9]+]]
+; CHECK: wfmaxsb %f0, %f2, [[REG]], 1
+; CHECK: br %r14
+  %cmp = fcmp ugt float %val, 0.0
+  %ret = select i1 %cmp, float %val, float 0.0
+  ret float %ret
+}
+
+; Test the v4f32 maxnum intrinsic.
+define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1,
+                        <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vfmaxsb %v24, %v26, %v28, 4
+; CHECK: br %r14
+  %ret = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %val1, <4 x float> %val2)
+  ret <4 x float> %ret
+}
+
+; Test the fmaxl library function.
+define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
+; CHECK-LABEL: f21:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3)
+; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r4)
+; CHECK: br %r14
+  %val1 = load fp128, fp128* %ptr1
+  %val2 = load fp128, fp128* %ptr2
+  %res = call fp128 @fmaxl(fp128 %val1, fp128 %val2) readnone
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test the f128 maxnum intrinsic.
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
+; CHECK-LABEL: f22:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3)
+; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r4)
+; CHECK: br %r14
+  %val1 = load fp128, fp128* %ptr1
+  %val2 = load fp128, fp128* %ptr2
+  %res = call fp128 @llvm.maxnum.f128(fp128 %val1, fp128 %val2)
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test a f128 constant compare/select resulting in maxnum.
+define void @f23(fp128 *%ptr, fp128 *%dst) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vzero [[REG2:%v[0-9]+]]
+; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r3)
+; CHECK: br %r14
+  %val = load fp128, fp128* %ptr
+  %cmp = fcmp ogt fp128 %val, 0xL00000000000000000000000000000000
+  %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test a f128 constant compare/select resulting in maxnan.
+define void @f24(fp128 *%ptr, fp128 *%dst) {
+; CHECK-LABEL: f24:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vzero [[REG2:%v[0-9]+]]
+; CHECK: wfmaxxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1
+; CHECK: vst [[RES]], 0(%r3)
+; CHECK: br %r14
+  %val = load fp128, fp128* %ptr
+  %cmp = fcmp ugt fp128 %val, 0xL00000000000000000000000000000000
+  %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
diff --git a/test/CodeGen/SystemZ/vec-min-05.ll b/test/CodeGen/SystemZ/vec-min-05.ll
new file mode 100644
index 000000000000..3eef9016cd08
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-min-05.ll
@@ -0,0 +1,175 @@
+; Test vector minimum on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare double @fmin(double, double)
+declare double @llvm.minnum.f64(double, double)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+
+declare float @fminf(float, float)
+declare float @llvm.minnum.f32(float, float)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+
+declare fp128 @fminl(fp128, fp128)
+declare fp128 @llvm.minnum.f128(fp128, fp128)
+
+; Test the fmin library function.
+define double @f1(double %dummy, double %val1, double %val2) {
+; CHECK-LABEL: f1:
+; CHECK: wfmindb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call double @fmin(double %val1, double %val2) readnone
+  ret double %ret
+}
+
+; Test the f64 minnum intrinsic.
+define double @f2(double %dummy, double %val1, double %val2) {
+; CHECK-LABEL: f2:
+; CHECK: wfmindb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call double @llvm.minnum.f64(double %val1, double %val2)
+  ret double %ret
+}
+
+; Test a f64 constant compare/select resulting in minnum.
+define double @f3(double %dummy, double %val) {
+; CHECK-LABEL: f3:
+; CHECK: lzdr [[REG:%f[0-9]+]]
+; CHECK: wfmindb %f0, %f2, [[REG]], 4
+; CHECK: br %r14
+  %cmp = fcmp olt double %val, 0.0
+  %ret = select i1 %cmp, double %val, double 0.0
+  ret double %ret
+}
+
+; Test a f64 constant compare/select resulting in minnan.
+define double @f4(double %dummy, double %val) {
+; CHECK-LABEL: f4:
+; CHECK: lzdr [[REG:%f[0-9]+]]
+; CHECK: wfmindb %f0, %f2, [[REG]], 1
+; CHECK: br %r14
+  %cmp = fcmp ult double %val, 0.0
+  %ret = select i1 %cmp, double %val, double 0.0
+  ret double %ret
+}
+
+; Test the v2f64 minnum intrinsic.
+define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
+                        <2 x double> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: vfmindb %v24, %v26, %v28, 4
+; CHECK: br %r14
+  %ret = call <2 x double> @llvm.minnum.v2f64(<2 x double> %val1, <2 x double> %val2)
+  ret <2 x double> %ret
+}
+
+; Test the fminf library function.
+define float @f11(float %dummy, float %val1, float %val2) {
+; CHECK-LABEL: f11:
+; CHECK: wfminsb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call float @fminf(float %val1, float %val2) readnone
+  ret float %ret
+}
+
+; Test the f32 minnum intrinsic.
+define float @f12(float %dummy, float %val1, float %val2) {
+; CHECK-LABEL: f12:
+; CHECK: wfminsb %f0, %f2, %f4, 4
+; CHECK: br %r14
+  %ret = call float @llvm.minnum.f32(float %val1, float %val2)
+  ret float %ret
+}
+
+; Test a f32 constant compare/select resulting in minnum.
+define float @f13(float %dummy, float %val) {
+; CHECK-LABEL: f13:
+; CHECK: lzer [[REG:%f[0-9]+]]
+; CHECK: wfminsb %f0, %f2, [[REG]], 4
+; CHECK: br %r14
+  %cmp = fcmp olt float %val, 0.0
+  %ret = select i1 %cmp, float %val, float 0.0
+  ret float %ret
+}
+
+; Test a f32 constant compare/select resulting in minnan.
+define float @f14(float %dummy, float %val) {
+; CHECK-LABEL: f14:
+; CHECK: lzer [[REG:%f[0-9]+]]
+; CHECK: wfminsb %f0, %f2, [[REG]], 1
+; CHECK: br %r14
+  %cmp = fcmp ult float %val, 0.0
+  %ret = select i1 %cmp, float %val, float 0.0
+  ret float %ret
+}
+
+; Test the v4f32 minnum intrinsic.
+define <4 x float> @f15(<4 x float> %dummy, <4 x float> %val1,
+                        <4 x float> %val2) {
+; CHECK-LABEL: f15:
+; CHECK: vfminsb %v24, %v26, %v28, 4
+; CHECK: br %r14
+  %ret = call <4 x float> @llvm.minnum.v4f32(<4 x float> %val1, <4 x float> %val2)
+  ret <4 x float> %ret
+}
+
+; Test the fminl library function.
+define void @f21(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
+; CHECK-LABEL: f21:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3)
+; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r4)
+; CHECK: br %r14
+  %val1 = load fp128, fp128* %ptr1
+  %val2 = load fp128, fp128* %ptr2
+  %res = call fp128 @fminl(fp128 %val1, fp128 %val2) readnone
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test the f128 minnum intrinsic.
+define void @f22(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
+; CHECK-LABEL: f22:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3)
+; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r4)
+; CHECK: br %r14
+  %val1 = load fp128, fp128* %ptr1
+  %val2 = load fp128, fp128* %ptr2
+  %res = call fp128 @llvm.minnum.f128(fp128 %val1, fp128 %val2)
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test a f128 constant compare/select resulting in minnum.
+define void @f23(fp128 *%ptr, fp128 *%dst) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vzero [[REG2:%v[0-9]+]]
+; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 4
+; CHECK: vst [[RES]], 0(%r3)
+; CHECK: br %r14
+  %val = load fp128, fp128* %ptr
+  %cmp = fcmp olt fp128 %val, 0xL00000000000000000000000000000000
+  %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
+; Test a f128 constant compare/select resulting in minnan.
+define void @f24(fp128 *%ptr, fp128 *%dst) {
+; CHECK-LABEL: f24:
+; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK-DAG: vzero [[REG2:%v[0-9]+]]
+; CHECK: wfminxb [[RES:%v[0-9]+]], [[REG1]], [[REG2]], 1
+; CHECK: vst [[RES]], 0(%r3)
+; CHECK: br %r14
+  %val = load fp128, fp128* %ptr
+  %cmp = fcmp ult fp128 %val, 0xL00000000000000000000000000000000
+  %res = select i1 %cmp, fp128 %val, fp128 0xL00000000000000000000000000000000
+  store fp128 %res, fp128* %dst
+  ret void
+}
+
diff --git a/test/CodeGen/SystemZ/vec-move-18.ll b/test/CodeGen/SystemZ/vec-move-18.ll
new file mode 100644
index 000000000000..5d3d09d83ef1
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-18.ll
@@ -0,0 +1,24 @@
+; Test insertions of memory values into 0 on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test VLLEZLF.
+define <4 x i32> @f1(i32 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: vllezlf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load i32, i32 *%ptr
+  %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
+  ret <4 x i32> %ret
+}
+
+; Test VLLEZLF with a float.
+define <4 x float> @f2(float *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: vllezlf %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load float, float *%ptr
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0
+  ret <4 x float> %ret
+}
+
diff --git a/test/CodeGen/SystemZ/vec-mul-03.ll b/test/CodeGen/SystemZ/vec-mul-03.ll
new file mode 100644
index 000000000000..3733db9fb339
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-mul-03.ll
@@ -0,0 +1,24 @@
+; Test vector multiplication on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v4f32 multiplication.
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: vfmsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = fmul <4 x float> %val1, %val2
+  ret <4 x float> %ret
+}
+
+; Test an f32 multiplication that uses vector registers.
+define float @f2(<4 x float> %val1, <4 x float> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: wfmsb %f0, %v24, %v26
+; CHECK: br %r14
+  %scalar1 = extractelement <4 x float> %val1, i32 0
+  %scalar2 = extractelement <4 x float> %val2, i32 0
+  %ret = fmul float %scalar1, %scalar2
+  ret float %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-mul-04.ll b/test/CodeGen/SystemZ/vec-mul-04.ll
new file mode 100644
index 000000000000..d96f0b6a745a
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-mul-04.ll
@@ -0,0 +1,31 @@
+; Test vector multiply-and-add on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; Test a v4f32 multiply-and-add.
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2, <4 x float> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: vfmasb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1,
+                                           <4 x float> %val2,
+                                           <4 x float> %val3)
+  ret <4 x float> %ret
+}
+
+; Test a v4f32 multiply-and-subtract.
+define <4 x float> @f2(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2, <4 x float> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: vfmssb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %negval3 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %val3
+  %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1,
+                                           <4 x float> %val2,
+                                           <4 x float> %negval3)
+  ret <4 x float> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-mul-05.ll b/test/CodeGen/SystemZ/vec-mul-05.ll
new file mode 100644
index 000000000000..90a1f7a7efdf
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-mul-05.ll
@@ -0,0 +1,63 @@
+; Test vector negative multiply-and-add on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+; Test a v2f64 negative multiply-and-add.
+define <2 x double> @f1(<2 x double> %dummy, <2 x double> %val1,
+                        <2 x double> %val2, <2 x double> %val3) {
+; CHECK-LABEL: f1:
+; CHECK: vfnmadb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1,
+                                            <2 x double> %val2,
+                                            <2 x double> %val3)
+  %negret = fsub <2 x double> <double -0.0, double -0.0>, %ret
+  ret <2 x double> %negret
+}
+
+; Test a v2f64 negative multiply-and-subtract.
+define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1,
+                        <2 x double> %val2, <2 x double> %val3) {
+; CHECK-LABEL: f2:
+; CHECK: vfnmsdb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %negval3 = fsub <2 x double> <double -0.0, double -0.0>, %val3
+  %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1,
+                                            <2 x double> %val2,
+                                            <2 x double> %negval3)
+  %negret = fsub <2 x double> <double -0.0, double -0.0>, %ret
+  ret <2 x double> %negret
+}
+
+; Test a v4f32 negative multiply-and-add.
+define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2, <4 x float> %val3) {
+; CHECK-LABEL: f3:
+; CHECK: vfnmasb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1,
+                                           <4 x float> %val2,
+                                           <4 x float> %val3)
+  %negret = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %ret
+  ret <4 x float> %negret
+}
+
+; Test a v4f32 negative multiply-and-subtract.
+define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1,
+                       <4 x float> %val2, <4 x float> %val3) {
+; CHECK-LABEL: f4:
+; CHECK: vfnmssb %v24, %v26, %v28, %v30
+; CHECK: br %r14
+  %negval3 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %val3
+  %ret = call <4 x float> @llvm.fma.v4f32 (<4 x float> %val1,
+                                           <4 x float> %val2,
+                                           <4 x float> %negval3)
+  %negret = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %ret
+  ret <4 x float> %negret
+}
diff --git a/test/CodeGen/SystemZ/vec-neg-02.ll b/test/CodeGen/SystemZ/vec-neg-02.ll
new file mode 100644
index 000000000000..07ce037542fd
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-neg-02.ll
@@ -0,0 +1,23 @@
+; Test vector negation on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v4f32 negation.
+define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vflcsb %v24, %v26
+; CHECK: br %r14
+  %ret = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %val
+  ret <4 x float> %ret
+}
+
+; Test an f32 negation that uses vector registers.
+define float @f2(<4 x float> %val) {
+; CHECK-LABEL: f2:
+; CHECK: wflcsb %f0, %v24
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %ret = fsub float -0.0, %scalar
+  ret float %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-or-03.ll b/test/CodeGen/SystemZ/vec-or-03.ll
new file mode 100644
index 000000000000..010629d880d1
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-or-03.ll
@@ -0,0 +1,91 @@
+; Test vector OR-NOT on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+; Test a v16i8 OR-NOT.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f1:
+; CHECK: voc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <16 x i8> %val2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %ret = or <16 x i8> %val1, %not
+  ret <16 x i8> %ret
+}
+
+; ...and again with the reverse.
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
+; CHECK-LABEL: f2:
+; CHECK: voc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <16 x i8> %val1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %ret = or <16 x i8> %not, %val2
+  ret <16 x i8> %ret
+}
+
+; Test a v8i16 OR-NOT.
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f3:
+; CHECK: voc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <8 x i16> %val2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %ret = or <8 x i16> %val1, %not
+  ret <8 x i16> %ret
+}
+
+; ...and again with the reverse.
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
+; CHECK-LABEL: f4:
+; CHECK: voc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <8 x i16> %val1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %ret = or <8 x i16> %not, %val2
+  ret <8 x i16> %ret
+}
+
+; Test a v4i32 OR-NOT.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f5:
+; CHECK: voc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <4 x i32> %val2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = or <4 x i32> %val1, %not
+  ret <4 x i32> %ret
+}
+
+; ...and again with the reverse.
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
+; CHECK-LABEL: f6:
+; CHECK: voc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <4 x i32> %val1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %ret = or <4 x i32> %not, %val2
+  ret <4 x i32> %ret
+}
+
+; Test a v2i64 OR-NOT.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: voc %v24, %v26, %v28
+; CHECK: br %r14
+  %not = xor <2 x i64> %val2, <i64 -1, i64 -1>
+  %ret = or <2 x i64> %val1, %not
+  ret <2 x i64> %ret
+}
+
+; ...and again with the reverse.
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: voc %v24, %v28, %v26
+; CHECK: br %r14
+  %not = xor <2 x i64> %val1, <i64 -1, i64 -1>
+  %ret = or <2 x i64> %not, %val2
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-round-02.ll b/test/CodeGen/SystemZ/vec-round-02.ll
new file mode 100644
index 000000000000..bcd66ea803d1
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-round-02.ll
@@ -0,0 +1,118 @@
+; Test v4f32 rounding on z14.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+declare float @llvm.rint.f32(float)
+declare float @llvm.nearbyint.f32(float)
+declare float @llvm.floor.f32(float)
+declare float @llvm.ceil.f32(float)
+declare float @llvm.trunc.f32(float)
+declare float @llvm.round.f32(float)
+declare <4 x float> @llvm.rint.v4f32(<4 x float>)
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
+
+define <4 x float> @f1(<4 x float> %val) {
+; CHECK-LABEL: f1:
+; CHECK: vfisb %v24, %v24, 0, 0
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define <4 x float> @f2(<4 x float> %val) {
+; CHECK-LABEL: f2:
+; CHECK: vfisb %v24, %v24, 4, 0
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define <4 x float> @f3(<4 x float> %val) {
+; CHECK-LABEL: f3:
+; CHECK: vfisb %v24, %v24, 4, 7
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define <4 x float> @f4(<4 x float> %val) {
+; CHECK-LABEL: f4:
+; CHECK: vfisb %v24, %v24, 4, 6
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define <4 x float> @f5(<4 x float> %val) {
+; CHECK-LABEL: f5:
+; CHECK: vfisb %v24, %v24, 4, 5
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define <4 x float> @f6(<4 x float> %val) {
+; CHECK-LABEL: f6:
+; CHECK: vfisb %v24, %v24, 4, 1
+; CHECK: br %r14
+  %res = call <4 x float> @llvm.round.v4f32(<4 x float> %val)
+  ret <4 x float> %res
+}
+
+define float @f7(<4 x float> %val) {
+; CHECK-LABEL: f7:
+; CHECK: wfisb %f0, %v24, 0, 0
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.rint.f32(float %scalar)
+  ret float %res
+}
+
+define float @f8(<4 x float> %val) {
+; CHECK-LABEL: f8:
+; CHECK: wfisb %f0, %v24, 4, 0
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.nearbyint.f32(float %scalar)
+  ret float %res
+}
+
+define float @f9(<4 x float> %val) {
+; CHECK-LABEL: f9:
+; CHECK: wfisb %f0, %v24, 4, 7
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.floor.f32(float %scalar)
+  ret float %res
+}
+
+define float @f10(<4 x float> %val) {
+; CHECK-LABEL: f10:
+; CHECK: wfisb %f0, %v24, 4, 6
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.ceil.f32(float %scalar)
+  ret float %res
+}
+
+define float @f11(<4 x float> %val) {
+; CHECK-LABEL: f11:
+; CHECK: wfisb %f0, %v24, 4, 5
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.trunc.f32(float %scalar)
+  ret float %res
+}
+
+define float @f12(<4 x float> %val) {
+; CHECK-LABEL: f12:
+; CHECK: wfisb %f0, %v24, 4, 1
+; CHECK: br %r14
+  %scalar = extractelement <4 x float> %val, i32 0
+  %res = call float @llvm.round.f32(float %scalar)
+  ret float %res
+}
diff --git a/test/CodeGen/SystemZ/vec-sqrt-02.ll b/test/CodeGen/SystemZ/vec-sqrt-02.ll
new file mode 100644
index 000000000000..6970d9db6698
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-sqrt-02.ll
@@ -0,0 +1,23 @@
+; Test f32 and v4f32 square root on z14.
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare float @llvm.sqrt.f32(float) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define <4 x float> @f1(<4 x float> %val) { +; CHECK-LABEL: f1: +; CHECK: vfsqsb %v24, %v24 +; CHECK: br %r14 + %ret = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %val) + ret <4 x float> %ret +} + +define float @f2(<4 x float> %val) { +; CHECK-LABEL: f2: +; CHECK: wfsqsb %f0, %v24 +; CHECK: br %r14 + %scalar = extractelement <4 x float> %val, i32 0 + %ret = call float @llvm.sqrt.f32(float %scalar) + ret float %ret +} diff --git a/test/CodeGen/SystemZ/vec-sub-02.ll b/test/CodeGen/SystemZ/vec-sub-02.ll new file mode 100644 index 000000000000..83c76b5d4aa6 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-sub-02.ll @@ -0,0 +1,31 @@ +; Test vector subtraction on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v4f32 subtraction. +define <4 x float> @f6(<4 x float> %dummy, <4 x float> %val1, + <4 x float> %val2) { +; CHECK-LABEL: f6: +; CHECK: vfssb %v24, %v26, %v28 +; CHECK: br %r14 + %ret = fsub <4 x float> %val1, %val2 + ret <4 x float> %ret +} + +; Test an f32 subtraction that uses vector registers. +define float @f7(<4 x float> %val1, <4 x float> %val2) { +; CHECK-LABEL: f7: +; CHECK: wfssb %f0, %v24, %v26 +; CHECK: br %r14 + %scalar1 = extractelement <4 x float> %val1, i32 0 + %scalar2 = extractelement <4 x float> %val2, i32 0 + %ret = fsub float %scalar1, %scalar2 + ret float %ret +} + +; Test a v2f32 subtraction, which gets promoted to v4f32. +define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) { +; No particular output expected, but must compile. + %ret = fsub <2 x float> %val1, %val2 + ret <2 x float> %ret +} diff --git a/test/CodeGen/SystemZ/vec-xor-02.ll b/test/CodeGen/SystemZ/vec-xor-02.ll new file mode 100644 index 000000000000..b4b5a96ba254 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-xor-02.ll @@ -0,0 +1,47 @@ +; Test vector NOT-XOR on z14. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Test a v16i8 NOT-XOR. +define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { +; CHECK-LABEL: f1: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <16 x i8> %val1, %val2 + %not = xor <16 x i8> %ret, + ret <16 x i8> %not +} + +; Test a v8i16 NOT-XOR. +define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { +; CHECK-LABEL: f2: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <8 x i16> %val1, %val2 + %not = xor <8 x i16> %ret, + ret <8 x i16> %not +} + +; Test a v4i32 NOT-XOR. +define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { +; CHECK-LABEL: f3: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <4 x i32> %val1, %val2 + %not = xor <4 x i32> %ret, + ret <4 x i32> %not +} + +; Test a v2i64 NOT-XOR. +define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { +; CHECK-LABEL: f4: +; CHECK: vnx %v24, %v26, %v28 +; CHECK: br %r14 + %ret = xor <2 x i64> %val1, %val2 + %not = xor <2 x i64> %ret, + ret <2 x i64> %not +} diff --git a/test/CodeGen/Thumb/litpoolremat.ll b/test/CodeGen/Thumb/litpoolremat.ll new file mode 100644 index 000000000000..6ed9b0c2a7ce --- /dev/null +++ b/test/CodeGen/Thumb/litpoolremat.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s + +declare void @consume_value(i32) #1 + +declare i32 @get_value(...) 
#1 + +declare void @consume_three_values(i32, i32, i32) #1 + +; Function Attrs: nounwind uwtable +define void @should_not_spill() #0 { + tail call void @consume_value(i32 1764) #2 + %1 = tail call i32 (...) @get_value() #2 + %2 = tail call i32 (...) @get_value() #2 + %3 = tail call i32 (...) @get_value() #2 + tail call void @consume_value(i32 %1) #2 + tail call void @consume_value(i32 %2) #2 + tail call void @consume_value(i32 %3) #2 + tail call void @consume_value(i32 1764) #2 + tail call void @consume_three_values(i32 %1, i32 %2, i32 %3) #2 + ret void +} + +; CHECK: ldr r0, LCPI0_0 +; CHECK-NOT: str r0 +; CHECK: bl +; CHECK: ldr r0, LCPI0_0 +; CHECK-LABEL: LCPI0_0: +; CHECK-NEXT: .long 1764 diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll index fe69a39e350c..75dbeab5ad0f 100644 --- a/test/CodeGen/Thumb/select.ll +++ b/test/CodeGen/Thumb/select.ll @@ -74,9 +74,9 @@ define double @f7(double %a, double %b) { } ; CHECK-LABEL: f7: ; CHECK: blt -; CHECK: blt +; CHECK: {{blt|bge}} ; CHECK: __ltdf2 ; CHECK-EABI-LABEL: f7: ; CHECK-EABI: __aeabi_dcmplt ; CHECK-EABI: bne -; CHECK-EABI: bne +; CHECK-EABI: {{bne|beq}} diff --git a/test/CodeGen/WebAssembly/indirect-import.ll b/test/CodeGen/WebAssembly/indirect-import.ll index 1bde65bcbbba..7cac31a2aef5 100644 --- a/test/CodeGen/WebAssembly/indirect-import.ll +++ b/test/CodeGen/WebAssembly/indirect-import.ll @@ -19,9 +19,9 @@ entry: %vs = alloca void (%struct.big*)*, align 4 %s = alloca void (%struct.big*)*, align 4 -; CHECK: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_fd@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_vj@FUNCTION store float (double)* @extern_fd, float (double)** %fd, align 4 -; CHECK: i32.const {{.+}}=, extern_vj@FUNCTION store void (i64)* @extern_vj, void (i64)** %vj, align 4 %0 = load void (i64)*, void (i64)** %vj, align 4 call void %0(i64 1) @@ -36,10 +36,9 @@ entry: %2 = load i32 (i64, i32, double, float)*, i32 (i64, i32, double, float)** %ijidf, align 4 %call = call i32 %2(i64 1, i32 2, double 3.000000e+00, float 4.000000e+00) -; CHECK: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_struct@FUNCTION +; CHECK-DAG: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_struct, void (%struct.big*)** %vs, align 4 - -; CHECK: i32.const {{.+}}=, extern_sret@FUNCTION store void (%struct.big*)* @extern_sret, void (%struct.big*)** %s, align 4 %3 = load float (double)*, float (double)** %fd, align 4 %4 = ptrtoint float (double)* %3 to i32 diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll index c160b391f6e8..2580771eb2cf 100644 --- a/test/CodeGen/WebAssembly/userstack.ll +++ b/test/CodeGen/WebAssembly/userstack.ll @@ -36,13 +36,13 @@ define void @alloca3264() { ; CHECK-NEXT: tee_local $push[[L5:.+]]=, [[SP:.+]], $pop[[L6]] %r1 = alloca i32 %r2 = alloca double - ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 - ; CHECK-NEXT: i32.store 12($pop[[L5]]), $pop[[L0]] store i32 0, i32* %r1 - ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} - ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 - ; CHECK-NEXT: i64.store 0($pop[[L2]]), $pop[[L1]] store double 0.0, double* %r2 + ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0 + ; CHECK-NEXT: i64.store 0($pop[[L5]]), $pop[[L1]] + ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}} + ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0 + ; CHECK-NEXT: i32.store 12($pop[[L2]]), $pop[[L0]] ; CHECK-NEXT: return ret void } diff --git 
a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll index 7da85d3a9a1d..fa71bffaf8c6 100644 --- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll +++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s ; ; Test scheduling a multi-use compare. We should neither spill flags ; nor clone the compare. diff --git a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll index 8d387136da9c..37f01845db79 100644 --- a/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll +++ b/test/CodeGen/X86/2009-06-03-Win64DisableRedZone.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s ; CHECK-NOT: -{{[1-9][0-9]*}}(%rsp) -define x86_64_win64cc x86_fp80 @a(i64 %x) nounwind readnone { +define win64cc x86_fp80 @a(i64 %x) nounwind readnone { entry: %conv = sitofp i64 %x to x86_fp80 ; [#uses=1] ret x86_fp80 %conv diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll index ba5de8eb5fcb..e812cbe3270a 100644 --- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -83,10 +83,11 @@ define void @full_test() { ; X32-NEXT: cmpeqps %xmm2, %xmm1 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; X32-NEXT: extractps $1, %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll index 9dff4e596caa..72807922a22b 100644 --- a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll +++ b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck --check-prefix=CHECK %s +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0); diff --git a/test/CodeGen/X86/alias-static-alloca.ll b/test/CodeGen/X86/alias-static-alloca.ll new file mode 100644 index 000000000000..f4ca7e39f4fc --- /dev/null +++ b/test/CodeGen/X86/alias-static-alloca.ll @@ -0,0 +1,37 @@ +; RUN: llc -o - -mtriple=x86_64-linux-gnu %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; We should be able to bypass the load values to their corresponding +; stores here. 
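+;
+; A sketch of the expectation: each load below reads exactly the bytes of
+; an earlier store to its own distinct alloca, so SelectionDAG alias
+; analysis should forward the stored argument registers straight into the
+; adds. The four argument stores still appear (in no fixed order, hence
+; CHECK-DAG), but no reload from the stack should be emitted, e.g.:
+;
+;   store i32 %b, i32* %a1
+;   %l1 = load i32, i32* %a1   ; folds to %b, no movl from -8(%rsp)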
+ +; CHECK-LABEL: foo +; CHECK-DAG: movl %esi, -8(%rsp) +; CHECK-DAG: movl %ecx, -16(%rsp) +; CHECK-DAG: movl %edi, -4(%rsp) +; CHECK-DAG: movl %edx, -12(%rsp) +; CHECK: leal +; CHECK: addl +; CHECK: addl +; CHECK: retq + +define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %a0 = alloca i32 + %a1 = alloca i32 + %a2 = alloca i32 + %a3 = alloca i32 + store i32 %b, i32* %a1 + store i32 %d, i32* %a3 + store i32 %a, i32* %a0 + store i32 %c, i32* %a2 + %l0 = load i32, i32* %a0 + %l1 = load i32, i32* %a1 + %l2 = load i32, i32* %a2 + %l3 = load i32, i32* %a3 + %add0 = add nsw i32 %l0, %l1 + %add1 = add nsw i32 %add0, %l2 + %add2 = add nsw i32 %add1, %l3 + ret i32 %add2 +} diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll index d5d3fa6db5e8..1a6fde371f09 100644 --- a/test/CodeGen/X86/atomic-minmax-i6432.ll +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -9,32 +9,32 @@ define void @atomic_maxmin_i6432() { ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %2 = atomicrmw min i64* @sc64, i64 6 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %3 = atomicrmw umax i64* @sc64, i64 7 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] %4 = atomicrmw umin i64* @sc64, i64 8 acquire ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] ; LINUX: cmpl ; LINUX: sbbl -; LINUX: cmovne -; LINUX: cmovne +; LINUX: jne +; LINUX: jne ; LINUX: lock cmpxchg8b ; LINUX: jne [[LABEL]] ret void diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll index 77bbdec826a5..c6300708bcc1 100644 --- a/test/CodeGen/X86/atomic128.ll +++ b/test/CodeGen/X86/atomic128.ll @@ -167,14 +167,24 @@ define void @fetch_and_min(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB5_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB5_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB5_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB5_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB5_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -203,14 +213,24 @@ define void @fetch_and_max(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %r8, %rcx ; CHECK-NEXT: setge %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB6_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB6_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; 
CHECK-NEXT: jne LBB6_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB6_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB6_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -239,14 +259,24 @@ define void @fetch_and_umin(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setae %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB7_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB7_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB7_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB7_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB7_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx @@ -275,14 +305,24 @@ define void @fetch_and_umax(i128* %p, i128 %bits) { ; CHECK-NEXT: sbbq %rdx, %rcx ; CHECK-NEXT: setb %cl ; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: jne LBB8_3 +; CHECK-NEXT: ## BB#2: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: cmovneq %rax, %rbx +; CHECK-NEXT: LBB8_3: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movq %rdx, %rcx +; CHECK-NEXT: jne LBB8_5 +; CHECK-NEXT: ## BB#4: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: movq %r8, %rcx -; CHECK-NEXT: cmovneq %rdx, %rcx +; CHECK-NEXT: LBB8_5: ## %atomicrmw.start +; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1 ; CHECK-NEXT: lock cmpxchg16b (%rdi) ; CHECK-NEXT: jne LBB8_1 -; CHECK-NEXT: ## BB#2: ## %atomicrmw.end +; CHECK-NEXT: ## BB#6: ## %atomicrmw.end ; CHECK-NEXT: movq %rax, {{.*}}(%rip) ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip) ; CHECK-NEXT: popq %rbx diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index a12a412fb94d..953f3bdd06e8 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -27,9 +27,9 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %1, %2 @@ -57,9 +57,9 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: 
vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
-; ZNVER1-NEXT: retq # sched: [4:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
   %1 = fadd <8 x float> %a0, %a1
   %2 = load <8 x float>, <8 x float> *%a2, align 32
   %3 = fadd <8 x float> %1, %2
@@ -87,9 +87,9 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; ZNVER1-LABEL: test_addsubpd:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
-; ZNVER1-NEXT: retq # sched: [4:1.00]
+; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
   %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
   %2 = load <4 x double>, <4 x double> *%a2, align 32
   %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2)
@@ -118,9 +118,9 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; ZNVER1-LABEL: test_addsubps:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
-; ZNVER1-NEXT: retq # sched: [4:1.00]
+; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
   %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
   %2 = load <8 x float>, <8 x float> *%a2, align 32
   %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2)
@@ -152,10 +152,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ;
 ; ZNVER1-LABEL: test_andnotpd:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; ZNVER1-NEXT: retq # sched: [4:1.00]
+; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
   %1 = bitcast <4 x double> %a0 to <4 x i64>
   %2 = bitcast <4 x double> %a1 to <4 x i64>
   %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -193,10 +193,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ;
 ; ZNVER1-LABEL: test_andnotps:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
-; ZNVER1-NEXT: retq # sched: [4:1.00]
+; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
   %1 = bitcast <8 x float> %a0 to <4 x i64>
   %2 = bitcast <8 x float> %a1 to <4 x i64>
   %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -234,10 +234,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ;
 ; ZNVER1-LABEL: test_andpd:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd
%ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -273,10 +273,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -313,9 +313,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %a1, %1 @@ -345,8 +345,8 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> * ; ZNVER1-LABEL: test_blendps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -374,9 +374,9 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; ZNVER1-LABEL: test_blendvpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) %2 = load <4 x double>, <4 x double> *%a3, align 32 %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) @@ -405,9 +405,9 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; ZNVER1-LABEL: test_blendvps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] 
-; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) %2 = load <8 x float>, <8 x float> *%a3, align 32 %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) @@ -433,8 +433,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) { ; ; ZNVER1-LABEL: test_broadcastf128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 32 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> ret <8 x float> %2 @@ -458,8 +458,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) { ; ; ZNVER1-LABEL: test_broadcastsd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double *%a0, align 8 %2 = insertelement <4 x double> undef, double %1, i32 0 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer @@ -484,8 +484,8 @@ define <4 x float> @test_broadcastss(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer @@ -510,8 +510,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) { ; ; ZNVER1-LABEL: test_broadcastss_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <8 x float> undef, float %1, i32 0 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer @@ -543,9 +543,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_cmppd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fcmp oeq <4 x double> %a0, %2 @@ -581,9 +581,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_cmpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; 
ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fcmp oeq <8 x float> %a0, %2 @@ -618,10 +618,10 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x double> @@ -655,10 +655,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 %3 = sitofp <8 x i32> %2 to <8 x float> @@ -690,10 +690,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x double> %a0 to <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptosi <4 x double> %2 to <4 x i32> @@ -725,10 +725,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_cvtpd2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc <4 x double> %a0 to <4 x float> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptrunc <4 x double> %2 to <4 x float> @@ -760,10 +760,10 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_cvtps2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vorps %ymm1, 
%ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <8 x float> %a0 to <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = fptosi <8 x float> %2 to <8 x i32> @@ -792,9 +792,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fdiv <4 x double> %1, %2 @@ -822,9 +822,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fdiv <8 x float> %1, %2 @@ -853,8 +853,8 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ZNVER1-LABEL: test_dpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) @@ -886,9 +886,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa ; ZNVER1-LABEL: test_extractf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> store <4 x float> %2, <4 x float> *%a2 @@ -916,9 +916,9 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) @@ -947,9 +947,9 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, 
%ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) @@ -978,9 +978,9 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -1009,9 +1009,9 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -1044,9 +1044,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] -; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> %3 = load <4 x float>, <4 x float> *%a2, align 16 @@ -1074,8 +1074,8 @@ define <32 x i8> @test_lddqu(i8* %a0) { ; ; ZNVER1-LABEL: test_lddqu: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ret <32 x i8> %1 } @@ -1108,7 +1108,7 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) ret <2 x 
double> %1 @@ -1143,7 +1143,7 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2 ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) ret <4 x double> %1 @@ -1178,7 +1178,7 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) ret <4 x float> %1 @@ -1213,7 +1213,7 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ret <8 x float> %1 @@ -1243,8 +1243,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_maxpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) @@ -1274,8 +1274,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_maxps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) @@ -1305,8 +1305,8 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ZNVER1-LABEL: test_minpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> 
%2) @@ -1336,8 +1336,8 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ZNVER1-LABEL: test_minps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) @@ -1369,10 +1369,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 32 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 32 @@ -1403,10 +1403,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 32 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 32 @@ -1437,10 +1437,10 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1468,9 +1468,9 @@ define i32 @test_movmskpd(<4 x double> %a0) { ; ; ZNVER1-LABEL: test_movmskpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ret i32 %1 } @@ -1496,9 +1496,9 @@ define i32 @test_movmskps(<8 x float> %a0) { ; ; ZNVER1-LABEL: test_movmskps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: 
[5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ret i32 %1 } @@ -1525,9 +1525,9 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a0 store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 ret <4 x double> %1 @@ -1554,9 +1554,9 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a0 store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 ret <8 x float> %1 @@ -1586,10 +1586,10 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1621,10 +1621,10 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1658,10 +1658,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 1 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 1 @@ -1694,10 +1694,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; ; 
ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 1 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 1 @@ -1725,9 +1725,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fmul <4 x double> %1, %2 @@ -1755,9 +1755,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_mulps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fmul <8 x float> %1, %2 @@ -1788,10 +1788,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) ; ; ZNVER1-LABEL: orpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1827,10 +1827,10 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2 ; ; ZNVER1-LABEL: test_orps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1866,10 +1866,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddpd %xmm1, 
%xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> @@ -1901,10 +1901,10 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_permilpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1936,10 +1936,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -1971,10 +1971,10 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_permilps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -2004,8 +2004,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> ; ZNVER1-LABEL: test_permilvarpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) @@ -2035,8 +2035,8 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x ; ZNVER1-LABEL: test_permilvarpd_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: 
[5:0.50] %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) @@ -2066,8 +2066,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> * ; ZNVER1-LABEL: test_permilvarps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) @@ -2097,8 +2097,8 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3 ; ZNVER1-LABEL: test_permilvarps_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) @@ -2130,10 +2130,10 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rcpps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) @@ -2166,10 +2166,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_roundpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) @@ -2202,10 +2202,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_roundps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x 
float> %a0, i32 7) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) @@ -2238,10 +2238,10 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_rsqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) @@ -2275,9 +2275,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ZNVER1-LABEL: test_shufpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2307,8 +2307,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ZNVER1-LABEL: test_shufps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] -; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2339,10 +2339,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) { ; ; ZNVER1-LABEL: test_sqrtpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00] -; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) @@ -2375,10 +2375,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; ZNVER1-LABEL: test_sqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00] -; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00] +; 
ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) @@ -2408,9 +2408,9 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_subpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fsub <4 x double> %1, %2 @@ -2438,9 +2438,9 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_subps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fsub <8 x float> %1, %2 @@ -2477,12 +2477,12 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; ; ZNVER1-LABEL: test_testpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) @@ -2523,13 +2523,13 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a ; ; ZNVER1-LABEL: test_testpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) @@ -2568,12 +2568,12 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; ; 
ZNVER1-LABEL: test_testps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) @@ -2614,13 +2614,13 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) ; ; ZNVER1-LABEL: test_testps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) @@ -2654,9 +2654,9 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpckhpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2686,8 +2686,8 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpckhps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2719,9 +2719,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2751,8 +2751,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ZNVER1-LABEL: test_unpcklps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2783,10 +2783,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; ZNVER1-LABEL: test_xorpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2822,10 +2822,10 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; ZNVER1-LABEL: test_xorps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2856,7 +2856,7 @@ define void @test_zeroall() { ; ZNVER1-LABEL: test_zeroall: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroall() ret void } @@ -2881,7 +2881,7 @@ define void @test_zeroupper() { ; ZNVER1-LABEL: test_zeroupper: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroupper() ret void } diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll index 017f54b40b2d..9918d6680256 100644 --- a/test/CodeGen/X86/avx2-arith.ll +++ b/test/CodeGen/X86/avx2-arith.ll @@ 
-386,13 +386,13 @@ define <8 x i32> @mul_const9(<8 x i32> %x) { define <4 x i32> @mul_const10(<4 x i32> %x) { ; X32-LABEL: mul_const10: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const10: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, @@ -403,13 +403,13 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { define <4 x i32> @mul_const11(<4 x i32> %x) { ; X32-LABEL: mul_const11: ; X32: # BB#0: -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const11: ; X64: # BB#0: -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 042bc217b97c..a3862d7e27c6 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -13,10 +13,10 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) { ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) %2 = load <32 x i8>, <32 x i8> *%a1, align 32 %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2) @@ -35,10 +35,10 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) { ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) %2 = load <8 x i32>, <8 x i32> *%a1, align 32 %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2) @@ -57,10 +57,10 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) { ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) %2 = load <16 x i16>, <16 x i16> *%a1, align 32 %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2) @@ -78,9 +78,9 @@ define <32 x i8> 
@test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = add <32 x i8> %1, %2 @@ -96,9 +96,9 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = add <8 x i32> %1, %2 @@ -114,9 +114,9 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = add <4 x i64> %1, %2 @@ -132,9 +132,9 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = add <16 x i16> %1, %2 @@ -151,10 +151,10 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = and <4 x i64> %1, %2 @@ -172,10 +172,10 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %2 = 
and <4 x i64> %a1, %1 %3 = load <4 x i64>, <4 x i64> *%a2, align 32 @@ -194,9 +194,9 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = mul <8 x i32> %1, %2 @@ -212,9 +212,9 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = mul <16 x i16> %1, %2 @@ -231,10 +231,10 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = or <4 x i64> %1, %2 @@ -251,9 +251,9 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) { ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = sub <32 x i8> %1, %2 @@ -269,9 +269,9 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) { ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = sub <8 x i32> %1, %2 @@ -287,9 +287,9 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i64> %a0, %a1 %2 = load <4 x 
i64>, <4 x i64> *%a2, align 32 %3 = sub <4 x i64> %1, %2 @@ -305,9 +305,9 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) { ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = sub <16 x i16> %1, %2 @@ -324,10 +324,10 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) { ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = xor <4 x i64> %1, %2 diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll index 127726ea30da..c77714b9e181 100644 --- a/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/test/CodeGen/X86/avx2-vector-shifts.ll @@ -376,7 +376,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X32: # BB#0: ; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm2 +; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vzeroupper @@ -386,7 +386,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; X64: # BB#0: ; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index 140299f5495d..e10a781fabc2 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -1507,7 +1507,7 @@ define <4 x float> @uitofp_4i1_float(<4 x i32> %a) { ; NOVL: # BB#0: ; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: retq ; diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index e1a92c60d182..6f4bf061a215 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1630,8 +1630,9 @@ define void @f1(i32 %c) { ; CHECK-LABEL: f1: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movzbl {{.*}}(%rip), %edi -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb {{.*}}(%rip), %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{.*}}(%rip) ; CHECK-NEXT: xorl $1, %edi ; CHECK-NEXT: jmp _f2 ## TAILCALL diff --git a/test/CodeGen/X86/avx512-rotate.ll 
b/test/CodeGen/X86/avx512-rotate.ll new file mode 100644 index 000000000000..98fa67ad793d --- /dev/null +++ b/test/CodeGen/X86/avx512-rotate.ll @@ -0,0 +1,256 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +; Tests showing replacement of variable rotates with immediate splat versions. + +define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $5, %zmm0, %zmm0 +; 
KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $5, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $5, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Tests showing replacement of out-of-bounds variable rotates with in-bounds immediate splat versions. 
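As a concrete illustration of the replacement these tests exercise: the amount operand of each prolv/prorv call is a constant splat (the splat literals were dropped from the calls above during extraction; the immediate checks such as "vprold $5" record their values), and the backend folds such a variable rotate into the immediate encoding. A minimal sketch under that assumption, with a hypothetical function name; the intrinsic and its signature are taken verbatim from the declares at the top of this test file, and this sketch is not part of the patch itself:

; Sketch only: a variable rotate-left whose amount operand is the constant
; splat of 5, with a zero passthru and an all-ones mask (i16 -1, i.e. all
; lanes active), is expected to lower to the single immediate form
; "vprold $5, %zmm0, %zmm0" checked in the tests above.
declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @rol_splat_sketch(<16 x i32> %x0) {
  %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

With mask -1 and a zeroinitializer passthru this mirrors the unmasked %res2 pattern used throughout these tests, so no kmov or merge is needed and the rotate collapses to one instruction.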
+ +define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprold $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprold $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprold $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_rol_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprolq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_rol_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprolq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprolq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v16i32: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprord $30, %zmm0, %zmm0 +; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v16i32: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprord $1, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprord $30, %zmm0, %zmm0 +; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 %x2) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> zeroinitializer, i16 %x2) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> , <16 x i32> %x1, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +define <8 x 
i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) { +; KNL-LABEL: test_splat_bounds_ror_v8i64: +; KNL: # BB#0: +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; KNL-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; KNL-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; KNL-NEXT: vprorq $63, %zmm0, %zmm0 +; KNL-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_splat_bounds_ror_v8i64: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vprorq $62, %zmm0, %zmm1 {%k1} +; SKX-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z} +; SKX-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; SKX-NEXT: vprorq $63, %zmm0, %zmm0 +; SKX-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %x2) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> zeroinitializer, i8 %x2) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + +; Constant folding + +define <8 x i64> @test_fold_rol_v8i64() { +; CHECK-LABEL: test_fold_rol_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_fold_ror_v8i64() { +; CHECK-LABEL: test_fold_ror_v8i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,9223372036854775808,4611686018427387904,2,9223372036854775808,4,2,2] +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> , <8 x i64> , <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll index 10883a5a9a62..ce2b010ec0f2 100644 --- a/test/CodeGen/X86/avx512-shift.ll +++ b/test/CodeGen/X86/avx512-shift.ll @@ -1,136 +1,178 @@ -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL +;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX -;CHECK-LABEL: shift_16_i32 -;CHECK: vpsrld -;CHECK: vpslld -;CHECK: vpsrad -;CHECK: ret define <16 x i32> @shift_16_i32(<16 x i32> %a) { +; CHECK-LABEL: shift_16_i32: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrld $1, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <16 x i32> %a, %c = shl <16 x i32> %b, %d = ashr <16 x i32> %c, ret <16 x i32> %d; } -;CHECK-LABEL: shift_8_i64 -;CHECK: vpsrlq -;CHECK: vpsllq -;CHECK: vpsraq -;CHECK: ret define <8 x i64> @shift_8_i64(<8 x i64> %a) { +; CHECK-LABEL: shift_8_i64: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0 +; CHECK-NEXT: vpsllq $12, %zmm0, %zmm0 +; CHECK-NEXT: vpsraq $12, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = lshr <8 x i64> %a, %c = shl <8 x i64> %b, %d = ashr <8 x i64> %c, ret <8 x i64> %d; } -;SKX-LABEL: shift_4_i64 -;SKX: vpsrlq -;SKX: vpsllq -;SKX: vpsraq -;SKX: ret define <4 x i64> @shift_4_i64(<4 x i64> 
%a) { +; KNL-LABEL: shift_4_i64: +; KNL: # BB#0: +; KNL-NEXT: vpsrlq $1, %ymm0, %ymm0 +; KNL-NEXT: vpsllq $12, %ymm0, %ymm0 +; KNL-NEXT: vpsraq $12, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: shift_4_i64: +; SKX: # BB#0: +; SKX-NEXT: vpsrlq $1, %ymm0, %ymm0 +; SKX-NEXT: vpsllq $12, %ymm0, %ymm0 +; SKX-NEXT: vpsraq $12, %ymm0, %ymm0 +; SKX-NEXT: retq %b = lshr <4 x i64> %a, %c = shl <4 x i64> %b, %d = ashr <4 x i64> %c, ret <4 x i64> %d; } -; CHECK-LABEL: variable_shl4 -; CHECK: vpsllvq %zmm -; CHECK: ret define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_shl4: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_shl5 -; CHECK: vpsllvd %zmm -; CHECK: ret define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_shl5: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = shl <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl0 -; CHECK: vpsrlvd -; CHECK: ret define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_srl0: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_srl2 -; CHECK: psrlvq -; CHECK: ret define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_srl2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = lshr <8 x i64> %x, %y ret <8 x i64> %k } -; CHECK-LABEL: variable_sra1 -; CHECK: vpsravd -; CHECK: ret define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) { +; CHECK-LABEL: variable_sra1: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <16 x i32> %x, %y ret <16 x i32> %k } -; CHECK-LABEL: variable_sra2 -; CHECK: vpsravq %zmm -; CHECK: ret define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) { +; CHECK-LABEL: variable_sra2: +; CHECK: # BB#0: +; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %k = ashr <8 x i64> %x, %y ret <8 x i64> %k } -; SKX-LABEL: variable_sra3 -; SKX: vpsravq %ymm -; SKX: ret define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) { +; KNL-LABEL: variable_sra3: +; KNL: # BB#0: +; KNL-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra3: +; SKX: # BB#0: +; SKX-NEXT: vpsravq %ymm1, %ymm0, %ymm0 +; SKX-NEXT: retq %k = ashr <4 x i64> %x, %y ret <4 x i64> %k } -; SKX-LABEL: variable_sra4 -; SKX: vpsravw %xmm -; SKX: ret define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) { +; KNL-LABEL: variable_sra4: +; KNL: # BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovsxwd %xmm0, %ymm0 +; KNL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq +; +; SKX-LABEL: variable_sra4: +; SKX: # BB#0: +; SKX-NEXT: vpsravw %xmm1, %xmm0, %xmm0 +; SKX-NEXT: retq %k = ashr <8 x i16> %x, %y ret <8 x i16> %k } -; CHECK-LABEL: variable_sra01_load -; CHECK: vpsravd (% -; CHECK: ret define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_sra01_load: +; CHECK: # BB#0: +; CHECK-NEXT: 
vpsravd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = ashr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK-LABEL: variable_shl1_load -; CHECK: vpsllvd (% -; CHECK: ret define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_shl1_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsllvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = shl <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl0_load -; CHECK: vpsrlvd (% -; CHECK: ret + define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) { +; CHECK-LABEL: variable_srl0_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <16 x i32>, <16 x i32>* %y %k = lshr <16 x i32> %x, %y1 ret <16 x i32> %k } -; CHECK: variable_srl3_load -; CHECK: vpsrlvq (% -; CHECK: ret define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) { +; CHECK-LABEL: variable_srl3_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %y1 = load <8 x i64>, <8 x i64>* %y %k = lshr <8 x i64> %x, %y1 ret <8 x i64> %k diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll new file mode 100644 index 000000000000..75be2d9c0f01 --- /dev/null +++ b/test/CodeGen/X86/bmi-schedule.ll @@ -0,0 +1,529 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) { +; GENERIC-LABEL: test_andn_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %eax +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: andw (%rdx), %di +; GENERIC-NEXT: addl %edi, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: notl %edi # sched: [1:0.25] +; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50] +; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: notl %edi # sched: [1:0.50] +; BTVER2-NEXT: andw (%rdx), %di # sched: [4:1.00] +; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: notl %edi # sched: [1:0.25] +; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50] +; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; 
ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a2 + %2 = xor i16 %a0, -1 + %3 = and i16 %2, %a1 + %4 = and i16 %2, %1 + %5 = add i16 %3, %4 + ret i16 %5 +} + +define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_andn_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnl %esi, %edi, %ecx +; GENERIC-NEXT: andnl (%rdx), %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnl (%rdx), %edi, %eax # sched: [4:1.00] +; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50] +; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = xor i32 %a0, -1 + %3 = and i32 %2, %a1 + %4 = and i32 %2, %1 + %5 = add i32 %3, %4 + ret i32 %5 +} + +define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_andn_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnq %rsi, %rdi, %rcx +; GENERIC-NEXT: andnq (%rdx), %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_andn_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andn_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:1.00] +; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50] +; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = xor i64 %a0, -1 + %3 = and i64 %2, %a1 + %4 = and i64 %2, %1 + %5 = add i64 %3, %4 + ret i64 %5 +} + +define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bextr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx +; GENERIC-NEXT: bextrl %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50] +; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq 
# sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bextr.32(i32, i32) + +define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bextr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bextr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50] +; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_bextr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bextr.64(i64, i64) + +define i32 @test_blsi_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsil (%rsi), %ecx +; GENERIC-NEXT: blsil %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 0, %1 + %3 = sub i32 0, %a0 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsi_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsiq (%rsi), %rcx +; GENERIC-NEXT: blsiq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsi_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsiq (%rsi), %rcx # 
sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 0, %1 + %3 = sub i64 0, %a0 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsmsk_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskl (%rsi), %ecx +; GENERIC-NEXT: blsmskl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = xor i32 %1, %2 + %5 = xor i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsmsk_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsmskq (%rsi), %rcx +; GENERIC-NEXT: blsmskq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsmsk_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsmsk_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = xor i64 %1, %2 + %5 = xor i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i32 @test_blsr_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_blsr_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrl (%rsi), %ecx +; GENERIC-NEXT: blsrl %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] 
+; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = sub i32 %1, 1 + %3 = sub i32 %a0, 1 + %4 = and i32 %1, %2 + %5 = and i32 %a0, %3 + %6 = add i32 %4, %5 + ret i32 %6 +} + +define i64 @test_blsr_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_blsr_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: blsrq (%rsi), %rcx +; GENERIC-NEXT: blsrq %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_blsr_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_blsr_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = sub i64 %1, 1 + %3 = sub i64 %a0, 1 + %4 = and i64 %1, %2 + %5 = and i64 %a0, %3 + %6 = add i64 %4, %5 + ret i64 %6 +} + +define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_cttz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntw (%rsi), %cx +; GENERIC-NEXT: tzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.cttz.i16(i16, i1) + +define i32 @test_cttz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cttz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntl (%rsi), %ecx +; GENERIC-NEXT: tzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: 
retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.cttz.i32(i32, i1) + +define i64 @test_cttz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cttz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: tzcntq (%rsi), %rcx +; GENERIC-NEXT: tzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_cttz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cttz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.cttz.i64(i64, i1) diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll new file mode 100644 index 000000000000..9666dd85d853 --- /dev/null +++ b/test/CodeGen/X86/bmi2-schedule.ll @@ -0,0 +1,180 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_bzhi_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx +; GENERIC-NEXT: bzhil %edi, %esi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50] +; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 
@llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0) + %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) + +define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_bzhi_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_bzhi_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50] +; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_bzhi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0) + %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) + +define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pdep_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx +; GENERIC-NEXT: pdepl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pdep.32(i32, i32) + +define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pdep_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pdepq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pdep_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pdep_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pdep.64(i64, i64) + +define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_pext_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextl (%rdx), %edi, %ecx +; GENERIC-NEXT: pextl %esi, %edi, %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i32: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a2 + %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1) + %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) + %4 = add i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.x86.bmi.pext.32(i32, i32) + +define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_pext_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx +; GENERIC-NEXT: pextq %rsi, %rdi, %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_pext_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; ZNVER1-LABEL: test_pext_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a2 + %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1) + %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) + %4 = add i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.x86.bmi.pext.64(i64, i64) diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll index e292ccd0be11..7c1042878d59 100644 --- a/test/CodeGen/X86/bool-ext-inc.ll +++ b/test/CodeGen/X86/bool-ext-inc.ll @@ -19,7 +19,7 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> @@ -31,7 +31,7 @@ define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y @@ -56,7 +56,7 @@ define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # BB#0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y @@ -91,7 +91,7 @@ define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b diff --git a/test/CodeGen/X86/bswap-rotate.ll 
b/test/CodeGen/X86/bswap-rotate.ll new file mode 100644 index 000000000000..f686febe5645 --- /dev/null +++ b/test/CodeGen/X86/bswap-rotate.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 + +; Combine BSWAP (lowered to rolw 8) with a second rotate. +; This test checks for combining rotates with inconsistent constant value types. + +define i16 @combine_bswap_rotate(i16 %a0) { +; X86-LABEL: combine_bswap_rotate: +; X86: # BB#0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: retl +; +; X64-LABEL: combine_bswap_rotate: +; X64: # BB#0: +; X64-NEXT: rolw $9, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq + %1 = call i16 @llvm.bswap.i16(i16 %a0) + %2 = shl i16 %1, 1 + %3 = lshr i16 %1, 15 + %4 = or i16 %2, %3 + ret i16 %4 +} + +declare i16 @llvm.bswap.i16(i16) diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll index 02f1a1616db2..b69b18531601 100644 --- a/test/CodeGen/X86/clobber-fi0.ll +++ b/test/CodeGen/X86/clobber-fi0.ll @@ -15,22 +15,22 @@ bb: %tmp = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp2 = alloca i32, align 4 ; [#uses=3 type=i32*] %tmp3 = alloca i32 ; [#uses=1 type=i32*] - store i32 1, i32* %tmp, align 4 - store i32 1, i32* %tmp2, align 4 + store volatile i32 1, i32* %tmp, align 4 + store volatile i32 1, i32* %tmp2, align 4 br label %bb4 bb4: ; preds = %bb4, %bb - %tmp6 = load i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] + %tmp6 = load volatile i32, i32* %tmp2, align 4 ; [#uses=1 type=i32] %tmp7 = add i32 %tmp6, -1 ; [#uses=2 type=i32] - store i32 %tmp7, i32* %tmp2, align 4 + store volatile i32 %tmp7, i32* %tmp2, align 4 %tmp8 = icmp eq i32 %tmp7, 0 ; [#uses=1 type=i1] - %tmp9 = load i32, i32* %tmp ; [#uses=1 type=i32] + %tmp9 = load volatile i32, i32* %tmp ; [#uses=1 type=i32] %tmp10 = add i32 %tmp9, -1 ; [#uses=1 type=i32] - store i32 %tmp10, i32* %tmp3 + store volatile i32 %tmp10, i32* %tmp3 br i1 %tmp8, label %bb11, label %bb4 bb11: ; preds = %bb4 - %tmp12 = load i32, i32* %tmp, align 4 ; [#uses=1 type=i32] + %tmp12 = load volatile i32, i32* %tmp, align 4 ; [#uses=1 type=i32] ret i32 %tmp12 } diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll index 713ee5d0f65a..0d74c937af33 100644 --- a/test/CodeGen/X86/combine-rotates.ll +++ b/test/CodeGen/X86/combine-rotates.ll @@ -6,22 +6,12 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { ; XOP-LABEL: combine_vec_rot_rot: ; XOP: # BB#0: -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_rot: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -40,12 +30,7 @@ define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) { ; ; AVX512-LABEL: 
combine_vec_rot_rot_splat: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $29, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $10, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vprold $7, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, @@ -63,12 +48,6 @@ define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) { ; ; AVX512-LABEL: combine_vec_rot_rot_splat_zero: ; AVX512: # BB#0: -; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3dbff2680c22..a6491a0a8694 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -392,7 +392,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -437,7 +437,7 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -481,7 +481,7 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, @@ -515,7 +515,7 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) { ; AVX-LABEL: combine_vec_shl_add0: ; AVX: # BB#0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %x, @@ -550,7 +550,7 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_or0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -585,7 +585,7 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_mul0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 21564cdd7353..473fae19f4fd 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -91,7 +91,7 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_known_zero1: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, 
%xmm0 ; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq @@ -326,7 +326,7 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, @@ -376,10 +376,10 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: ; AVX: # BB#0: -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll index e1e849929405..b6ae2fa6d157 100644 --- a/test/CodeGen/X86/combine-udiv.ll +++ b/test/CodeGen/X86/combine-udiv.ll @@ -166,7 +166,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll index 91da268a8d75..4c7716bbaebe 100644 --- a/test/CodeGen/X86/combine-urem.ll +++ b/test/CodeGen/X86/combine-urem.ll @@ -43,7 +43,7 @@ define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] ; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = urem <4 x i32> %x, @@ -87,7 +87,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2c: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -146,7 +146,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_pow2d: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 @@ -183,7 +183,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) { ; ; AVX2-LABEL: combine_vec_urem_by_shl_pow2a: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4] ; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll new file mode 100644 index 000000000000..15ae4a49d7d3 --- /dev/null +++ b/test/CodeGen/X86/f16c-schedule.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s 
--check-prefix=CHECK --check-prefix=IVY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) + %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) + %4 = fadd <4 x float> %2, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) + +define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) { +; IVY-LABEL: test_vcvtph2ps_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; IVY-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; IVY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtph2ps_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtph2ps_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtph2ps_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load <8 x i16>, <8 x i16> *%a1 + %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) + %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) + %4 = fadd <8 x float> %2, %3 + ret <8 x float> %4 +} +declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) + +define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> *%a2) { +; 
IVY-LABEL: test_vcvtps2ph_128: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_128: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_128: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_128: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0) + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> + store <4 x i16> %3, <4 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) + +define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> *%a2) { +; IVY-LABEL: test_vcvtps2ph_256: +; IVY: # BB#0: +; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] +; IVY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; IVY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_vcvtps2ph_256: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_vcvtps2ph_256: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_vcvtps2ph_256: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) + %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0) + store <8 x i16> %2, <8 x i16> *%a2 + ret <8 x i16> %1 +} +declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index 3d5c12c03484..c87353ed1f5a 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -316,7 +316,7 @@ define void @allocamaterialize() { ; STDERR-NOT: FastISel missed terminator: ret void ; CHECK-LABEL: win64ccfun -define x86_64_win64cc void @win64ccfun(i32 %i) { +define win64cc void @win64ccfun(i32 %i) { ; CHECK: ret ret void } diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll index fbc4cd9d4f9c..86469dad23f2 100644 --- a/test/CodeGen/X86/hipe-cc.ll +++ b/test/CodeGen/X86/hipe-cc.ll @@ -48,11 +48,7 @@ entry: store i32 %arg0, i32* %arg0_var store i32 %arg1, i32* %arg1_var store i32 %arg2, i32* %arg2_var - - ; CHECK: movl 16(%esp), %esi - ; CHECK-NEXT: movl 12(%esp), %ebp - ; CHECK-NEXT: movl 8(%esp), %eax - ; CHECK-NEXT: movl 4(%esp), %edx + ; These loads are loading the values from their previous stores and are optimized 
away. %0 = load i32, i32* %hp_var %1 = load i32, i32* %p_var %2 = load i32, i32* %arg0_var diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll index 43e2e1409fde..efe07cf6301e 100644 --- a/test/CodeGen/X86/hipe-cc64.ll +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -57,11 +57,7 @@ entry: store i64 %arg2, i64* %arg2_var store i64 %arg3, i64* %arg3_var - ; CHECK: movq 40(%rsp), %r15 - ; CHECK-NEXT: movq 32(%rsp), %rbp - ; CHECK-NEXT: movq 24(%rsp), %rsi - ; CHECK-NEXT: movq 16(%rsp), %rdx - ; CHECK-NEXT: movq 8(%rsp), %rcx + ; These loads read values just written from the corresponding registers and are therefore no-ops. %0 = load i64, i64* %hp_var %1 = load i64, i64* %p_var %2 = load i64, i64* %arg0_var diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll new file mode 100644 index 000000000000..e42ce30c5a6d --- /dev/null +++ b/test/CodeGen/X86/lea32-schedule.ll @@ -0,0 +1,653 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i32 @test_lea_offset(i32) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -24(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, -24 + ret i32 %2 +} + +define i32 @test_lea_offset_big(i32) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 1024(%rdi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i32 %0, 1024 + ret i32 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @test_lea_add(i32, i32) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i32 %1, %0 + ret i32 %3 +} + +define i32 
@test_lea_add_offset(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 16(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $16, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, 16 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_add_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -4096(%rdi,%rsi), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-4096, %eax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] 
+; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i32 %0, -4096 + %4 = add i32 %3, %1 + ret i32 %4 +} + +define i32 @test_lea_mul(i32) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + ret i32 %2 +} + +define i32 @test_lea_mul_offset(i32) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -32(%rdi,%rdi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-32, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 3 + %3 = add nsw i32 %2, -32 + ret i32 %3 +} + +define i32 @test_lea_mul_offset_big(i32) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: 
# BB#0: +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 10000(%rdi,%rdi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $10000, %eax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i32 %0, 9 + %3 = add nsw i32 %2, 10000 + ret i32 %3 +} + +define i32 @test_lea_add_scale(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal (%rdi,%rsi,2), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 1 + %4 = add nsw i32 %3, %0 + ret i32 %4 +} + +define i32 @test_lea_add_scale_offset(i32, i32) { +; GENERIC-LABEL: 
test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal 96(%rdi,%rsi,4), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $96, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 2 + %4 = add i32 %0, 96 + %5 = add i32 %4, %3 + ret i32 %5 +} + +define i32 @test_lea_add_scale_offset_big(i32, i32) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: # kill: %ESI %ESI %RSI +; GENERIC-NEXT: # kill: %EDI %EDI %RDI +; GENERIC-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: # kill: %ESI %ESI %RSI +; ATOM-NEXT: # kill: %EDI %EDI %RDI +; ATOM-NEXT: leal -1200(%rdi,%rsi,8), %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: # kill: %ESI %ESI %RSI +; SLM-NEXT: # kill: %EDI %EDI %RDI +; SLM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: # kill: %ESI %ESI %RSI +; SANDY-NEXT: # kill: %EDI %EDI %RDI +; SANDY-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; SANDY-NEXT: addl $-1200, %eax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: # kill: %ESI %ESI %RSI +; HASWELL-NEXT: # kill: %EDI %EDI %RDI +; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] +; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: # kill: %ESI %ESI %RSI +; BTVER2-NEXT: # 
kill: %EDI %EDI %RDI +; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i32 %1, 3 + %4 = add i32 %0, -1200 + %5 = add i32 %4, %3 + ret i32 %5 +} diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll new file mode 100644 index 000000000000..0ff1574c809d --- /dev/null +++ b/test/CodeGen/X86/lea64-schedule.ll @@ -0,0 +1,534 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i64 @test_lea_offset(i64) { +; GENERIC-LABEL: test_lea_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -24(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, -24 + ret i64 %2 +} + +define i64 @test_lea_offset_big(i64) { +; GENERIC-LABEL: test_lea_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_offset_big: +; ATOM: # BB#0: +; 
ATOM-NEXT: leaq 1024(%rdi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = add nsw i64 %0, 1024 + ret i64 %2 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define i64 @test_lea_add(i64, i64) { +; GENERIC-LABEL: test_lea_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add nsw i64 %1, %0 + ret i64 %3 +} + +define i64 @test_lea_add_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 16(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $16, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $16, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, 16 + %4 = add i64 %3, %1 + ret i64 
%4 +} + +define i64 @test_lea_add_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -4096(%rdi,%rsi), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-4096, %rax # imm = 0xF000 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = add i64 %0, -4096 + %4 = add i64 %3, %1 + ret i64 %4 +} + +define i64 @test_lea_mul(i64) { +; GENERIC-LABEL: test_lea_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + ret i64 %2 +} + +define i64 @test_lea_mul_offset(i64) { +; GENERIC-LABEL: test_lea_mul_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -32(%rdi,%rdi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-32, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax 
# sched: [1:0.50] +; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 3 + %3 = add nsw i64 %2, -32 + ret i64 %3 +} + +define i64 @test_lea_mul_offset_big(i64) { +; GENERIC-LABEL: test_lea_mul_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_mul_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 10000(%rdi,%rdi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_mul_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_mul_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $10000, %rax # imm = 0x2710 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_mul_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_mul_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %2 = mul nsw i64 %0, 9 + %3 = add nsw i64 %2, 10000 + ret i64 %3 +} + +define i64 @test_lea_add_scale(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale: +; ATOM: # BB#0: +; ATOM-NEXT: leaq (%rdi,%rsi,2), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale: +; SLM: # BB#0: +; SLM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 1 + %4 = add nsw i64 %3, %0 + ret i64 %4 +} + +define i64 @test_lea_add_scale_offset(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset: +; ATOM: # BB#0: +; ATOM-NEXT: leaq 
96(%rdi,%rsi,4), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset: +; SLM: # BB#0: +; SLM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $96, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 2 + %4 = add i64 %0, 96 + %5 = add i64 %4, %3 + ret i64 %5 +} + +define i64 @test_lea_add_scale_offset_big(i64, i64) { +; GENERIC-LABEL: test_lea_add_scale_offset_big: +; GENERIC: # BB#0: +; GENERIC-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_lea_add_scale_offset_big: +; ATOM: # BB#0: +; ATOM-NEXT: leaq -1200(%rdi,%rsi,8), %rax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lea_add_scale_offset_big: +; SLM: # BB#0: +; SLM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lea_add_scale_offset_big: +; SANDY: # BB#0: +; SANDY-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; SANDY-NEXT: addq $-1200, %rax # imm = 0xFB50 +; SANDY-NEXT: # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_lea_add_scale_offset_big: +; HASWELL: # BB#0: +; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] +; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50 +; HASWELL-NEXT: # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lea_add_scale_offset_big: +; BTVER2: # BB#0: +; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %3 = shl i64 %1, 3 + %4 = add i64 %0, -1200 + %5 = add i64 %4, %3 + ret i64 %5 +} diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index b3f2116e6486..3ad6cad32d83 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -148,8 +148,7 @@ define i32 @test6() { ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $16, %esp ; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl $1, (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: shldl $32, %eax, %ecx @@ -175,9 +174,8 @@ define i32 @test6() { ; CHECK-NEXT: retl %x = alloca i32, align 4 %t = alloca i64, align 8 - store i32 1, i32* %x, align 4 - store i64 1, i64* %t, align 8 ;; DEAD - %load = load i32, i32* %x, align 4 + store volatile i32 1, i32* %x, align 4 + %load = load 
volatile i32, i32* %x, align 4 %shl = shl i32 %load, 8 %add = add i32 %shl, -224 %sh_prom = zext i32 %add to i64 diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll new file mode 100644 index 000000000000..cd0dcbbd6afb --- /dev/null +++ b/test/CodeGen/X86/lzcnt-schedule.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctlz_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntw (%rsi), %cx +; GENERIC-NEXT: lzcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntw (%rsi), %cx +; HASWELL-NEXT: lzcntw %di, %ax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntw (%rsi), %cx +; BTVER2-NEXT: lzcntw %di, %ax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntw (%rsi), %cx +; ZNVER1-NEXT: lzcntw %di, %ax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false ) + %3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctlz.i16(i16, i1) + +define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctlz_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntl (%rsi), %ecx +; GENERIC-NEXT: lzcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntl (%rsi), %ecx +; HASWELL-NEXT: lzcntl %edi, %eax +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntl (%rsi), %ecx +; BTVER2-NEXT: lzcntl %edi, %eax +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntl (%rsi), %ecx +; ZNVER1-NEXT: lzcntl %edi, %eax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false ) + %3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctlz.i32(i32, i1) + +define i64 
@test_ctlz_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctlz_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: lzcntq (%rsi), %rcx +; GENERIC-NEXT: lzcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; HASWELL-LABEL: test_ctlz_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: lzcntq (%rsi), %rcx +; HASWELL-NEXT: lzcntq %rdi, %rax +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctlz_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: lzcntq (%rsi), %rcx +; BTVER2-NEXT: lzcntq %rdi, %rax +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntq (%rsi), %rcx +; ZNVER1-NEXT: lzcntq %rdi, %rax +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false ) + %3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctlz.i64(i64, i1) diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll index 26a194764086..02d0964e37eb 100644 --- a/test/CodeGen/X86/machine-outliner-debuginfo.ll +++ b/test/CodeGen/X86/machine-outliner-debuginfo.ll @@ -17,6 +17,7 @@ define i32 @main() #0 !dbg !11 { call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !15, metadata !16), !dbg !17 store i32 4, i32* %5, align 4 store i32 0, i32* @x, align 4, !dbg !24 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; This is the same sequence of instructions without a debug value. It should be outlined ; in the same way. ; CHECK: callq l_OUTLINED_FUNCTION_0 diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll index 9f8e6ec298f4..b4a277ec2d82 100644 --- a/test/CodeGen/X86/machine-outliner.ll +++ b/test/CodeGen/X86/machine-outliner.ll @@ -85,6 +85,7 @@ define i32 @main() #0 { store i32 3, i32* %4, align 4 store i32 4, i32* %5, align 4 store i32 1, i32* @x, align 4 + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() ; CHECK: callq [[OFUNC2]] store i32 1, i32* %2, align 4 store i32 2, i32* %3, align 4 diff --git a/test/CodeGen/X86/memcmp-minsize.ll b/test/CodeGen/X86/memcmp-minsize.ll new file mode 100644 index 000000000000..a7f42644ca2d --- /dev/null +++ b/test/CodeGen/X86/memcmp-minsize.ll @@ -0,0 +1,721 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $2, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $2 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $3, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $3 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl 
%eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $4, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: pushq $4 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $5, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $5 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: pushq $8 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $8, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $12, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: +; X64-NEXT: pushq $12 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind minsize { +; 
X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: +; X64-NEXT: pushq $16 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 +; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl %eax, (%esp) +; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: length16_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length16_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX2-NEXT: vpcmpeqb 
{{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax +; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: pushq $32 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $32, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: pushq $32 +; X64-SSE2-NEXT: popq %rdx +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length64: +; 
X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: andl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $64, {{[0-9]+}}(%esp) +; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: pushq $64 +; X64-NEXT: popq %rdx +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll new file mode 100644 index 000000000000..450205a966d2 --- /dev/null +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -0,0 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare i32 @memcmp(i8*, i8*, i64) + +define i32 @length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl 
(%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length2: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpw %cx, %ax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length2_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_const: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length2_eq_nobuiltin_attr: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $2, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB4_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length3: +; X64: # BB#0: # 
%loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length3_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: incl %edi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %edi, %eax +; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: length4: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4_eq: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq: +; X64: # BB#0: +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length4_eq_const: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) 
# imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length4_eq_const: +; X64: # BB#0: +; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: .LBB9_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length5: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length5_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8: +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 
+; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: incl %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: length8: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length8_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: incl %eax +; X86-NEXT: .LBB13_3: # %endblock +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length8_eq_const: +; X64: # BB#0: +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length12_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; 
X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length12: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length16: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length16: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # BB#0: +; X86-NOSSE-NEXT: pushl $0 +; 
X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $16, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # BB#0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length32: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length32: +; X64: # BB#0: +; X64-NEXT: movl $32, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length32_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: sete %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq: +; X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: sete %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length32_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $32 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-SSE2-LABEL: length32_eq_const: +; X64-SSE2: # BB#0: +; X64-SSE2-NEXT: pushq %rax +; X64-SSE2-NEXT: movl $.L.str, %esi +; X64-SSE2-NEXT: movl $32, %edx +; X64-SSE2-NEXT: callq memcmp +; X64-SSE2-NEXT: testl %eax, %eax +; X64-SSE2-NEXT: setne %al +; X64-SSE2-NEXT: popq %rcx +; X64-SSE2-NEXT: retq +; +; X64-AVX2-LABEL: length32_eq_const: +; 
X64-AVX2: # BB#0: +; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax +; X64-AVX2-NEXT: cmpl $-1, %eax +; X64-AVX2-NEXT: setne %al +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length64: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: retl +; +; X64-LABEL: length64: +; X64: # BB#0: +; X64-NEXT: movl $64, %edx +; X64-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length64_eq: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length64_eq_const: +; X86: # BB#0: +; X86-NEXT: pushl $0 +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $16, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: length64_eq_const: +; X64: # BB#0: +; X64-NEXT: pushq %rax +; X64-NEXT: movl $.L.str, %esi +; X64-NEXT: movl $64, %edx +; X64-NEXT: callq memcmp +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: popq %rcx +; X64-NEXT: retq + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 0e09abf73c8c..2e6782765462 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=AVX2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2 ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -12,43 +12,21 @@ declare i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length2: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movzwl (%ecx), %ecx -; X86-NOSSE-NEXT: movzwl (%eax), %eax -; X86-NOSSE-NEXT: rolw $8, %cx -; X86-NOSSE-NEXT: rolw $8, %ax -; X86-NOSSE-NEXT: cmpw %ax, %cx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB0_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB0_3 -; X86-NOSSE-NEXT: .LBB0_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB0_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB0_4 -; X86-NOSSE-NEXT: .LBB0_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length2: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movzwl (%ecx), %ecx -; X86-SSE2-NEXT: movzwl (%eax), %eax -; X86-SSE2-NEXT: rolw $8, %cx -; X86-SSE2-NEXT: rolw $8, %ax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpw %ax, %cx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length2: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length2: ; X64: # BB#0: @@ -137,44 +115,90 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: movzwl %dx, %edx +; X86-NEXT: movzwl %si, %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length3: -; X64: # BB#0: -; X64-NEXT: movl $3, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB4_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll 
memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: cmpw (%ecx), %dx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 2(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: je .LBB5_3 +; X86-NEXT: .LBB5_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB5_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $3, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: cmpw (%rsi), %ax +; X64-NEXT: jne .LBB5_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 2(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 2(%rsi), %cl +; X64-NEXT: je .LBB5_3 +; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind %c = icmp ne i32 %m, 0 @@ -182,43 +206,21 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-NOSSE-LABEL: length4: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %ecx -; X86-NOSSE-NEXT: movl (%eax), %eax -; X86-NOSSE-NEXT: bswapl %ecx -; X86-NOSSE-NEXT: bswapl %eax -; X86-NOSSE-NEXT: cmpl %eax, %ecx -; X86-NOSSE-NEXT: movl $-1, %eax -; X86-NOSSE-NEXT: jae .LBB6_1 -; X86-NOSSE-NEXT: # BB#2: -; X86-NOSSE-NEXT: je .LBB6_3 -; X86-NOSSE-NEXT: .LBB6_4: -; X86-NOSSE-NEXT: retl -; X86-NOSSE-NEXT: .LBB6_1: -; X86-NOSSE-NEXT: movl $1, %eax -; X86-NOSSE-NEXT: jne .LBB6_4 -; X86-NOSSE-NEXT: .LBB6_3: -; X86-NOSSE-NEXT: xorl %eax, %eax -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length4: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %ecx -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: bswapl %ecx -; X86-SSE2-NEXT: bswapl %eax -; X86-SSE2-NEXT: xorl %edx, %edx -; X86-SSE2-NEXT: cmpl %eax, %ecx -; X86-SSE2-NEXT: movl $-1, %ecx -; X86-SSE2-NEXT: movl $1, %eax -; X86-SSE2-NEXT: cmovbl %ecx, %eax -; X86-SSE2-NEXT: cmovel %edx, %eax -; X86-SSE2-NEXT: retl +; X86-LABEL: length4: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: retl ; ; X64-LABEL: length4: ; X64: # BB#0: @@ -278,44 +280,86 @@ define i1 @length4_eq_const(i8* %X) nounwind { define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; 
X86-NEXT: jne .LBB9_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %res_block +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length5: -; X64: # BB#0: -; X64-NEXT: movl $5, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB9_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB10_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movb 4(%eax), %dl +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: je .LBB10_3 +; X86-NEXT: .LBB10_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB10_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $5, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB10_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movb 4(%rdi), %cl +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpb 4(%rsi), %cl +; X64-NEXT: je .LBB10_3 +; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp ne i32 %m, 0 @@ -324,13 +368,33 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_1 +; X86-NEXT: # BB#3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; 
X64-LABEL: length8: @@ -352,13 +416,20 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: cmpl (%ecx), %edx +; X86-NEXT: jne .LBB12_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: je .LBB12_3 +; X86-NEXT: .LBB12_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: sete %al ; X86-NEXT: retl @@ -376,13 +447,17 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 +; X86-NEXT: jne .LBB13_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -400,25 +475,43 @@ define i1 @length8_eq_const(i8* %X) nounwind { define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12_eq: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB14_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: je .LBB14_4 +; X86-NEXT: .LBB14_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB14_4: # %endblock +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: -; X64-NEXT: pushq %rax -; X64-NEXT: movl $12, %edx -; X64-NEXT: callq memcmp +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB14_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl 8(%rsi), %ecx +; X64-NEXT: je .LBB14_3 +; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax ; X64-NEXT: setne %al -; X64-NEXT: popq %rcx ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind %c = icmp ne i32 %m, 0 @@ -427,19 +520,66 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB15_1 +; X86-NEXT: # BB#4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: -; X64-NEXT: movl $12, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %ecx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: bswapl %ecx +; X64-NEXT: bswapl %edx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB15_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m } @@ -448,111 +588,165 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: -; X86: # BB#0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp +; X86: # BB#0: # %loadbb +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB16_1 +; X86-NEXT: # BB#5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB16_1: # %res_block +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: -; X64-NEXT: movl $16, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; 
X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB16_1 +; X64-NEXT: # BB#3: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # BB#0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl +; X86-LABEL: length16_eq: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: cmpl 4(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: cmpl 8(%eax), %edx +; X86-NEXT: jne .LBB17_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: movl 12(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 12(%eax), %edx +; X86-NEXT: je .LBB17_5 +; X86-NEXT: .LBB17_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB17_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl ; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 -; X64-SSE2-NEXT: movdqu (%rdi), %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB17_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 8(%rsi), %rcx +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB17_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # BB#0: 
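
The length16 hunks around this point show the shape of the memcmp expansion these regenerated checks capture: a call with a constant size is lowered inline as a chain of wide loads, either byte-swapped and compared (for the ordered result, since memcmp compares in big-endian byte order) or compared directly (for equality-only uses, where the bswaps are dropped). As a minimal sketch of the kind of input that exercises the equality form, in the same typed-pointer IR these tests use (the function name is illustrative, not from the patch):

declare i32 @memcmp(i8*, i8*, i64)

; With a constant length of 8, llc now emits load-and-compare
; sequences instead of a libcall, as in the length8_eq checks above.
define i1 @is_eq8(i8* %a, i8* %b) nounwind {
  %r = tail call i32 @memcmp(i8* %a, i8* %b, i64 8) nounwind
  %c = icmp eq i32 %r, 0
  ret i1 %c
}
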
-; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl +; X86-LABEL: length16_eq_const: +; X86: # BB#0: # %loadbb +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $858927408, (%eax) # imm = 0x33323130 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: cmpl $926299444, 4(%eax) # imm = 0x37363534 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#3: # %loadbb2 +; X86-NEXT: cmpl $825243960, 8(%eax) # imm = 0x31303938 +; X86-NEXT: jne .LBB18_1 +; X86-NEXT: # BB#4: # %loadbb3 +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl $892613426, 12(%eax) # imm = 0x35343332 +; X86-NEXT: je .LBB18_5 +; X86-NEXT: .LBB18_1: # %res_block +; X86-NEXT: movl $1, %ecx +; X86-NEXT: .LBB18_5: # %endblock +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl ; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # BB#0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: length16_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 -; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length16_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax -; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: retq +; X64-LABEL: length16_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB18_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rcx, 8(%rdi) +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB18_3: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind %c = icmp eq i32 %m, 0 ret i1 %c @@ -570,9 +764,43 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length32: -; X64: # BB#0: -; X64-NEXT: movl $32, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rcx +; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rcx +; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: bswapq %rcx +; 
X64-NEXT: bswapq %rdx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: jne .LBB19_1 +; X64-NEXT: # BB#5: # %endblock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %res_block +; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m } @@ -592,25 +820,30 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: sete %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq: +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: cmpq (%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: cmpq 8(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: cmpq 16(%rsi), %rax +; X64-NEXT: jne .LBB20_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: movq 24(%rdi), %rcx +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpq 24(%rsi), %rcx +; X64-NEXT: je .LBB20_5 +; X64-NEXT: .LBB20_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB20_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp eq i32 %call, 0 ret i1 %cmp @@ -629,26 +862,30 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-NEXT: setne %al ; X86-NEXT: retl ; -; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: -; X64-SSE2-NEXT: pushq %rax -; X64-SSE2-NEXT: movl $.L.str, %esi -; X64-SSE2-NEXT: movl $32, %edx -; X64-SSE2-NEXT: callq memcmp -; X64-SSE2-NEXT: testl %eax, %eax -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: popq %rcx -; X64-SSE2-NEXT: retq -; -; X64-AVX2-LABEL: length32_eq_const: -; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax -; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: setne %al -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-LABEL: length32_eq_const: +; X64: # BB#0: # %loadbb +; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 +; X64-NEXT: cmpq %rax, (%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: movabsq $3833745473465760056, %rax # imm = 0x3534333231303938 +; X64-NEXT: cmpq %rax, 8(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#3: # %loadbb2 +; X64-NEXT: movabsq $3689065127958034230, %rax # imm = 0x3332313039383736 +; X64-NEXT: cmpq %rax, 16(%rdi) +; X64-NEXT: jne .LBB21_1 +; X64-NEXT: # BB#4: # %loadbb3 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: movabsq $3544395820347831604, %rcx # imm = 0x3130393837363534 +; X64-NEXT: cmpq %rcx, 24(%rdi) +; X64-NEXT: je .LBB21_5 +; X64-NEXT: .LBB21_1: # %res_block +; X64-NEXT: movl $1, %eax +; X64-NEXT: .LBB21_5: # %endblock +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x 
i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind %c = icmp ne i32 %m, 0 ret i1 %c diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 50a661fcca11..76d750855cd4 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -105,7 +105,7 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v4i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] ; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -523,7 +523,7 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind { ; ; AVX-LABEL: mul_v8i32c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] ; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq entry: @@ -551,7 +551,7 @@ define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind { ; ; AVX-LABEL: mul_v4i64c: ; AVX: # BB#0: # %entry -; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117] ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll new file mode 100644 index 000000000000..c0d11280fc1d --- /dev/null +++ b/test/CodeGen/X86/popcnt-schedule.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+popcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) { +; GENERIC-LABEL: test_ctpop_i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntw (%rsi), %cx +; GENERIC-NEXT: popcntw %di, %ax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i16: +; SLM: # BB#0: +; SLM-NEXT: popcntw (%rsi), %cx # sched: [6:1.00] +; SLM-NEXT: popcntw %di, %ax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX %AX %EAX +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i16: +; SANDY: # BB#0: +; SANDY-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; SANDY-NEXT: popcntw %di, %ax # 
sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX %AX %EAX +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i16: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX %AX %EAX +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i16: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntw (%rsi), %cx # sched: [8:1.00] +; BTVER2-NEXT: popcntw %di, %ax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX %AX %EAX +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00] +; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i16, i16 *%a1 + %2 = tail call i16 @llvm.ctpop.i16( i16 %1 ) + %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 ) + %4 = or i16 %2, %3 + ret i16 %4 +} +declare i16 @llvm.ctpop.i16(i16) + +define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_ctpop_i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntl (%rsi), %ecx +; GENERIC-NEXT: popcntl %edi, %eax +; GENERIC-NEXT: orl %ecx, %eax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i32: +; SLM: # BB#0: +; SLM-NEXT: popcntl (%rsi), %ecx # sched: [6:1.00] +; SLM-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i32: +; SANDY: # BB#0: +; SANDY-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; SANDY-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i32: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i32: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00] +; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00] +; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i32, i32 *%a1 + %2 = tail call i32 @llvm.ctpop.i32( i32 %1 ) + %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 ) + %4 = or i32 %2, %3 + ret i32 %4 +} +declare i32 @llvm.ctpop.i32(i32) + +define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_ctpop_i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: popcntq (%rsi), %rcx +; GENERIC-NEXT: popcntq %rdi, %rax +; GENERIC-NEXT: orq %rcx, %rax +; GENERIC-NEXT: retq +; +; SLM-LABEL: test_ctpop_i64: +; SLM: # BB#0: +; SLM-NEXT: popcntq (%rsi), %rcx # sched: [6:1.00] +; SLM-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ctpop_i64: +; SANDY: # BB#0: +; SANDY-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00] +; SANDY-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # 
sched: [1:1.00] +; +; HASWELL-LABEL: test_ctpop_i64: +; HASWELL: # BB#0: +; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ctpop_i64: +; BTVER2: # BB#0: +; BTVER2-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00] +; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00] +; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] + %1 = load i64, i64 *%a1 + %2 = tail call i64 @llvm.ctpop.i64( i64 %1 ) + %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 ) + %4 = or i64 %2, %3 + ret i64 %4 +} +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll new file mode 100644 index 000000000000..26c4bdb2375a --- /dev/null +++ b/test/CodeGen/X86/pr32282.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 + +; Check for assert in foldMaskAndShiftToScale due to out of range mask scaling. + +@b = common global i8 zeroinitializer, align 1 +@c = common global i8 zeroinitializer, align 1 +@d = common global i64 zeroinitializer, align 8 +@e = common global i64 zeroinitializer, align 8 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: +; X86-NEXT: pushl %eax +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: movl d, %eax +; X86-NEXT: movl d+4, %ecx +; X86-NEXT: movl $701685459, %edx # imm = 0x29D2DED3 +; X86-NEXT: andnl %edx, %ecx, %ecx +; X86-NEXT: movl $-564453154, %edx # imm = 0xDE5B20DE +; X86-NEXT: andnl %edx, %eax, %edx +; X86-NEXT: shrdl $21, %ecx, %edx +; X86-NEXT: shrl $21, %ecx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: addl $7, %edx +; X86-NEXT: adcxl %eax, %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl %edx +; X86-NEXT: .Lcfi2: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi3: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: pushl $0 +; X86-NEXT: .Lcfi4: +; X86-NEXT: .cfi_adjust_cfa_offset 4 +; X86-NEXT: calll __divdi3 +; X86-NEXT: addl $16, %esp +; X86-NEXT: .Lcfi5: +; X86-NEXT: .cfi_adjust_cfa_offset -16 +; X86-NEXT: orl %eax, %edx +; X86-NEXT: setne {{[0-9]+}}(%esp) +; X86-NEXT: popl %eax +; X86-NEXT: retl +; +; X64-LABEL: foo: +; X64: # BB#0: +; X64-NEXT: movq {{.*}}(%rip), %rax +; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000 +; X64-NEXT: andnq %rcx, %rax, %rcx +; X64-NEXT: shrq $21, %rcx +; X64-NEXT: addq $7, %rcx +; X64-NEXT: movabsq $4393751543808, %rax # imm = 0x3FF00000000 +; X64-NEXT: testq %rax, %rcx +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: idivq %rcx +; X64-NEXT: jmp .LBB0_3 +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: divl %ecx +; X64-NEXT: # 
kill: %EAX %EAX %RAX
+; X64-NEXT: .LBB0_3:
+; X64-NEXT: testq %rax, %rax
+; X64-NEXT: setne -{{[0-9]+}}(%rsp)
+; X64-NEXT: retq
+ %1 = alloca i8, align 1
+ %2 = load i64, i64* @d, align 8
+ %3 = or i64 -3013716102214263007, %2
+ %4 = xor i64 %3, -1
+ %5 = load i64, i64* @e, align 8
+ %6 = load i8, i8* @b, align 1
+ %7 = trunc i8 %6 to i1
+ %8 = zext i1 %7 to i64
+ %9 = xor i64 %5, %8
+ %10 = load i8, i8* @c, align 1
+ %11 = trunc i8 %10 to i1
+ %12 = zext i1 %11 to i32
+ %13 = or i32 551409149, %12
+ %14 = sub nsw i32 %13, 551409131
+ %15 = zext i32 %14 to i64
+ %16 = shl i64 %9, %15
+ %17 = sub nsw i64 %16, 223084523
+ %18 = ashr i64 %4, %17
+ %19 = and i64 %18, 9223372036854775806
+ %20 = add nsw i64 7, %19
+ %21 = sdiv i64 0, %20
+ %22 = icmp ne i64 %21, 0
+ %23 = zext i1 %22 to i8
+ store i8 %23, i8* %1, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/pr32515.ll b/test/CodeGen/X86/pr32515.ll
new file mode 100644
index 000000000000..aeb6803867aa
--- /dev/null
+++ b/test/CodeGen/X86/pr32515.ll
@@ -0,0 +1,29 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s
+; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s
+; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s
+; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s
+; REQUIRES: asserts
+
+@var_26 = external global i16, align 2
+
+define void @foo() #0 {
+ %1 = alloca i16, align 2
+ %2 = load i16, i16* @var_26, align 2
+ %3 = zext i16 %2 to i32
+ %4 = icmp ne i32 %3, 7
+ %5 = zext i1 %4 to i16
+ store i16 %5, i16* %1, align 2
+ %6 = load i16, i16* @var_26, align 2
+ %7 = zext i16 %6 to i32
+ %8 = and i32 1, %7
+ %9 = shl i32 %8, 0
+ %10 = load i16, i16* @var_26, align 2
+ %11 = zext i16 %10 to i32
+ %12 = icmp ne i32 %11, 7
+ %13 = zext i1 %12 to i32
+ %14 = and i32 %9, %13
+ %15 = icmp ne i32 %14, 0
+ %16 = zext i1 %15 to i8
+ store i8 %16, i8* undef, align 1
+ unreachable
+ }
diff --git a/test/CodeGen/X86/pr33772.ll b/test/CodeGen/X86/pr33772.ll
new file mode 100644
index 000000000000..ff22c7478866
--- /dev/null
+++ b/test/CodeGen/X86/pr33772.ll
@@ -0,0 +1,15 @@
+; RUN: not llc < %s -mcpu=skylake-avx512 2>&1 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; make sure we don't crash if scale for gather isn't constant.
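
The scale argument of the x86 gather intrinsics has to be an immediate of 1, 2, 4, or 8; when it reaches instruction selection as an ordinary function parameter, as in the reproducer below, selection fails with the diagnostic the CHECK line expects instead of crashing. For contrast, a hypothetical variant (not part of this patch) that passes a literal scale selects cleanly:

declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32)

; Same intrinsic as the test below, but with the required immediate
; scale of 4; under -mcpu=skylake-avx512 this selects to a vpgatherdd.
define <16 x i32> @gather_const_scale(i8* %ptr, <16 x i32> %offsets, i16 %mask) {
  %res = tail call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask, i32 4)
  ret <16 x i32> %res
}
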
+ +; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.x86.avx512.gather.dpi.512 +declare <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, i16, i32) + +define internal <16 x i32> @__gather_base_offsets32_i32(i8* readonly %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i8> %vecmask) { + %mask_vec_i1.i.i = icmp ne <16 x i8> %vecmask, zeroinitializer + %mask_i16.i = bitcast <16 x i1> %mask_vec_i1.i.i to i16 + %res = tail call <16 x i32> @llvm.x86.avx512.gather.dpi.512(<16 x i32> undef, i8* %ptr, <16 x i32> %offsets, i16 %mask_i16.i, i32 %offset_scale) + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/pr33828.ll b/test/CodeGen/X86/pr33828.ll new file mode 100644 index 000000000000..1b7f44323b61 --- /dev/null +++ b/test/CodeGen/X86/pr33828.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s --check-prefix=X64 + +@var_580 = external local_unnamed_addr global i8, align 1 + +define void @foo() { +; X86-LABEL: foo: +; X86: # BB#0: # %entry +; X86-NEXT: movsbl var_580, %eax +; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # BB#2: # %if.end13 +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: # %if.then11 +; +; X64-LABEL: foo: +; X64: # BB#0: # %entry +; X64-NEXT: movsbl {{.*}}(%rip), %eax +; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # BB#2: # %if.end13 +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: # %if.then11 +entry: + %tmp = icmp ugt i8 undef, 60 + %phitmp = zext i1 %tmp to i16 + br label %if.end + +if.end: + %tmp1 = load i8, i8* @var_580, align 1 + %conv7 = sext i8 %tmp1 to i32 + %conv8 = zext i16 %phitmp to i32 + %mul = shl nuw nsw i32 %conv8, 1 + %div9 = udiv i32 %mul, 71 + %sub = add nsw i32 %div9, -3 + %shl = shl i32 1, %sub + %neg = xor i32 %shl, -1 + %and = and i32 %neg, %conv7 + %tobool10 = icmp eq i32 %and, 0 + br i1 %tobool10, label %if.end13, label %if.then11 + +if.then11: + unreachable + +if.end13: + ret void +} diff --git a/test/CodeGen/X86/regparm.ll b/test/CodeGen/X86/regparm.ll index 9484e5a9490b..f427010edc51 100644 --- a/test/CodeGen/X86/regparm.ll +++ b/test/CodeGen/X86/regparm.ll @@ -1,4 +1,4 @@ -; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck -check-prefix=CHECK %s +; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck %s ; RUN: llc %s -mtriple=i386-pc-win32 -o - | FileCheck -check-prefix=WIN %s ; RUN: llc %s -mtriple=i386-pc-linux -fast-isel -o - | FileCheck -check-prefix=FAST %s ; RUN: llc %s -mtriple=i386-pc-win32 -fast-isel -o - | FileCheck -check-prefix=FASTWIN %s diff --git a/test/CodeGen/X86/rotate_vec.ll b/test/CodeGen/X86/rotate_vec.ll new file mode 100644 index 000000000000..8fb000bae827 --- /dev/null +++ b/test/CodeGen/X86/rotate_vec.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s + +define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + 
%1 = lshr <4 x i32> %x, + %2 = shl <4 x i32> %x, + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd $31, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} + +define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { +; CHECK-LABEL: rot_v4i32_non_splat_2masks: +; CHECK: # BB#0: +; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq + %1 = lshr <4 x i32> %x, + %2 = and <4 x i32> %1, + + %3 = shl <4 x i32> %x, + %4 = and <4 x i32> %3, + %5 = or <4 x i32> %2, %4 + ret <4 x i32> %5 +} diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll index 204e1f8b050b..b9d5a4813e09 100644 --- a/test/CodeGen/X86/sibcall-win64.ll +++ b/test/CodeGen/X86/sibcall-win64.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -declare x86_64_win64cc void @win64_callee(i32) -declare x86_64_win64cc void (i32)* @win64_indirect() -declare x86_64_win64cc void @win64_other(i32) +declare win64cc void @win64_callee(i32) +declare win64cc void (i32)* @win64_indirect() +declare win64cc void @win64_other(i32) declare void @sysv_callee(i32) declare void (i32)* @sysv_indirect() declare void @sysv_other(i32) define void @sysv_caller(i32 %p1) { entry: - tail call x86_64_win64cc void @win64_callee(i32 %p1) + tail call win64cc void @win64_callee(i32 %p1) ret void } @@ -19,7 +19,7 @@ entry: ; CHECK: addq $40, %rsp ; CHECK: retq -define x86_64_win64cc void @win64_caller(i32 %p1) { +define win64cc void @win64_caller(i32 %p1) { entry: tail call void @sysv_callee(i32 %p1) ret void @@ -37,18 +37,18 @@ define void @sysv_matched(i32 %p1) { ; CHECK-LABEL: sysv_matched: ; CHECK: jmp sysv_callee # TAILCALL -define x86_64_win64cc void @win64_matched(i32 %p1) { - tail call x86_64_win64cc void @win64_callee(i32 %p1) +define win64cc void @win64_matched(i32 %p1) { + tail call win64cc void @win64_callee(i32 %p1) ret void } ; CHECK-LABEL: win64_matched: ; CHECK: jmp win64_callee # TAILCALL -define x86_64_win64cc void @win64_indirect_caller(i32 %p1) { - %1 = call x86_64_win64cc void (i32)* @win64_indirect() - call x86_64_win64cc void @win64_other(i32 0) - tail call x86_64_win64cc void %1(i32 %p1) +define win64cc void @win64_indirect_caller(i32 %p1) { + %1 = call win64cc void (i32)* @win64_indirect() + call win64cc void @win64_other(i32 0) + tail call win64cc void %1(i32 %p1) ret void } diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index c41acd43b3ab..29f726c3df6a 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s 
--check-prefix=CHECK --check-prefix=ZNVER1 define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; GENERIC-LABEL: test_addps: @@ -45,6 +45,12 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fadd <4 x float> %1, %2 @@ -87,6 +93,12 @@ define float @test_addss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fadd float %1, %2 @@ -137,6 +149,12 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -191,6 +209,12 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -245,6 +269,13 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fcmp oeq <4 x float> %a0, %2 @@ -290,6 +321,12 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) { ; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = insertelement <4 x float> undef, float %a1, i32 0 %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) @@ -385,6 +422,20 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) @@ -435,6 +486,13 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { ; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to float %2 = load i32, i32 *%a1, align 4 %3 = sitofp i32 %2 to float @@ -484,6 +542,13 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { ; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ssq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to float %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to float @@ -533,6 +598,13 @@ define i32 @test_cvtss2si(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -585,6 +657,13 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) { ; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float 
%a0, i32 0
 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1)
 %3 = load float, float *%a1, align 4
@@ -637,6 +716,13 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
 ; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
 ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttss2si:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi float %a0 to i32
 %2 = load float, float *%a1, align 4
 %3 = fptosi float %2 to i32
@@ -686,6 +772,13 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
 ; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
 ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttss2siq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi float %a0 to i64
 %2 = load float, float *%a1, align 4
 %3 = fptosi float %2 to i64
@@ -729,6 +822,12 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fdiv <4 x float> %a0, %a1
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = fdiv <4 x float> %1, %2
@@ -771,6 +870,12 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
 ; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fdiv float %a0, %a1
 %2 = load float, float *%a2, align 4
 %3 = fdiv float %1, %2
@@ -813,6 +918,12 @@ define void @test_ldmxcsr(i32 %a0) {
 ; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ldmxcsr:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 store i32 %a0, i32* %1
@@ -857,6 +968,12 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2)
@@ -900,6 +1017,12 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2)
@@ -943,6 +1066,12 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2)
@@ -986,6 +1115,12 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2)
@@ -1035,6 +1170,13 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movaps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <4 x float>, <4 x float> *%a0, align 16
 %2 = fadd <4 x float> %1, %1
 store <4 x float> %2, <4 x float> *%a1, align 16
@@ -1079,6 +1221,11 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movhlps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 ret <4 x float> %1
 }
@@ -1129,6 +1276,13 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movhps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast x86_mmx* %a2 to <2 x float>*
 %2 = load <2 x float>, <2 x float> *%1, align 8
 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
@@ -1177,6 +1331,12 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
 ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movlhps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 %2 = fadd <4 x float> %a1, %1
 ret <4 x float> %2
@@ -1224,6 +1384,13 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movlps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast x86_mmx* %a2 to <2 x float>*
 %2 = load <2 x float>, <2 x float> *%1, align 8
 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32>
@@ -1266,6 +1433,11 @@ define i32 @test_movmskps(<4 x float> %a0) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movmskps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
 ret i32 %1
 }
@@ -1307,6 +1479,11 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0
 ret void
 }
@@ -1353,6 +1530,13 @@ define void @test_movss_mem(float* %a0, float* %a1) {
 ; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movss_mem:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load float, float* %a0, align 1
 %2 = fadd float %1, %1
 store float %2, float *%a1, align 1
@@ -1395,6 +1579,11 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movss_reg:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 ret <4 x float> %1
 }
@@ -1441,6 +1630,13 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movups:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <4 x float>, <4 x float> *%a0, align 1
 %2 = fadd <4 x float> %1, %1
 store <4 x float> %2, <4 x float> *%a1, align 1
@@ -1483,6 +1679,12 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fmul <4 x float> %a0, %a1
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = fmul <4 x float> %1, %2
@@ -1525,6 +1727,12 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fmul float %a0, %a1
 %2 = load float, float *%a2, align 4
 %3 = fmul float %1, %2
@@ -1575,6 +1783,12 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_orps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast <4 x float> %a0 to <4 x i32>
 %2 = bitcast <4 x float> %a1 to <4 x i32>
 %3 = or <4 x i32> %1, %2
@@ -1621,6 +1835,11 @@ define void @test_prefetchnta(i8* %a0) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_prefetchnta:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
 ret void
 }
@@ -1670,6 +1889,13 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcpps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50]
+; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2)
@@ -1728,6 +1954,14 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
 ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcpss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <4 x float> undef, float %a0, i32 0
 %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1)
 %3 = load float, float *%a1, align 4
@@ -1782,6 +2016,13 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rsqrtps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50]
+; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2)
@@ -1840,6 +2081,14 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
 ; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rsqrtss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <4 x float> undef, float %a0, i32 0
 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1)
 %3 = load float, float *%a1, align 4
@@ -1886,6 +2135,11 @@ define void @test_sfence() {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: sfence # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sfence:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: sfence # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 call void @llvm.x86.sse.sfence()
 ret void
 }
@@ -1931,6 +2185,12 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
 ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shufps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
+; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32>
@@ -1980,6 +2240,13 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00]
+; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2)
@@ -2038,6 +2305,14 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sqrtss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00]
+; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2)
@@ -2082,6 +2357,12 @@ define i32 @test_stmxcsr() {
 ; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
 ; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_stmxcsr:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -2126,6 +2407,12 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fsub <4 x float> %a0, %a1
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = fsub <4 x float> %1, %2
@@ -2168,6 +2455,12 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
 ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_subss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fsub float %a0, %a1
 %2 = load float, float *%a2, align 4
 %3 = fsub float %1, %2
@@ -2258,6 +2551,20 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
 ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ucomiss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
+; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
 %2 = load <4 x float>, <4 x float> *%a2, align 4
 %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2)
@@ -2306,6 +2613,12 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
 ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpckhps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32>
@@ -2352,6 +2665,12 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
 ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_unpcklps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32>
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32>
@@ -2402,6 +2721,12 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xorps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast <4 x float> %a0 to <4 x i32>
 %2 = bitcast <4 x float> %a1 to <4 x i32>
 %3 = xor <4 x i32> %1, %2
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 3c36b2138139..6ee908e0c787 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -7,7 +7,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1

 define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; GENERIC-LABEL: test_addpd:
@@ -45,6 +45,12 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fadd <2 x double> %a0, %a1
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = fadd <2 x double> %1, %2
@@ -87,6 +93,12 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
 ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_addsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fadd double %a0, %a1
 %2 = load double, double *%a2, align 8
 %3 = fadd double %1, %2
@@ -135,6 +147,13 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast <2 x double> %a0 to <4 x i32>
 %2 = bitcast <2 x double> %a1 to <4 x i32>
 %3 = and <4 x i32> %1, %2
@@ -188,6 +207,13 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_andnotpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast <2 x double> %a0 to <4 x i32>
 %2 = bitcast <2 x double> %a1 to <4 x i32>
 %3 = xor <4 x i32> %1,
@@ -243,6 +269,13 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmppd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fcmp oeq <2 x double> %a0, %a1
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = fcmp oeq <2 x double> %a0, %2
@@ -288,6 +321,12 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
 ; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <2 x double> undef, double %a0, i32 0
 %2 = insertelement <2 x double> undef, double %a1, i32 0
 %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0)
@@ -383,6 +422,20 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
 ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_comisd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
+; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
+; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
+; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
 %2 = load <2 x double>, <2 x double> *%a2, align 8
 %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2)
@@ -433,6 +486,13 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2pd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32>
 %2 = sitofp <2 x i32> %1 to <2 x double>
 %3 = load <4 x i32>, <4 x i32>*%a1, align 16
@@ -485,6 +545,13 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
 ; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtdq2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = sitofp <4 x i32> %a0 to <4 x float>
 %2 = load <4 x i32>, <4 x i32>*%a1, align 16
 %3 = sitofp <4 x i32> %2 to <4 x float>
@@ -535,6 +602,13 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
 %2 = load <2 x double>, <2 x double> *%a1, align 16
 %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2)
@@ -586,6 +660,13 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2ps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
 %2 = load <2 x double>, <2 x double> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2)
@@ -637,6 +718,13 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2)
@@ -688,6 +776,13 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2pd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32>
 %2 = fpext <2 x float> %1 to <2 x double>
 %3 = load <4 x float>, <4 x float> *%a1, align 16
@@ -739,6 +834,13 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
 ; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
 ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2si:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <2 x double> undef, double %a0, i32 0
 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1)
 %3 = load double, double *%a1, align 8
@@ -791,6 +893,13 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
 ; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
 ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2siq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <2 x double> undef, double %a0, i32 0
 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1)
 %3 = load double, double *%a1, align 8
@@ -850,6 +959,14 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
 ; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsd2ss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptrunc double %a0 to float
 %2 = load double, double *%a1, align 8
 %3 = fptrunc double %2 to float
@@ -899,6 +1016,13 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
 ; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsi2sd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = sitofp i32 %a0 to double
 %2 = load i32, i32 *%a1, align 8
 %3 = sitofp i32 %2 to double
@@ -948,6 +1072,13 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
 ; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
 ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtsi2sdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = sitofp i64 %a0 to double
 %2 = load i64, i64 *%a1, align 8
 %3 = sitofp i64 %2 to double
@@ -1006,6 +1137,14 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
 ; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtss2sd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fpext float %a0 to double
 %2 = load float, float *%a1, align 4
 %3 = fpext float %2 to double
@@ -1056,6 +1195,13 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttpd2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi <2 x double> %a0 to <2 x i32>
 %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32>
 %3 = load <2 x double>, <2 x double> *%a1, align 16
@@ -1108,6 +1254,13 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttps2dq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi <4 x float> %a0 to <4 x i32>
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = fptosi <4 x float> %2 to <4 x i32>
@@ -1157,6 +1310,13 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
 ; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
 ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttsd2si:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi double %a0 to i32
 %2 = load double, double *%a1, align 8
 %3 = fptosi double %2 to i32
@@ -1206,6 +1366,13 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
 ; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
 ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttsd2siq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00]
+; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fptosi double %a0 to i64
 %2 = load double, double *%a1, align 8
 %3 = fptosi double %2 to i64
@@ -1249,6 +1416,12 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fdiv <2 x double> %a0, %a1
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = fdiv <2 x double> %1, %2
@@ -1291,6 +1464,12 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
 ; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_divsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
+; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fdiv double %a0, %a1
 %2 = load double, double *%a2, align 8
 %3 = fdiv double %1, %2
@@ -1333,6 +1512,11 @@ define void @test_lfence() {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: lfence # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lfence:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: lfence # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 call void @llvm.x86.sse2.lfence()
 ret void
 }
@@ -1374,6 +1558,11 @@ define void @test_mfence() {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: mfence # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mfence:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: mfence # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 call void @llvm.x86.sse2.mfence()
 ret void
 }
@@ -1413,6 +1602,11 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovdqu:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2)
 ret void
 }
@@ -1454,6 +1648,12 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2)
@@ -1497,6 +1697,12 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maxsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2)
@@ -1540,6 +1746,12 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2)
@@ -1583,6 +1795,12 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_minsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2)
@@ -1632,6 +1850,13 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movapd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <2 x double>, <2 x double> *%a0, align 16
 %2 = fadd <2 x double> %1, %1
 store <2 x double> %2, <2 x double> *%a1, align 16
@@ -1680,6 +1905,13 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movdqa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <2 x i64>, <2 x i64> *%a0, align 16
 %2 = add <2 x i64> %1, %1
 store <2 x i64> %2, <2 x i64> *%a1, align 16
@@ -1728,6 +1960,13 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movdqu:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <2 x i64>, <2 x i64> *%a0, align 1
 %2 = add <2 x i64> %1, %1
 store <2 x i64> %2, <2 x i64> *%a1, align 1
@@ -1794,6 +2033,16 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <4 x i32> undef, i32 %a1, i32 0
 %2 = load i32, i32 *%a2
 %3 = insertelement <4 x i32> undef, i32 %2, i32 0
@@ -1865,6 +2114,16 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
 ; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movd_64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <2 x i64> undef, i64 %a1, i64 0
 %2 = load i64, i64 *%a2
 %3 = insertelement <2 x i64> undef, i64 %2, i64 0
@@ -1918,6 +2177,13 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movhpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast x86_mmx* %a2 to double*
 %2 = load double, double *%1, align 8
 %3 = insertelement <2 x double> %a1, double %2, i32 1
@@ -1969,6 +2235,13 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movlpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast x86_mmx* %a2 to double*
 %2 = load double, double *%1, align 8
 %3 = insertelement <2 x double> %a1, double %2, i32 0
@@ -2010,6 +2283,11 @@ define i32 @test_movmskpd(<2 x double> %a0) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movmskpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
 ret i32 %1
 }
@@ -2053,6 +2331,12 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
 ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntdqa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = add <2 x i64> %a0, %a0
 store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0
 ret void
@@ -2094,6 +2378,12 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fadd <2 x double> %a0, %a0
 store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0
 ret void
@@ -2141,6 +2431,13 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq_mem:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load i64, i64* %a1, align 1
 %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0
 %3 = add <2 x i64> %a0, %2
@@ -2187,6 +2484,12 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
 ; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq_reg:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32>
 %2 = add <2 x i64> %a1, %1
 ret <2 x i64> %2
@@ -2234,6 +2537,13 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
 ; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movsd_mem:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50]
+; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load double, double* %a0, align 1
 %2 = fadd double %1, %1
 store double %2, double *%a1, align 1
@@ -2277,6 +2587,11 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movsd_reg:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32>
 ret <2 x double> %1
 }
@@ -2323,6 +2638,13 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movupd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <2 x double>, <2 x double> *%a0, align 1
 %2 = fadd <2 x double> %1, %1
 store <2 x double> %2, <2 x double> *%a1, align 1
@@ -2365,6 +2687,12 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fmul <2 x double> %a0, %a1
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = fmul <2 x double> %1, %2
@@ -2407,6 +2735,12 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
 ; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mulsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = fmul double %a0, %a1
 %2 = load double, double *%a2, align 8
 %3 = fmul double %1, %2
@@ -2455,6 +2789,13 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_orpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = bitcast <2 x double> %a0 to <4 x i32>
 %2 = bitcast <2 x double> %a1 to <4 x i32>
 %3 = or <4 x i32> %1, %2
@@ -2510,6 +2851,12 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packssdw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
 %2 = bitcast <8 x i16> %1 to <4 x i32>
 %3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -2562,6 +2909,12 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packsswb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = bitcast <16 x i8> %1 to <8 x i16>
 %3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2614,6 +2967,12 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packuswb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = bitcast <16 x i8> %1 to <8 x i16>
 %3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2662,6 +3021,12 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = add <16 x i8> %a0, %a1
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = add <16 x i8> %1, %2
@@ -2708,6 +3073,12 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = add <4 x i32> %a0, %a1
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = add <4 x i32> %1, %2
@@ -2750,6 +3121,12 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = add <2 x i64> %a0, %a1
 %2 = load <2 x i64>, <2 x i64> *%a2, align 16
 %3 = add <2 x i64> %1, %2
@@ -2796,6 +3173,12 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2)
@@ -2843,6 +3226,12 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2)
@@ -2890,6 +3279,12 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2)
@@ -2937,6 +3332,12 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2)
@@ -2984,6 +3385,12 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = add <8 x i16> %a0, %a1
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = add <8 x i16> %1, %2
@@ -3032,6 +3439,13 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pand:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = and <2 x i64> %a0, %a1
 %2 = load <2 x i64>, <2 x i64> *%a2, align 16
 %3 = and <2 x i64> %1, %2
@@ -3087,6 +3501,13 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pandn:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = xor <2 x i64> %a0,
 %2 = and <2 x i64> %a1, %1
 %3 = load <2 x i64>, <2 x i64> *%a2, align 16
@@ -3136,6 +3557,12 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2)
@@ -3183,6 +3610,12 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2)
@@ -3234,6 +3667,13 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp eq <16 x i8> %a0, %a1
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = icmp eq <16 x i8> %a0, %2
@@ -3286,6 +3726,13 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp eq <4 x i32> %a0, %a1
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = icmp eq <4 x i32> %a0, %2
@@ -3338,6 +3785,13 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp eq <8 x i16> %a0, %a1
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = icmp eq <8 x i16> %a0, %2
@@ -3391,6 +3845,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp sgt <16 x i8> %a0, %a1
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = icmp sgt <16 x i8> %a0, %2
@@ -3444,6 +3905,13 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp sgt <4 x i32> %a0, %a1
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = icmp eq <4 x i32> %a0, %2
@@ -3497,6 +3965,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp sgt <8 x i16> %a0, %a1
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = icmp sgt <8 x i16> %a0, %2
@@ -3541,6 +4016,12 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: # kill: %AX %AX %EAX
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX %AX %EAX
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = extractelement <8 x i16> %a0, i32 6
 ret i16 %1
 }
@@ -3585,6 +4066,12 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
 ; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1
 %2 = load i16, i16 *%a2
 %3 = insertelement <8 x i16> %1, i16 %2, i32 3
@@ -3635,6 +4122,12 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
 %2 = bitcast <4 x i32> %1 to <8 x i16>
 %3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -3683,6 +4176,12 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxsw
%xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2) @@ -3730,6 +4229,12 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) @@ -3777,6 +4282,12 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) @@ -3824,6 +4335,12 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2) @@ -3863,6 +4380,11 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovmskb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ret i32 %1 } @@ -3904,6 +4426,12 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) @@ -3947,6 +4475,12 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # 
sched: [2:1.00] ; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) @@ -3990,6 +4524,12 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmullw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = mul <8 x i16> %1, %2 @@ -4040,6 +4580,12 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuludq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -4090,6 +4636,13 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_por: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = or <2 x i64> %1, %2 @@ -4141,6 +4694,12 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -4193,6 +4752,13 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = 
shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 %2 = load <4 x i32>, <4 x i32> *%a1, align 16
 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -4244,6 +4810,13 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufhw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50]
+; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
 %2 = load <8 x i16>, <8 x i16> *%a1, align 16
 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4>
@@ -4295,6 +4868,13 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshuflw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50]
+; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
 %2 = load <8 x i16>, <8 x i16> *%a1, align 16
 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -4344,6 +4924,13 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pslld:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2)
@@ -4389,6 +4976,11 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pslldq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
 ret <4 x i32> %1
 }
@@ -4435,6 +5027,13 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psllq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
 %2 = load <2 x i64>, <2 x i64> *%a2, align 16
 %3 = call <2 x i64>
@llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) @@ -4486,6 +5085,13 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) @@ -4537,6 +5143,13 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) @@ -4588,6 +5201,13 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psraw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) @@ -4639,6 +5259,13 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) @@ -4684,6 +5311,11 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4730,6 +5362,13 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: 
vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) @@ -4781,6 +5420,13 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) @@ -4830,6 +5476,12 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = sub <16 x i8> %1, %2 @@ -4876,6 +5528,12 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = sub <4 x i32> %1, %2 @@ -4918,6 +5576,12 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = sub <2 x i64> %1, %2 @@ -4964,6 +5628,12 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> 
@llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) @@ -5011,6 +5681,12 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) @@ -5058,6 +5734,12 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) @@ -5105,6 +5787,12 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) @@ -5152,6 +5840,12 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = sub <8 x i16> %1, %2 @@ -5198,6 +5892,12 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25] +; 
ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5248,6 +5948,13 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5297,6 +6004,13 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5344,6 +6058,12 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5390,6 +6110,12 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; 
ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5440,6 +6166,13 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5489,6 +6222,13 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5536,6 +6276,12 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5584,6 +6330,13 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pxor: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = xor <2 x i64> %1, %2 @@ -5633,6 +6386,13 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vshufpd {{.*#+}} 
xmm1 = xmm1[1],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -5683,6 +6443,13 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) @@ -5741,6 +6508,14 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -5785,6 +6560,12 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fsub <2 x double> %1, %2 @@ -5827,6 +6608,12 @@ define double @test_subsd(double %a0, double %a1, double *%a2) { ; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fsub double %1, %2 @@ -5917,6 +6704,20 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: 
[3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) @@ -5967,6 +6768,13 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -6022,6 +6830,13 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> @@ -6071,6 +6886,13 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll index ef1ddae4532d..ad38d1c6ff49 100644 --- a/test/CodeGen/X86/sse3-schedule.ll +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck 
%s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addsubpd: @@ -45,6 +45,12 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) @@ -88,6 +94,12 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) @@ -131,6 +143,12 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2) @@ -174,6 +192,12 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) @@ -217,6 +241,12 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) @@ -260,6 +290,12 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) @@ -299,6 +335,11 @@ define <16 x i8> @test_lddqu(i8* %a0) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ret <16 x i8> %1 } @@ -347,6 +388,13 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { ; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,6 +445,13 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50] +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -447,6 +502,13 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { ; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [8:0.50] +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> diff --git 
a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index 1ab1598fcab7..26cca98816a3 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_blendpd: @@ -43,6 +43,13 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %a1, %1 @@ -80,6 +87,12 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -122,6 +135,12 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) %2 = load <2 x double>, <2 x double> *%a3, align 16 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) @@ -165,6 +184,12 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; 
ZNVER1: # BB#0:
+; ZNVER1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
 %2 = load <4 x float>, <4 x float> *%a3
 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2)
@@ -202,6 +227,12 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dppd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
 %2 = load <2 x double>, <2 x double> *%a2, align 16
 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7)
@@ -239,6 +270,12 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dpps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7)
@@ -276,6 +313,12 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
 ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
 ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
+; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
 %2 = load float, float *%a2
 %3 = insertelement <4 x float> %1, float %2, i32 3
@@ -308,6 +351,11 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntdqa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0)
 ret <2 x i64> %1
 }
@@ -343,6 +391,12 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
 ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mpsadbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
 %2 = bitcast <8 x i16> %1 to <16 x i8>
 %3 = load <16 x i8>, <16 x i8> *%a2, align 16
@@ -381,6 +435,12 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packusdw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
 %2 = bitcast <8 x i16> %1 to <4 x i32>
 %3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -425,6 +485,12 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
 ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pblendvb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
 %2 = load <16 x i8>, <16 x i8> *%a3, align 16
 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2)
@@ -462,6 +528,12 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
 ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pblendw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32>
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
@@ -498,6 +570,12 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp eq <2 x i64> %a0, %a1
 %2 = sext <2 x i1> %1 to <2 x i64>
 %3 = load <2 x i64>, <2 x i64>*%a2, align 16
@@ -536,6 +614,12 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
 ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = extractelement <16 x i8> %a0, i32 3
 %2 = extractelement <16 x i8> %a0, i32 1
 store i8 %2, i8 *%a1
@@ -573,6 +657,12 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
 ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = extractelement <4 x i32> %a0, i32 3
 %2 = extractelement <4 x i32> %a0, i32 1
 store i32 %2, i32 *%a1
@@ -609,6 +699,12 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
 ; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
 ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = extractelement <2 x i64> %a0, i32 1
 %2 = extractelement <2 x i64> %a0, i32 1
 store i64 %2, i64 *%a2
@@ -645,6 +741,12 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
 ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = extractelement <8 x i16> %a0, i32 3
 %2 = extractelement <8 x i16> %a0, i32 1
 store i16 %2, i16 *%a1
@@ -682,6 +784,12 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
 ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phminposuw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = load <8 x i16>, <8 x i16> *%a0, align 16
 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1)
 %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2)
@@ -719,6 +827,12 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
 ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1
 %2 = load i8, i8 *%a2
 %3 = insertelement <16 x i8> %1, i8 %2, i32 3
@@ -755,6 +869,12 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1
 %2 = load i32, i32 *%a2
 %3 = insertelement <4 x i32> %1, i32 %2, i32 3
@@ -796,6 +916,13 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
 ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1
 %2 = load i64, i64 *%a3
 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1
@@ -833,6 +960,12 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2)
@@ -870,6 +1003,12 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2)
@@ -907,6 +1046,12 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxud:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2)
@@ -944,6 +1089,12 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxuw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2)
@@ -981,6 +1132,12 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2)
@@ -1018,6 +1175,12 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2)
@@ -1055,6 +1218,12 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminud:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2)
@@ -1092,6 +1261,12 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminuw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2)
@@ -1135,6 +1310,13 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32>
 %2 = sext <8 x i8> %1 to <8 x i16>
 %3 = load <8 x i8>, <8 x i8>* %a1, align 1
@@ -1179,6 +1361,13 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32>
 %2 = sext <4 x i8> %1 to <4 x i32>
 %3 = load <4 x i8>, <4 x i8>* %a1, align 1
@@ -1223,6 +1412,13 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32>
 %2 = sext <2 x i8> %1 to <2 x i64>
 %3 = load <2 x i8>, <2 x i8>* %a1, align 1
@@ -1267,6 +1463,13 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32>
 %2 = sext <2 x i32> %1 to <2 x i64>
 %3 = load <2 x i32>, <2 x i32>* %a1, align 1
@@ -1311,6 +1514,13 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32>
 %2 = sext <4 x i16> %1 to <4 x i32>
 %3 = load <4 x i16>, <4 x i16>* %a1, align 1
@@ -1355,6 +1565,13 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxwq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32>
 %2 = sext <2 x i16> %1 to <2 x i64>
 %3 = load <2 x i16>, <2 x i16>* %a1, align 1
@@ -1399,6 +1616,13 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32>
 %2 = zext <8 x i8> %1 to <8 x i16>
 %3 = load <8 x i8>, <8 x i8>* %a1, align 1
@@ -1443,6 +1667,13 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32>
 %2 = zext <4 x i8> %1 to <4 x i32>
 %3 = load <4 x i8>, <4 x i8>* %a1, align 1
@@ -1487,6 +1718,13 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32>
 %2 = zext <2 x i8> %1 to <2 x i64>
 %3 = load <2 x i8>, <2 x i8>* %a1, align 1
@@ -1531,6 +1769,13 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxdq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32>
 %2 = zext <2 x i32> %1 to <2 x i64>
 %3 = load <2 x i32>, <2 x i32>* %a1, align 1
@@ -1575,6 +1820,13 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxwd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32>
 %2 = zext <4 x i16> %1 to <4 x i32>
 %3 = load <4 x i16>, <4 x i16>* %a1, align 1
@@ -1619,6 +1871,13 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxwq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32>
 %2 = zext <2 x i16> %1 to <2 x i64>
 %3 = load <2 x i16>, <2 x i16>* %a1, align 1
@@ -1657,6 +1916,12 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmuldq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
 %2 = bitcast <2 x i64> %1 to <4 x i32>
 %3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -1695,6 +1960,12 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulld:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = mul <4 x i32> %a0, %a1
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = mul <4 x i32> %1, %2
@@ -1751,6 +2022,16 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50]
 ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ptest:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: setb %al # sched: [1:0.25]
+; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: setb %cl # sched: [1:0.25]
+; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
+; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
 %2 = load <2 x i64>, <2 x i64> *%a2, align 16
 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2)
@@ -1795,6 +2076,13 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundpd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00]
+; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
 %2 = load <2 x double>, <2 x double> *%a1, align 16
 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7)
@@ -1839,6 +2127,13 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundps:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00]
+; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
 %2 = load <4 x float>, <4 x float> *%a1, align 16
 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7)
@@ -1884,6 +2179,13 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
 %2 = load <2 x double>, <2 x double>* %a2, align 16
 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7)
@@ -1929,6 +2231,13 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_roundss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
 %2 = load <4 x float>, <4 x float> *%a2, align 16
 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7)
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index 7ce9ffdbd0ea..adf857e12179 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -6,7 +6,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; GENERIC-LABEL: crc32_32_8:
@@ -43,6 +43,13 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
 ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: crc32_32_8:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
 %2 = load i8, i8 *%a2
 %3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2)
@@ -85,6 +92,13 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
 ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: crc32_32_16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
 %2 = load i16, i16 *%a2
 %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2)
@@ -127,6 +141,13 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
 ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: crc32_32_32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
 %2 = load i32, i32 *%a2
 %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2)
@@ -169,6 +190,13 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
 ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: crc32_64_8:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
 %2 = load i8, i8 *%a2
 %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2)
@@ -211,6 +239,13 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
 ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: crc32_64_64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
 %2 = load i64, i64 *%a2
 %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2)
@@ -283,6 +318,19 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: # kill: %ECX %ECX %RCX
 ; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpestri:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: # kill: %ECX %ECX %RCX
+; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -341,6 +389,16 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
 ; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpestrm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -393,6 +451,15 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: # kill: %ECX %ECX %RCX
 ; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpistri:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: # kill: %ECX %ECX %RCX
+; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7)
@@ -431,6 +498,12 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpistrm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7)
@@ -468,6 +541,12 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = icmp sgt <2 x i64> %a0, %a1
 %2 = sext <2 x i1> %1 to <2 x i64>
 %3 = load <2 x i64>, <2 x i64>*%a2, align 16
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
index 11afdb7989f1..9ad6b0dfd4d6 100644
--- a/test/CodeGen/X86/sse4a-schedule.ll
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
 define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
 ; GENERIC-LABEL: test_extrq:
@@ -11,8 +11,13 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
 ;
 ; BTVER2-LABEL: test_extrq:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: extrq %xmm1, %xmm0
+; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extrq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
 ret <2 x i64> %1
 }
@@ -26,8 +31,13 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
 ;
 ; BTVER2-LABEL: test_extrqi:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: extrq $2, $3, %xmm0
+; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extrqi:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
 ret <2 x i64> %1
 }
@@ -41,8 +51,13 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
 ;
 ; BTVER2-LABEL: test_insertq:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: insertq %xmm1, %xmm0
+; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
 ret <2 x i64> %1
 }
@@ -56,8 +71,13 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
 ;
 ; BTVER2-LABEL: test_insertqi:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0
+; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertqi:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
 ret <2 x i64> %1
 }
@@ -73,6 +93,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
 ret void
 }
@@ -88,6 +113,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
 ret void
 }
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index f24969a30c33..24ace69ebb9e 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -7,7 +7,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ; GENERIC-LABEL: test_pabsb:
@@ -52,6 +52,13 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
 %2 = load <16 x i8>, <16 x i8> *%a1, align 16
 %3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2)
@@ -103,6 +110,13 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
 %2 = load <4 x i32>, <4 x i32> *%a1, align 16
 %3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2)
@@ -147,6 +161,11 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
 %2 = load <8 x i16>, <8 x i16> *%a1, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2)
@@ -196,6 +215,12 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
 ; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_palignr:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25]
+; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32>
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32>
@@ -238,6 +263,12 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -289,6 +320,12 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -332,6 +369,12 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -375,6 +418,12 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -426,6 +475,12 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -469,6 +524,12 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -512,6 +573,12 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddubsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = bitcast <8 x i16> %1 to <16 x i8>
@@ -550,6 +617,11 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhrsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -593,6 +665,12 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -644,6 +722,12 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
 %3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -695,6 +779,12 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
 %3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -746,6 +836,12 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
 %1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
 %3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2)
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 29f8e3ed4f78..784b932addc8 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -95,8 +95,8 @@ left.relocs:
 right:
 ; CHECK-LABEL: %right
- ; CHECK: movq
 ; CHECK: movq %rdx, (%rsp)
+ ; CHECK: movq
 ; CHECK: callq some_call
 %sp2 = invoke token (i64, i32, void (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i64f(i64 0, i32 0, void (i64 addrspace(1)*)* @some_call, i32 1, i32 0, i64 addrspace(1)* %val1, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
 to label %right.relocs unwind label %exceptional_return.right
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index b16426eae3d5..6e7fc7bf1c07 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -11,9 +11,9 @@ target triple = "x86_64-pc-linux-gnu"
 define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_calls
 ; The exact stores don't matter, but there need to be three stack slots created
-; CHECK: movq %rdi, 16(%rsp)
-; CHECK: movq %rdx, 8(%rsp)
-; CHECK: movq %rsi, (%rsp)
+; CHECK-DAG: movq %rdi, 16(%rsp)
+; CHECK-DAG: movq %rdx, 8(%rsp)
+; CHECK-DAG: movq %rsi, (%rsp)
 ; There should be no more than three moves
 ; CHECK-NOT: movq
 %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
@@ -36,9 +36,9 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a
 define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: reserve_first
 ; The exact stores don't matter, but there need to be three stack slots created
-; CHECK: movq %rdi, 16(%rsp)
-; CHECK: movq %rdx, 8(%rsp)
-; CHECK: movq %rsi, (%rsp)
+; CHECK-DAG: movq %rdi, 16(%rsp)
+; CHECK-DAG: movq %rdx, 8(%rsp)
+; CHECK-DAG: movq %rsi, (%rsp)
 %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
 %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
 %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
@@ -61,21 +61,21 @@ define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_deopt
 ; The exact stores don't matter, but there need to be three stack slots created
-; CHECK: movl %ebx, 12(%rsp)
-; CHECK: movl %ebp, 8(%rsp)
-; CHECK: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %ebp, 8(%rsp)
+; CHECK-DAG: movl %r14d, 4(%rsp)
 ; CHECK: callq
-; CHECK: movl %ebx, 12(%rsp)
-; CHECK: movl %ebp, 8(%rsp)
-; CHECK: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %ebp, 8(%rsp)
+; CHECK-DAG: movl %r14d, 4(%rsp)
 ; CHECK: callq
-; CHECK: movl %ebx, 12(%rsp)
-; CHECK: movl %ebp, 8(%rsp)
-; CHECK: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %ebp, 8(%rsp)
+; CHECK-DAG: movl %r14d, 4(%rsp)
 ; CHECK: callq
-; CHECK: movl %ebx, 12(%rsp)
-; CHECK: movl %ebp, 8(%rsp)
-; CHECK: movl %r14d, 4(%rsp)
+; CHECK-DAG: movl %ebx, 12(%rsp)
+; CHECK-DAG: movl %ebp, 8(%rsp)
+; CHECK-DAG: movl %r14d, 4(%rsp)
 ; CHECK: callq
 call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
 call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 3, i32 %a, i32 %b, i32 %c)
@@ -89,9 +89,9 @@ define i32 @back_to_back_invokes(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32
 ; CHECK-LABEL: back_to_back_invokes
 entry:
 ; The exact stores don't matter, but there need to be three stack slots created
- ; CHECK: movq %rdi, 16(%rsp)
- ; CHECK: movq %rdx, 8(%rsp)
- ; CHECK: movq %rsi, (%rsp)
+ ; CHECK-DAG: movq %rdi, 16(%rsp)
+ ; CHECK-DAG: movq %rdx, 8(%rsp)
+ ; CHECK-DAG: movq %rsi, (%rsp)
 ; CHECK: callq
 %safepoint_token = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
 to label %normal_return unwind label %exceptional_return
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
index 5bc8f983ff06..538d17564957 100644
--- a/test/CodeGen/X86/statepoint-vector.ll
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -49,8 +49,8 @@ entry:
 ; CHECK: subq $40, %rsp
 ; CHECK: testb $1, %dil
 ; CHECK: movaps (%rsi), %xmm0
-; CHECK: movaps %xmm0, 16(%rsp)
-; CHECK: movaps %xmm0, (%rsp)
+; CHECK-DAG: movaps %xmm0, (%rsp)
+; CHECK-DAG: movaps %xmm0, 16(%rsp)
 ; CHECK: callq do_safepoint
 ; CHECK: movaps (%rsp), %xmm0
 ; CHECK: addq $40, %rsp
diff --git a/test/CodeGen/X86/vec_cmp_uint-128.ll b/test/CodeGen/X86/vec_cmp_uint-128.ll
index 8bed14e7e5f5..cad7991c4f3b 100644
--- a/test/CodeGen/X86/vec_cmp_uint-128.ll
+++ b/test/CodeGen/X86/vec_cmp_uint-128.ll
@@ -463,7 +463,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: gt_v4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -476,7 +476,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX512-LABEL: gt_v4i32:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -782,7 +782,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: lt_v4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -795,7 +795,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX512-LABEL: lt_v4i32:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
 ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 2b5eb695f53e..87cf2026d1ef 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -135,7 +135,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_div7_4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
@@ -433,7 +433,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_rem7_4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
@@ -444,7 +444,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
 ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
 ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index e7bfe3778212..ce0ec6c3875a 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -115,7 +115,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_div7_8i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
@@ -381,7 +381,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_rem7_8i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
@@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
 ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index cd17fcf8c85b..8138442b3eaf 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -130,7 +130,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_div7_4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
@@ -412,7 +412,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_rem7_4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
@@ -423,7 +423,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
 ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
 ; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 4adc2e2fb6c9..b0433110f181 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -123,7 +123,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_div7_8i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
@@ -392,7 +392,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: test_rem7_8i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
 ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
@@ -403,7 +403,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
 ; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 6719a66f030f..c65c3e7fd004 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -73,7 +73,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
 ;
 ; AVX2-LABEL: PR20355:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index 852c1f4d3d98..04378ee2ee01 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -77,14 +77,19 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
-; AVX512-NEXT: vpsubq %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: var_rotate_v2i64:
 ; XOP: # BB#0:
@@ -207,21 +212,26 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; AVX2-LABEL: var_rotate_v4i32:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32]
 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
 ; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: var_rotate_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
-; AVX512-NEXT: vpsubd %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: var_rotate_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM1 %XMM1 %ZMM1
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: var_rotate_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
 ; XOP-LABEL: var_rotate_v4i32:
 ; XOP: # BB#0:
@@ -844,28 +854,24 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: constant_rotate_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_rotate_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 ;
-; XOPAVX1-LABEL: constant_rotate_v2i64:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512VL-LABEL: constant_rotate_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
-; XOPAVX2-LABEL: constant_rotate_v2i64:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: constant_rotate_v2i64:
+; XOP: # BB#0:
+; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: retq
 ;
 ; X32-SSE-LABEL: constant_rotate_v2i64:
 ; X32-SSE: # BB#0:
@@ -951,26 +957,24 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: constant_rotate_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: constant_rotate_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 ;
-; XOPAVX1-LABEL: constant_rotate_v4i32:
-; XOPAVX1: # BB#0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
+; AVX512VL-LABEL: constant_rotate_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
 ;
-; XOPAVX2-LABEL: constant_rotate_v4i32:
-; XOPAVX2: # BB#0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-;
XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: constant_rotate_v4i32: +; XOP: # BB#0: +; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v4i32: ; X32-SSE: # BB#0: @@ -1100,11 +1104,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v8i16: ; XOP: # BB#0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v8i16: @@ -1281,11 +1281,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; ; XOP-LABEL: constant_rotate_v16i8: ; XOP: # BB#0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; ; X32-SSE-LABEL: constant_rotate_v16i8: @@ -1371,12 +1367,18 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlq $50, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v2i64: ; XOP: # BB#0: @@ -1412,12 +1414,18 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v4i32: ; XOP: # BB#0: @@ -1544,11 +1552,19 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v2i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: 
retq ; ; XOP-LABEL: splatconstant_rotate_mask_v2i64: ; XOP: # BB#0: @@ -1595,14 +1611,19 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrld $28, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v4i32: ; XOP: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll index 14215e486bf9..3b65b68352b5 100644 --- a/test/CodeGen/X86/vector-rotate-256.ll +++ b/test/CodeGen/X86/vector-rotate-256.ll @@ -41,21 +41,25 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] ; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -128,21 +132,25 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: var_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX512-NEXT: vpsubd %ymm1, %ymm2, %ymm2 -; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; 
AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: var_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -466,7 +474,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 @@ -483,36 +491,36 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v4i64: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <4 x i64> %a, - %lshr = lshr <4 x i64> %a, + %lshr = lshr <4 x i64> %a, %or = or <4 x i64> %shl, %lshr ret <4 x i64> %or } @@ -549,30 +557,33 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: constant_rotate_v8i32: ; XOPAVX1: # BB#0: -; 
XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v8i32: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <8 x i32> %a, %lshr = lshr <8 x i32> %a, @@ -643,30 +654,18 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v16i16: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm4 -; XOPAVX1-NEXT: vpshlw %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlw %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v16i16: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <16 x i16> %a, %lshr = lshr <16 x i16> %a, @@ -768,32 +767,20 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; XOPAVX1-LABEL: constant_rotate_v32i8: ; XOPAVX1: # BB#0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: constant_rotate_v32i8: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm3, %xmm3 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, @@ -825,12 +812,17 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsllq $14, %ymm0, %ymm1 -; AVX512-NEXT: vpsrlq $50, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v4i64: ; XOPAVX1: # BB#0: @@ -873,12 +865,17 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v8i32: ; XOPAVX1: # BB#0: @@ -1027,11 +1024,18 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v4i64: -; AVX512: # BB#0: -; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64: ; XOPAVX1: # BB#0: @@ -1082,14 
+1086,18 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v8i32: -; AVX512: # BB#0: -; AVX512-NEXT: vpslld $4, %ymm0, %ymm1 -; AVX512-NEXT: vpsrld $28, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32: ; XOPAVX1: # BB#0: diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll new file mode 100644 index 000000000000..fa1b5c1c0cb4 --- /dev/null +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -0,0 +1,831 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VLBW + +; +; Variable Rotates +; + +define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { +; AVX512-LABEL: var_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b64 = sub <8 x i64> , %b + %shl = shl <8 x i64> %a, %b + %lshr = lshr <8 x i64> %a, %b64 + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { +; AVX512-LABEL: var_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %b32 = sub <16 x i32> , %b + %shl = shl <16 x i32> %a, %b + %lshr = lshr <16 x i32> %a, %b32 + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { +; AVX512F-LABEL: var_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5 +; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, 
%ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %b16 = sub <32 x i16> , %b + %shl = shl <32 x i16> %a, %b + %lshr = lshr <32 x i16> %a, %b16 + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { +; AVX512F-LABEL: var_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512F-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; 
AVX512F-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7 +; AVX512VL-NEXT: vpand %ymm9, %ymm7, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm6, %ymm6 +; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm9, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: var_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: var_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1 +; 
AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLBW-NEXT: retq + %b8 = sub <64 x i8> , %b + %shl = shl <64 x i8> %a, %b + %lshr = lshr <64 x i8> %a, %b8 + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Constant Rotates +; + +define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: constant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: constant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw 
{{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: constant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 +; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: 
constant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: constant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512BW-NEXT: 
vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: constant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536] +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} +; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %or = or <8 x i64> %shl, %lshr + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_v16i32(<16 x 
i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %or = or <16 x i32> %shl, %lshr + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %or = or <32 x i16> %shl, %lshr + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: 
vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %or = or <64 x i8> %shl, %lshr + ret <64 x i8> %or +} + +; +; Masked Uniform Constant Rotates +; + +define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v8i64: +; AVX512: # BB#0: +; AVX512-NEXT: vprolq $15, %zmm0, %zmm0 +; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <8 x i64> %a, + %lshr = lshr <8 x i64> %a, + %rmask = and <8 x i64> %lshr, + %lmask = and <8 x i64> %shl, + %or = or <8 x i64> %lmask, %rmask + ret <8 x i64> %or +} + +define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { +; AVX512-LABEL: splatconstant_rotate_mask_v16i32: +; AVX512: # BB#0: +; AVX512-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: retq + %shl = shl <16 x i32> %a, + %lshr = lshr <16 x i32> %a, + %rmask = and <16 x i32> %lshr, + %lmask = and <16 x i32> %shl, + %or = or <16 x i32> %lmask, %rmask + ret <16 x i32> %or +} + +define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: 
vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <32 x i16> %a, + %lshr = lshr <32 x i16> %a, + %rmask = and <32 x i16> %lshr, + %lmask = and <32 x i16> %shl, + %or = or <32 x i16> %lmask, %rmask + ret <32 x i16> %or +} + +define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { +; AVX512F-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33] +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512VLBW-NEXT: retq + %shl = shl <64 x i8> %a, + %lshr = lshr <64 x i8> %a, + %rmask = and <64 x i8> %lshr, + %lmask = and <64 x i8> %shl, + %or = or <64 x i8> %lmask, %rmask + ret <64 x i8> %or +} diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index 09e143ddcd4d..5f2b18fc9c03 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -45,7 +45,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) 
nounwind { ; ; AVX2-LABEL: var_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -66,7 +66,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: var_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 @@ -667,7 +667,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; AVX2-LABEL: splatvar_shift_v4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -687,7 +687,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; XOPAVX2-LABEL: splatvar_shift_v4i64: ; XOPAVX2: # BB#0: -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1700,7 +1700,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-LABEL: splatconstant_shift_v4i64: ; XOPAVX2: # BB#0: ; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057594037927936,72057594037927936,72057594037927936,72057594037927936] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 820178d2d992..5f00e55e225b 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -745,7 +745,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; @@ -755,7 +755,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 30e5661d5485..4a7d25c1376e 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -179,7 +179,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; 
AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -189,7 +189,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -432,7 +432,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; @@ -442,7 +442,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 3bf677aadf19..2fce8a601931 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -89,7 +89,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -99,7 +99,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -235,7 +235,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; @@ -245,7 +245,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll index 5503cfc357e5..5825a56b6f99 100644 --- a/test/CodeGen/X86/vselect-avx.ll +++ b/test/CodeGen/X86/vselect-avx.ll @@ -58,8 +58,8 @@ define void 
@test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: movq (%rdi,%rsi,8), %rax -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5] ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -108,7 +108,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; ; AVX2-LABEL: test3: ; AVX2: ## BB#0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm3 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766] ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 @@ -117,7 +117,7 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] ; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3] ; AVX2-NEXT: vpmulld %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll index 48753ad4fd76..5731b63f3bc1 100644 --- a/test/CodeGen/X86/widen_arith-2.ll +++ b/test/CodeGen/X86/widen_arith-2.ll @@ -16,20 +16,17 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; CHECK-NEXT: .LBB0_2: # %forbody ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %eax -; CHECK-NEXT: shll $3, %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl (%esp), %ecx +; CHECK-NEXT: leal (,%eax,8), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; CHECK-NEXT: psubw %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: pshufb %xmm2, %xmm3 -; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8) +; CHECK-NEXT: movq %xmm3, (%edx,%eax,8) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll index e55d62a461aa..cc6fb27a6293 100644 --- a/test/CodeGen/X86/widen_cast-4.ll +++ b/test/CodeGen/X86/widen_cast-4.ll @@ -16,22 +16,19 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; NARROW-NEXT: .LBB0_2: # %forbody ; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1 ; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %eax -; NARROW-NEXT: shll $3, %eax -; NARROW-NEXT: addl {{[0-9]+}}(%esp), %eax -; NARROW-NEXT: movl %eax, {{[0-9]+}}(%esp) -; NARROW-NEXT: movl (%esp), %ecx +; NARROW-NEXT: leal (,%eax,8), %ecx ; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
NARROW-NEXT: addl %ecx, %edx +; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp) +; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx +; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; NARROW-NEXT: psubw %xmm0, %xmm2 ; NARROW-NEXT: psllw $8, %xmm2 ; NARROW-NEXT: psraw $8, %xmm2 ; NARROW-NEXT: psraw $2, %xmm2 ; NARROW-NEXT: pshufb %xmm1, %xmm2 -; NARROW-NEXT: movq %xmm2, (%edx,%ecx,8) +; NARROW-NEXT: movq %xmm2, (%edx,%eax,8) ; NARROW-NEXT: incl (%esp) ; NARROW-NEXT: .LBB0_1: # %forcond ; NARROW-NEXT: # =>This Inner Loop Header: Depth=1 @@ -54,24 +51,21 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; WIDE-NEXT: .LBB0_2: # %forbody ; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1 ; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %eax -; WIDE-NEXT: shll $3, %eax -; WIDE-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIDE-NEXT: movl %eax, {{[0-9]+}}(%esp) -; WIDE-NEXT: movl (%esp), %ecx +; WIDE-NEXT: leal (,%eax,8), %ecx ; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIDE-NEXT: addl %ecx, %edx +; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx +; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3 +; WIDE-NEXT: pinsrd $1, 4(%ecx,%eax,8), %xmm3 ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, %xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%ecx,8) -; WIDE-NEXT: movd %xmm3, (%edx,%ecx,8) +; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%eax,8) +; WIDE-NEXT: movd %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: .LBB0_1: # %forcond ; WIDE-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/win64-nosse-csrs.ll b/test/CodeGen/X86/win64-nosse-csrs.ll index d1860b721044..29d4f165392e 100644 --- a/test/CodeGen/X86/win64-nosse-csrs.ll +++ b/test/CodeGen/X86/win64-nosse-csrs.ll @@ -20,7 +20,7 @@ entry-block: } ; Function Attrs: nounwind uwtable -define x86_64_win64cc i64 @peach() unnamed_addr #1 { +define win64cc i64 @peach() unnamed_addr #1 { entry-block: %0 = call i64 @banana() ret i64 %0 diff --git a/test/CodeGen/X86/win64_nonvol.ll b/test/CodeGen/X86/win64_nonvol.ll index 8e5f6cec1ab7..e1c615d75f28 100644 --- a/test/CodeGen/X86/win64_nonvol.ll +++ b/test/CodeGen/X86/win64_nonvol.ll @@ -5,7 +5,7 @@ ; Win64 nonvolatile registers get saved. 
; CHECK-LABEL: bar: -define x86_64_win64cc void @bar(i32 %a, i32 %b) { +define win64cc void @bar(i32 %a, i32 %b) { ; CHECK-DAG: pushq %rdi ; CHECK-DAG: pushq %rsi ; CHECK-DAG: movaps %xmm6, diff --git a/test/CodeGen/X86/win64_params.ll b/test/CodeGen/X86/win64_params.ll index a0b552d4d584..6b4273512013 100644 --- a/test/CodeGen/X86/win64_params.ll +++ b/test/CodeGen/X86/win64_params.ll @@ -12,7 +12,7 @@ entry: ret i32 %add } -define x86_64_win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { +define win64cc i32 @f7(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6) nounwind readnone optsize { entry: ; CHECK: movl 48(%rsp), %eax ; CHECK: addl 40(%rsp), %eax diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll index 0faa24ef7290..c7550a467a35 100644 --- a/test/CodeGen/X86/win_chkstk.ll +++ b/test/CodeGen/X86/win_chkstk.ll @@ -51,7 +51,7 @@ entry: ; Make sure we don't call __chkstk or __alloca on non-Windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X32: calll __chkstk ; WIN_X64: callq __chkstk diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll index c9a5fc2b3288..b4b8010ec564 100644 --- a/test/CodeGen/X86/win_coreclr_chkstk.ll +++ b/test/CodeGen/X86/win_coreclr_chkstk.ll @@ -103,7 +103,7 @@ entry: ; Make sure we don't emit the probe sequence if not on windows even if the ; caller has the Win64 calling convention. -define x86_64_win64cc i32 @main4k_win64() nounwind { +define win64cc i32 @main4k_win64() nounwind { entry: ; WIN_X64: movq %gs:16, %rcx ; LINUX-NOT: movq %gs:16, %rcx @@ -115,7 +115,7 @@ entry: declare i32 @bar(i8*) nounwind ; Within-body inline probe expansion -define x86_64_win64cc i32 @main4k_alloca(i64 %n) nounwind { +define win64cc i32 @main4k_alloca(i64 %n) nounwind { entry: ; WIN_X64: callq bar ; WIN_X64: movq %gs:16, [[R:%r.*]] diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index 299190e8a595..e3387a2709cb 100644 --- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -3,7 +3,7 @@ ; Verify that the var arg parameters which are passed in registers are stored ; in home stack slots allocated by the caller and that AP is correctly ; calculated. -define x86_64_win64cc void @average_va(i32 %count, ...) nounwind { +define win64cc void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq ; CHECK: movq %r9, 40(%rsp) @@ -24,7 +24,7 @@ declare void @llvm.va_end(i8*) nounwind ; CHECK-LABEL: f5: ; CHECK: pushq ; CHECK: leaq 56(%rsp), -define x86_64_win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { +define win64cc i8** @f5(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -35,7 +35,7 @@ entry: ; CHECK-LABEL: f4: ; CHECK: pushq ; CHECK: leaq 48(%rsp), -define x86_64_win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { +define win64cc i8** @f4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind { entry: %ap = alloca i8*, align 8 %ap.0 = bitcast i8** %ap to i8* @@ -46,7 +46,7 @@ entry: ; CHECK-LABEL: f3: ; CHECK: pushq ; CHECK: leaq 40(%rsp), -define x86_64_win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) nounwind { +define win64cc i8** @f3(i64 %a0, i64 %a1, i64 %a2, ...) 
nounwind {
 entry:
   %ap = alloca i8*, align 8
   %ap.0 = bitcast i8** %ap to i8*
@@ -62,7 +62,7 @@ entry:
 ; CHECK: movq [[REG_copy1]], 8(%rsp)
 ; CHECK: movq [[REG_copy1]], (%rsp)
 ; CHECK: ret
-define x86_64_win64cc void @copy1(i64 %a0, ...) nounwind {
+define win64cc void @copy1(i64 %a0, ...) nounwind {
 entry:
   %ap = alloca i8*, align 8
   %cp = alloca i8*, align 8
@@ -78,7 +78,7 @@ entry:
 ; CHECK: movq [[REG_copy4]], 8(%rsp)
 ; CHECK: movq [[REG_copy4]], (%rsp)
 ; CHECK: ret
-define x86_64_win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind {
+define win64cc void @copy4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind {
 entry:
   %ap = alloca i8*, align 8
   %cp = alloca i8*, align 8
@@ -96,7 +96,7 @@ entry:
 ; CHECK: movq [[REG_arg4_2]], (%rsp)
 ; CHECK: movl 48(%rsp), %eax
 ; CHECK: ret
-define x86_64_win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind {
+define win64cc i32 @arg4(i64 %a0, i64 %a1, i64 %a2, i64 %a3, ...) nounwind {
 entry:
   %ap = alloca i8*, align 8
   %ap.0 = bitcast i8** %ap to i8*
diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll
new file mode 100644
index 000000000000..39877c14429f
--- /dev/null
+++ b/test/CodeGen/X86/x86-cmov-converter.ll
@@ -0,0 +1,321 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that the x86-cmov-converter optimization transforms CMOV
+;; instructions into branches when it is profitable.
+;; There are 5 cases below:
+;; 1. CmovInHotPath:
+;;    The CMOV depends on the condition and is in the hot path.
+;;    Thus, it is worth transforming.
+;;
+;; 2. CmovNotInHotPath:
+;;    A test similar to (1), except that the CMOV is not in the hot path.
+;;    Thus, it is not worth transforming.
+;;
+;; 3. MaxIndex:
+;;    A maximum-calculation algorithm that looks for the max index;
+;;    calculating the CMOV value is cheaper than calculating the CMOV condition.
+;;    Thus, it is worth transforming.
+;;
+;; 4. MaxValue:
+;;    A maximum-calculation algorithm that looks for the max value;
+;;    calculating the CMOV value is not cheaper than calculating the CMOV condition.
+;;    Thus, it is not worth transforming.
+;;
+;; 5. BinarySearch:
+;;    Usually, a binary-search CMOV is not predictable.
+;;    Thus, it is not worth transforming.
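+;;
+;; As a rough illustration of the trade-off (a sketch using the select from
+;; CmovInHotPath below):
+;;
+;;   int t = (c[i] * a > b) ? 10 : c[i];
+;;
+;; Lowered as a cmov, t stays data-dependent on the compare, so the compare
+;; sits on the loop's critical path; lowered as a compare-and-branch, that
+;; dependence disappears and the code is cheaper whenever the branch predicts
+;; well. The cases above exercise both sides of that cost decision.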
+;; +;; Test was created using the following command line: +;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - +;; Where foo.c is: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;void CmovInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; } +;;} +;; +;; +;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) { +;; for (int i = 0; i < n; i++) { +;; int t = c[i]; +;; if (c[i] * a > b) +;; t = 10; +;; c[i] = t; +;; d[i] /= b; +;; } +;;} +;; +;; +;;int MaxIndex(int n, int *a) { +;; int t = 0; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > a[t]) +;; t = i; +;; } +;; return a[t]; +;;} +;; +;; +;;int MaxValue(int n, int *a) { +;; int t = a[0]; +;; for (int i = 1; i < n; i++) { +;; if (a[i] > t) +;; t = a[i]; +;; } +;; return t; +;;} +;; +;;typedef struct Node Node; +;;struct Node { +;; unsigned Val; +;; Node *Right; +;; Node *Left; +;;}; +;; +;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) { +;; while (Curr->Val > Next->Val) { +;; Curr = Next; +;; if (Mask & (0x1 << Curr->Val)) +;; Next = Curr->Right; +;; else +;; Next = Curr->Left; +;; } +;; return Curr->Val; +;;} +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%struct.Node = type { i32, %struct.Node*, %struct.Node* } + +; CHECK-LABEL: CmovInHotPath +; CHECK-NOT: cmov +; CHECK: jg + +define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 0 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. = select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: CmovNotInHotPath +; CHECK: cmovg + +define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 { +entry: + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %a + %cmp3 = icmp sgt i32 %mul, %b + %. 
= select i1 %cmp3, i32 10, i32 %0 + store i32 %., i32* %arrayidx, align 4 + %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv + %1 = load i32, i32* %arrayidx7, align 4 + %div = sdiv i32 %1, %b + store i32 %div, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: MaxIndex +; CHECK-NOT: cmov +; CHECK: jg + +define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %cmp14 = icmp sgt i32 %n, 1 + br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = sext i32 %i.0.t.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %t.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %t.0.lcssa + %0 = load i32, i32* %arrayidx5, align 4 + ret i32 %0 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %t.015 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1 + %2 = load i32, i32* %arrayidx2, align 4 + %cmp3 = icmp sgt i32 %1, %2 + %3 = trunc i64 %indvars.iv to i32 + %i.0.t.0 = select i1 %cmp3, i32 %3, i32 %t.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: MaxValue +; CHECK-NOT: jg +; CHECK: cmovg + +define i32 @MaxValue(i32 %n, i32* nocapture readonly %a) #0 { +entry: + %0 = load i32, i32* %a, align 4 + %cmp13 = icmp sgt i32 %n, 1 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ] + ret i32 %t.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ] + %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ] + %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx1, align 4 + %cmp2 = icmp sgt i32 %1, %t.014 + %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; CHECK-LABEL: BinarySearch +; CHECK: cmov + +define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 { +entry: + %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0 + %0 = load i32, i32* %Val8, align 8 + %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0 + %1 = load i32, i32* %Val19, align 8 + %cmp10 = icmp ugt i32 %0, %1 + br i1 %cmp10, label %while.body, label %while.end + +while.body: ; preds = %entry, %while.body + %2 = phi i32 [ %4, %while.body ], [ 
%1, %entry ]
+  %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ]
+  %shl = shl i32 1, %2
+  %and = and i32 %shl, %Mask
+  %tobool = icmp eq i32 %and, 0
+  %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2
+  %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1
+  %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right
+  %3 = load %struct.Node*, %struct.Node** %Left.sink, align 8
+  %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0
+  %4 = load i32, i32* %Val1, align 8
+  %cmp = icmp ugt i32 %2, %4
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ]
+  ret i32 %.lcssa
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; The following test checks that the x86-cmov-converter optimization transforms
+;; CMOV instructions into branches correctly.
+;;
+;; MBB:
+;;   cond = cmp ...
+;;   v1 = CMOVgt t1, f1, cond
+;;   v2 = CMOVle s1, f2, cond
+;;
+;; Where: t1 = 11, f1 = 22, f2 = a
+;;
+;; After CMOV transformation
+;; -------------------------
+;; MBB:
+;;   cond = cmp ...
+;;   ja %SinkMBB
+;;
+;; FalseMBB:
+;;   jmp %SinkMBB
+;;
+;; SinkMBB:
+;;   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
+;;   %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For a CMOV with the opposite CC,
+;;                                          ; switch the true value with the
+;;                                          ; false value; a Phi instruction
+;;                                          ; cannot use a previous Phi
+;;                                          ; instruction's result.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-LABEL: Transform
+; CHECK-NOT: cmov
+; CHECK: divl [[a:%[0-9a-z]*]]
+; CHECK: cmpl [[a]], %eax
+; CHECK: movl $11, [[s1:%[0-9a-z]*]]
+; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
+; CHECK: ja [[SinkBB:.*]]
+; CHECK: [[FalseBB:.*]]:
+; CHECK: movl $22, [[s1]]
+; CHECK: movl $22, [[s2]]
+; CHECK: [[SinkBB]]:
+; CHECK: ja
+
+define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
+entry:
+  %cmp10 = icmp ugt i32 0, %n
+  br i1 %cmp10, label %while.body, label %while.end
+
+while.body:                                       ; preds = %entry, %while.body
+  %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ]
+  %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i
+  %x = load i32, i32* %arr_i, align 4
+  %div = udiv i32 %x, %a
+  %cond = icmp ugt i32 %div, %a
+  %condOpp = icmp ule i32 %div, %a
+  %s1 = select i1 %cond, i32 11, i32 22
+  %s2 = select i1 %condOpp, i32 %s1, i32 %a
+  %sum = urem i32 %s1, %s2
+  store i32 %sum, i32* %arr_i, align 4
+  %i_inc = add i32 %i, 1
+  %cmp = icmp ugt i32 %i_inc, %n
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+attributes #0 = {"target-cpu"="x86-64"}
diff --git a/test/CodeGen/XCore/varargs.ll b/test/CodeGen/XCore/varargs.ll
index 2e364b275610..b6f716d66c9d 100644
--- a/test/CodeGen/XCore/varargs.ll
+++ b/test/CodeGen/XCore/varargs.ll
@@ -26,10 +26,10 @@ entry:
 ; CHECK-LABEL: test_vararg
 ; CHECK: extsp 6
 ; CHECK: stw lr, sp[1]
-; CHECK: stw r3, sp[6]
-; CHECK: stw r0, sp[3]
-; CHECK: stw r1, sp[4]
-; CHECK: stw r2, sp[5]
+; CHECK-DAG: stw r3, sp[6]
+; CHECK-DAG: stw r0, sp[3]
+; CHECK-DAG: stw r1, sp[4]
+; CHECK-DAG: stw r2, sp[5]
 ; CHECK: ldaw r0, sp[3]
 ; CHECK: stw r0, sp[2]
 %list = alloca i8*, align 4
diff --git a/test/DebugInfo/Generic/namespace.ll b/test/DebugInfo/Generic/namespace.ll
index 983e20db4111..5a8f65263192 100644
--- a/test/DebugInfo/Generic/namespace.ll
+++ b/test/DebugInfo/Generic/namespace.ll
@@ -54,17 +54,14 @@
;
CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; This is a bug, it should be in F2 but it inherits the file from its -; enclosing scope -; CHECK-NEXT: DW_AT_decl_file{{.*}}stdin +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(15) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS2]]}) ; CHECK: NULL ; CHECK-NOT: NULL ; CHECK: DW_TAG_imported_module -; Same bug as above, this should be F2 -; CHECK-NEXT: DW_AT_decl_file{{.*}}debug-info-namespace.cpp +; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:.*]]) ; CHECK-NEXT: DW_AT_decl_line{{.*}}(18) ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]}) ; CHECK-NOT: NULL @@ -320,29 +317,29 @@ attributes #1 = { nounwind readnone } !31 = !DIGlobalVariable(name: "i", linkageName: "_ZN1A1B1iE", line: 20, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !32 = !DIGlobalVariable(name: "var_fwd", linkageName: "_ZN1A1B7var_fwdE", line: 44, isLocal: false, isDefinition: true, scope: !6, file: !18, type: !13) !33 = !{!34, !35, !36, !37, !40, !41, !42, !43, !44, !45, !47, !48, !49, !51, !54, !55, !56} -!34 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 15, scope: !7, entity: !6) -!35 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 18, scope: !0, entity: !7) -!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, name: "E", scope: !0, entity: !7) -!37 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 23, scope: !38, entity: !6) +!34 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 15, scope: !7, entity: !6) +!35 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 18, scope: !0, entity: !7) +!36 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 19, name: "E", scope: !0, entity: !7) +!37 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 23, scope: !38, entity: !6) !38 = distinct !DILexicalBlock(line: 22, column: 10, file: !5, scope: !39) !39 = distinct !DILexicalBlock(line: 22, column: 7, file: !5, scope: !21) -!40 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 26, scope: !21, entity: !7) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 27, scope: !21, entity: !4) -!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 28, scope: !21, entity: !8) -!43 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 29, scope: !21, entity: !14) -!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 30, scope: !21, entity: !31) -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 31, scope: !21, entity: !46) +!40 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !5, line: 26, scope: !21, entity: !7) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 27, scope: !21, entity: !4) +!42 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 28, scope: !21, entity: !8) +!43 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 29, scope: !21, entity: !14) +!44 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 30, scope: !21, entity: !31) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 31, scope: !21, entity: !46) !46 = !DIDerivedType(tag: DW_TAG_typedef, name: "baz", line: 7, file: !5, scope: !6, baseType: !8) -!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 32, name: "X", scope: !21, entity: !7) -!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 33, name: "Y", scope: !21, entity: !47) -!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 34, scope: 
!21, entity: !50) +!47 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 32, name: "X", scope: !21, entity: !7) +!48 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 33, name: "Y", scope: !21, entity: !47) +!49 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 34, scope: !21, entity: !50) !50 = !DIGlobalVariable(name: "var_decl", linkageName: "_ZN1A1B8var_declE", line: 8, isLocal: false, isDefinition: false, scope: !6, file: !18, type: !13) -!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 35, scope: !21, entity: !52) +!51 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 35, scope: !21, entity: !52) !52 = !DISubprogram(name: "func_decl", linkageName: "_ZN1A1B9func_declEv", line: 9, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, file: !5, scope: !6, type: !19, variables: !53) !53 = !{} ; previously: invalid DW_TAG_base_type -!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 36, scope: !21, entity: !32) -!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 37, scope: !21, entity: !26) -!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !7, entity: !31) +!54 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 36, scope: !21, entity: !32) +!55 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 37, scope: !21, entity: !26) +!56 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !5, line: 42, scope: !7, entity: !31) !57 = !{i32 2, !"Dwarf Version", i32 2} !58 = !{i32 2, !"Debug Info Version", i32 3} !59 = !{!"clang version 3.6.0 "} diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test index 1887af2e8268..14fe4bb352f6 100644 --- a/test/DebugInfo/PDB/pdbdump-headers.test +++ b/test/DebugInfo/PDB/pdbdump-headers.test @@ -91,189 +91,189 @@ ALL-NEXT: Mod 0001 | `* Linker *`: ALL: Types (TPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 75 records -ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 205956] -ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 163561] +ALL-NEXT: 0x1000 | LF_ARGLIST [size = 8, hash = 0x32484] +ALL-NEXT: 0x1001 | LF_PROCEDURE [size = 16, hash = 0x27EE9] ALL-NEXT: return type = 0x0074 (int), # args = 0, param list = 0x1000 ALL-NEXT: calling conv = cdecl, options = None -ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 59811] +ALL-NEXT: 0x1002 | LF_FIELDLIST [size = 76, hash = 0xE9A3] ALL-NEXT: - LF_ENUMERATE [apartment = 1] ALL-NEXT: - LF_ENUMERATE [single = 2] ALL-NEXT: - LF_ENUMERATE [free = 3] ALL-NEXT: - LF_ENUMERATE [neutral = 4] ALL-NEXT: - LF_ENUMERATE [both = 5] -ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] `__vc_attributes::threadingAttribute::threading_e` +ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 0x32D6F] `__vc_attributes::threadingAttribute::threading_e` ALL-NEXT: unique name: `.?AW4threading_e@threadingAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1002, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 0x3FF9] `__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 
247078] +ALL-NEXT: 0x1005 | LF_POINTER [size = 12, hash = 0x3C526] ALL-NEXT: referent = 0x1004, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 194342] +ALL-NEXT: 0x1006 | LF_ARGLIST [size = 12, hash = 0x2F726] ALL-NEXT: 0x1003: `__vc_attributes::threadingAttribute::threading_e` -ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 254156] +ALL-NEXT: 0x1007 | LF_MFUNCTION [size = 28, hash = 0x3E0CC] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1006 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 194536] +ALL-NEXT: 0x1008 | LF_MFUNCTION [size = 28, hash = 0x2F7E8] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1004, this type = 0x1005, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 167492] +ALL-NEXT: 0x1009 | LF_METHODLIST [size = 20, hash = 0x28E44] ALL-NEXT: - Method [type = 0x1007, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1008, vftable offset = -1, attrs = public] -ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 185421] +ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 0x2D44D] ALL-NEXT: - LF_NESTTYPE [name = `threading_e`, parent = 0x1003] ALL-NEXT: - LF_METHOD [name = `threadingAttribute`, # overloads = 2, overload list = 0x1009] ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x1003, offset = 0, attrs = public] -ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 0x1D2F4] `__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x100A ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 261871] +ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 0x3FEEF] ALL-NEXT: - LF_ENUMERATE [native = 0] ALL-NEXT: - LF_ENUMERATE [com = 1] ALL-NEXT: - LF_ENUMERATE [managed = 2] -ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] `__vc_attributes::event_receiverAttribute::type_e` +ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 0x305E7] `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_receiverAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 0xBBB8] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 251486] +ALL-NEXT: 0x100F | LF_POINTER [size = 12, hash = 0x3D65E] ALL-NEXT: referent = 0x100E, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 134580] +ALL-NEXT: 0x1010 | LF_ARGLIST [size = 16, hash = 0x20DB4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: 0x0030 (bool): `bool` -ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 148190] +ALL-NEXT: 0x1011 | LF_MFUNCTION [size = 28, hash = 0x242DE] ALL-NEXT: return type = 0x0003 (void), # 
args = 2, param list = 0x1010 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1012 | LF_ARGLIST [size = 12, hash = 113636] +ALL-NEXT: 0x1012 | LF_ARGLIST [size = 12, hash = 0x1BBE4] ALL-NEXT: 0x100D: `__vc_attributes::event_receiverAttribute::type_e` -ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 53336] +ALL-NEXT: 0x1013 | LF_MFUNCTION [size = 28, hash = 0xD058] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1012 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 55779] +ALL-NEXT: 0x1014 | LF_MFUNCTION [size = 28, hash = 0xD9E3] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x100E, this type = 0x100F, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 220695] +ALL-NEXT: 0x1015 | LF_METHODLIST [size = 28, hash = 0x35E17] ALL-NEXT: - Method [type = 0x1011, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1013, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1014, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 198114] +ALL-NEXT: 0x1016 | LF_FIELDLIST [size = 96, hash = 0x305E2] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x100D] ALL-NEXT: - LF_METHOD [name = `event_receiverAttribute`, # overloads = 3, overload list = 0x1015] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x100D, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `layout_dependent`, Type = 0x0030 (bool), offset = 4, attrs = public] -ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 0x244FE] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1016 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 81128] +ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 0x13CE8] ALL-NEXT: - LF_ENUMERATE [never = 0] ALL-NEXT: - LF_ENUMERATE [allowed = 1] ALL-NEXT: - LF_ENUMERATE [always = 2] -ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] `__vc_attributes::aggregatableAttribute::type_e` +ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 0xEAFE] `__vc_attributes::aggregatableAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@aggregatableAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1018, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 0x350A1] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 174209] +ALL-NEXT: 0x101B | LF_POINTER [size = 12, hash = 0x2A881] ALL-NEXT: referent = 0x101A, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 159978] +ALL-NEXT: 0x101C | LF_ARGLIST [size = 12, hash = 0x270EA] ALL-NEXT: 0x1019: `__vc_attributes::aggregatableAttribute::type_e` -ALL-NEXT: 
0x101D | LF_MFUNCTION [size = 28, hash = 249504] +ALL-NEXT: 0x101D | LF_MFUNCTION [size = 28, hash = 0x3CEA0] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x101C ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 141941] +ALL-NEXT: 0x101E | LF_MFUNCTION [size = 28, hash = 0x22A75] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x101F | LF_METHODLIST [size = 20, hash = 238785] +ALL-NEXT: 0x101F | LF_METHODLIST [size = 20, hash = 0x3A4C1] ALL-NEXT: - Method [type = 0x101D, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x101E, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 6214] +ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 0x1846] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1019] ALL-NEXT: - LF_METHOD [name = `aggregatableAttribute`, # overloads = 2, overload list = 0x101F] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1019, offset = 0, attrs = public] -ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 0x172D7] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1020 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] `__vc_attributes::event_sourceAttribute::type_e` +ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 0x24F99] `__vc_attributes::event_sourceAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 135589] +ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 0x211A5] ALL-NEXT: - LF_ENUMERATE [speed = 0] ALL-NEXT: - LF_ENUMERATE [size = 1] -ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] `__vc_attributes::event_sourceAttribute::optimize_e` +ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 0x11E9D] `__vc_attributes::event_sourceAttribute::optimize_e` ALL-NEXT: unique name: `.?AW4optimize_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1023, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 0x17900] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 254299] +ALL-NEXT: 0x1026 | LF_POINTER [size = 12, hash = 0x3E15B] ALL-NEXT: referent = 0x1025, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 17744] +ALL-NEXT: 0x1027 | LF_ARGLIST [size = 12, hash = 0x4550] ALL-NEXT: 0x1022: `__vc_attributes::event_sourceAttribute::type_e` -ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 239514] +ALL-NEXT: 0x1028 | LF_MFUNCTION [size = 28, hash = 0x3A79A] ALL-NEXT: return type = 0x0003 (void), # args = 1, param 
list = 0x1027 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 173189] +ALL-NEXT: 0x1029 | LF_MFUNCTION [size = 28, hash = 0x2A485] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x1025, this type = 0x1026, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 130544] +ALL-NEXT: 0x102A | LF_METHODLIST [size = 20, hash = 0x1FDF0] ALL-NEXT: - Method [type = 0x1028, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1029, vftable offset = -1, attrs = public] -ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 204437] +ALL-NEXT: 0x102B | LF_FIELDLIST [size = 128, hash = 0x31E95] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1022] ALL-NEXT: - LF_NESTTYPE [name = `optimize_e`, parent = 0x1024] ALL-NEXT: - LF_METHOD [name = `event_sourceAttribute`, # overloads = 2, overload list = 0x102A] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1022, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `optimize`, Type = 0x1024, offset = 4, attrs = public] ALL-NEXT: - LF_MEMBER [name = `decorate`, Type = 0x0030 (bool), offset = 8, attrs = public] -ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 0x3A3E0] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x102B ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 144673] +ALL-NEXT: 0x102D | LF_FIELDLIST [size = 92, hash = 0x23521] ALL-NEXT: - LF_ENUMERATE [dll = 1] ALL-NEXT: - LF_ENUMERATE [exe = 2] ALL-NEXT: - LF_ENUMERATE [service = 3] ALL-NEXT: - LF_ENUMERATE [unspecified = 4] ALL-NEXT: - LF_ENUMERATE [EXE = 2] ALL-NEXT: - LF_ENUMERATE [SERVICE = 3] -ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] `__vc_attributes::moduleAttribute::type_e` +ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 0x1C1CF] `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@moduleAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x102D, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 0x302BA] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 256035] +ALL-NEXT: 0x1030 | LF_POINTER [size = 12, hash = 0x3E823] ALL-NEXT: referent = 0x102F, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1031 | LF_MODIFIER [size = 12, hash = 101096] +ALL-NEXT: 0x1031 | LF_MODIFIER [size = 12, hash = 0x18AE8] ALL-NEXT: referent = 0x0070 (char), modifiers = const -ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 231280] +ALL-NEXT: 0x1032 | LF_POINTER [size = 12, hash = 0x38770] ALL-NEXT: referent = 0x1031, mode = pointer, opts = None, kind = ptr32 -ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 52156] +ALL-NEXT: 0x1033 | LF_ARGLIST [size = 68, hash = 0xCBBC] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: 0x1032: `const char*` 
ALL-NEXT: 0x1032: `const char*` @@ -289,25 +289,25 @@ ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x0030 (bool): `bool` ALL-NEXT: 0x1032: `const char*` ALL-NEXT: 0x1032: `const char*` -ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 48854] +ALL-NEXT: 0x1034 | LF_MFUNCTION [size = 28, hash = 0xBED6] ALL-NEXT: return type = 0x0003 (void), # args = 15, param list = 0x1033 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 170035] +ALL-NEXT: 0x1035 | LF_ARGLIST [size = 12, hash = 0x29833] ALL-NEXT: 0x102E: `__vc_attributes::moduleAttribute::type_e` -ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 177041] +ALL-NEXT: 0x1036 | LF_MFUNCTION [size = 28, hash = 0x2B391] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1035 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 102745] +ALL-NEXT: 0x1037 | LF_MFUNCTION [size = 28, hash = 0x19159] ALL-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x1000 ALL-NEXT: class type = 0x102F, this type = 0x1030, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 16947] +ALL-NEXT: 0x1038 | LF_METHODLIST [size = 28, hash = 0x4233] ALL-NEXT: - Method [type = 0x1034, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1036, vftable offset = -1, attrs = public] ALL-NEXT: - Method [type = 0x1037, vftable offset = -1, attrs = public] -ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 183703] +ALL-NEXT: 0x1039 | LF_FIELDLIST [size = 356, hash = 0x2CD97] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x102E] ALL-NEXT: - LF_METHOD [name = `moduleAttribute`, # overloads = 3, overload list = 0x1038] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x102E, offset = 0, attrs = public] @@ -325,11 +325,11 @@ ALL-NEXT: - LF_MEMBER [name = `hidden`, Type = 0x0030 (bool), offset ALL-NEXT: - LF_MEMBER [name = `restricted`, Type = 0x0030 (bool), offset = 45, attrs = public] ALL-NEXT: - LF_MEMBER [name = `custom`, Type = 0x1032, offset = 48, attrs = public] ALL-NEXT: - LF_MEMBER [name = `resource_name`, Type = 0x1032, offset = 52, attrs = public] -ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 0x180F4] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1039 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 35693] +ALL-NEXT: 0x103B | LF_FIELDLIST [size = 756, hash = 0x8B6D] ALL-NEXT: - LF_ENUMERATE [eAnyUsage = 0] ALL-NEXT: - LF_ENUMERATE [eCoClassUsage = 1] ALL-NEXT: - LF_ENUMERATE [eCOMInterfaceUsage = 2] @@ -360,58 +360,58 @@ ALL-NEXT: - LF_ENUMERATE [eModuleUsage = 16777216] ALL-NEXT: - LF_ENUMERATE [eIllegalUsage = 33554432] ALL-NEXT: - LF_ENUMERATE [eAsynchronousUsage = 67108864] ALL-NEXT: - LF_ENUMERATE [eAnyIDLUsage = 4161535] -ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] `__vc_attributes::helper_attributes::usageAttribute::usage_e` +ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 0x29D40] `__vc_attributes::helper_attributes::usageAttribute::usage_e` ALL-NEXT: unique name: `.?AW4usage_e@usageAttribute@helper_attributes@__vc_attributes@@` 
ALL-NEXT: field list: 0x103B, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 0x31B78] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 139292] +ALL-NEXT: 0x103E | LF_POINTER [size = 12, hash = 0x2201C] ALL-NEXT: referent = 0x103D, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 49018] +ALL-NEXT: 0x103F | LF_ARGLIST [size = 12, hash = 0xBF7A] ALL-NEXT: 0x0075 (unsigned): `unsigned` -ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 43821] +ALL-NEXT: 0x1040 | LF_MFUNCTION [size = 28, hash = 0xAB2D] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x103F ALL-NEXT: class type = 0x103D, this type = 0x103E, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 202555] +ALL-NEXT: 0x1041 | LF_FIELDLIST [size = 60, hash = 0x3173B] ALL-NEXT: - LF_NESTTYPE [name = `usage_e`, parent = 0x103C] ALL-NEXT: - LF_ONEMETHOD [name = `usageAttribute`] ALL-NEXT: type = 0x1040, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x0075 (unsigned), offset = 0, attrs = public] -ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 0x284B0] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1041 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 215835] +ALL-NEXT: 0x1043 | LF_FIELDLIST [size = 68, hash = 0x34B1B] ALL-NEXT: - LF_ENUMERATE [eBoolean = 0] ALL-NEXT: - LF_ENUMERATE [eInteger = 1] ALL-NEXT: - LF_ENUMERATE [eFloat = 2] ALL-NEXT: - LF_ENUMERATE [eDouble = 3] -ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` +ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 0x22D21] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@v1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x1043, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 0xCD36] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: ALL-NEXT: options: forward ref | has unique name -ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 44186] +ALL-NEXT: 0x1046 | LF_POINTER [size = 12, hash = 0xAC9A] ALL-NEXT: referent = 0x1045, mode = pointer, opts = const, kind = ptr32 -ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 103930] +ALL-NEXT: 0x1047 | LF_ARGLIST [size = 12, hash = 0x195FA] ALL-NEXT: 0x1044: `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` -ALL-NEXT: 0x1048 | LF_MFUNCTION 
[size = 28, hash = 110942] +ALL-NEXT: 0x1048 | LF_MFUNCTION [size = 28, hash = 0x1B15E] ALL-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1047 ALL-NEXT: class type = 0x1045, this type = 0x1046, this adjust = 0 ALL-NEXT: calling conv = thiscall, options = constructor -ALL-NEXT: 0x1049 | LF_FIELDLIST [size = 64, hash = 17991] +ALL-NEXT: 0x1049 | LF_FIELDLIST [size = 64, hash = 0x4647] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1044] ALL-NEXT: - LF_ONEMETHOD [name = `v1_alttypeAttribute`] ALL-NEXT: type = 0x1048, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1044, offset = 0, attrs = public] -ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 0x340DF] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: , base list: , field list: 0x1049 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -421,29 +421,29 @@ ALL: Hash Adjusters: ALL: Types (IPI Stream) ALL-NEXT: ============================================================ ALL-NEXT: Showing 15 records -ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7186] +ALL-NEXT: 0x1000 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C12] ALL-NEXT: udt = 0x100B, mod = 1, file = 1, line = 481 -ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7198] +ALL-NEXT: 0x1001 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C1E] ALL-NEXT: udt = 0x1017, mod = 1, file = 1, line = 194 -ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7180] +ALL-NEXT: 0x1002 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C0C] ALL-NEXT: udt = 0x1021, mod = 1, file = 1, line = 603 -ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7191] +ALL-NEXT: 0x1003 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C17] ALL-NEXT: udt = 0x102C, mod = 1, file = 1, line = 1200 -ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7201] +ALL-NEXT: 0x1004 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C21] ALL-NEXT: udt = 0x103A, mod = 1, file = 1, line = 540 -ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7241] +ALL-NEXT: 0x1005 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C49] ALL-NEXT: udt = 0x1042, mod = 1, file = 1, line = 108 -ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 7249] +ALL-NEXT: 0x1006 | LF_UDT_MOD_SRC_LINE [size = 20, hash = 0x1C51] ALL-NEXT: udt = 0x104A, mod = 1, file = 1, line = 96 -ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 80727] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs -ALL-NEXT: 0x1008 | LF_STRING_ID [size = 76, hash = 154177] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe -ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 75189] ID: , String: empty.cpp -ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 253662] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb -ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 193467] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows -ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 222705] +ALL-NEXT: 0x1007 | LF_STRING_ID [size = 48, hash = 0x13B57] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs +ALL-NEXT: 0x1008 | 
LF_STRING_ID [size = 76, hash = 0x25A41] ID: , String: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe +ALL-NEXT: 0x1009 | LF_STRING_ID [size = 20, hash = 0x125B5] ID: , String: empty.cpp +ALL-NEXT: 0x100A | LF_STRING_ID [size = 56, hash = 0x3DEDE] ID: , String: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb +ALL-NEXT: 0x100B | LF_STRING_ID [size = 252, hash = 0x2F3BB] ID: , String: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows +ALL-NEXT: 0x100C | LF_SUBSTR_LIST [size = 12, hash = 0x365F1] ALL-NEXT: 0x100B: `-Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows` -ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 186099] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X -ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 257108] +ALL-NEXT: 0x100D | LF_STRING_ID [size = 96, hash = 0x2D6F3] ID: 0x100C, String: Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X +ALL-NEXT: 0x100E | LF_BUILDINFO [size = 28, hash = 0x3EC54] ALL-NEXT: 0x1007: `d:\src\llvm\test\DebugInfo\PDB\Inputs` ALL-NEXT: 0x1008: `C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe` ALL-NEXT: 0x1009: `empty.cpp` @@ -472,13 +472,13 @@ ALL-NEXT: frontend = 18.0.31101.0, backend = 18.0.31101.0 ALL-NEXT: flags = security checks ALL-NEXT: 120 | S_GPROC32 [size = 44] `main` ALL-NEXT: parent = 0, end = 196, addr = 0001:0016, code size = 10 -ALL-NEXT: debug start = 3, debug end = 8, flags = has fp +ALL-NEXT: type = `0x1001 (int ())`, debug start = 3, debug end = 8, flags = has fp ALL-NEXT: 164 | S_FRAMEPROC [size = 32] ALL-NEXT: size = 0, padding size = 0, offset to padding = 0 ALL-NEXT: bytes of callee saved registers = 0, exception handler addr = 0000:0000 ALL-NEXT: flags = has async eh | opt speed ALL-NEXT: 196 | S_END [size = 4] -ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `4110` +ALL-NEXT: 200 | S_BUILDINFO [size = 8] BuildId = `0x100E` ALL-NEXT: Mod 0001 | `* Linker *`: ALL-NEXT: 4 | S_OBJNAME [size = 20] sig=0, `* Linker *` ALL-NEXT: 24 | S_COMPILE3 [size = 48] diff --git a/test/DebugInfo/X86/DIModule.ll b/test/DebugInfo/X86/DIModule.ll index 1fe7f0c5fabe..eacdfe10f53b 100644 --- a/test/DebugInfo/X86/DIModule.ll +++ b/test/DebugInfo/X86/DIModule.ll @@ -18,7 +18,7 @@ target triple = "x86_64-apple-macosx" !1 = !DIFile(filename: "/llvm/tools/clang/test/Modules/", directory: "/") !2 = !{} !3 = !{!4} -!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, line: 5) +!4 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !5, file: !1, line: 5) !5 = !DIModule(scope: null, name: "DebugModule", configMacros: "-DMODULES=0", includePath: "/llvm/tools/clang/test/Modules/Inputs", isysroot: "/") !6 = !{i32 2, !"Dwarf Version", i32 4} !7 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/test/DebugInfo/X86/DIModuleContext.ll b/test/DebugInfo/X86/DIModuleContext.ll index a63fd0f373cd..02d4441c8234 100644 --- a/test/DebugInfo/X86/DIModuleContext.ll +++ b/test/DebugInfo/X86/DIModuleContext.ll @@ -24,7 +24,7 @@ target triple = "x86_64-apple-macosx" !4 = !{} !5 = !{!0} !6 = !{!7} -!7 = 
!DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, line: 11) +!7 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !8, file: !3, line: 11) !8 = !DIModule(scope: null, name: "Module", includePath: ".", isysroot: "/") !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64) !10 = !DICompositeType(tag: DW_TAG_structure_type, name: "s", scope: !8, file: !3, line: 1, flags: DIFlagFwdDecl) diff --git a/test/DebugInfo/X86/fission-inline.ll b/test/DebugInfo/X86/fission-inline.ll index 45e0127294d1..eadcd15b2f21 100644 --- a/test/DebugInfo/X86/fission-inline.ll +++ b/test/DebugInfo/X86/fission-inline.ll @@ -110,7 +110,7 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !16 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed) !17 = !DISubprogram(name: "f2", linkageName: "_ZN3foo2f2IiEEvv", line: 10, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: false, scopeLine: 10, file: !1, scope: !4, type: !12, templateParams: !14) !18 = !{!19} -!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 19, scope: !20, entity: !4) +!19 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 19, scope: !20, entity: !4) !20 = distinct !DILexicalBlock(line: 16, column: 13, file: !1, scope: !21) !21 = distinct !DILexicalBlock(line: 16, column: 7, file: !1, scope: !10) !22 = !{i32 2, !"Dwarf Version", i32 4} diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll index 533ab838a732..df1d113538ea 100644 --- a/test/DebugInfo/X86/gnu-public-names.ll +++ b/test/DebugInfo/X86/gnu-public-names.ll @@ -345,9 +345,9 @@ attributes #1 = { nounwind readnone } !42 = !DINamespace(scope: !43) !43 = !DINamespace(name: "outer", scope: null) !44 = !{!45, !47} -!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, line: 34) +!45 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !46, file:!3, line: 34) !46 = !DIGlobalVariable(name: "global_namespace_variable_decl", linkageName: "_ZN2ns30global_namespace_variable_declE", scope: !18, file: !3, line: 28, type: !9, isLocal: false, isDefinition: false) -!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, line: 43) +!47 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !43, entity: !42, file: !3, line: 43) !48 = !{i32 2, !"Dwarf Version", i32 4} !49 = !{i32 2, !"Debug Info Version", i32 3} !50 = !{!"clang version 3.7.0 (trunk 234897) (llvm/trunk 234911)"} diff --git a/test/DebugInfo/X86/lexical-block-file-inline.ll b/test/DebugInfo/X86/lexical-block-file-inline.ll index 0f85a5573f03..9f040f41ec5e 100644 --- a/test/DebugInfo/X86/lexical-block-file-inline.ll +++ b/test/DebugInfo/X86/lexical-block-file-inline.ll @@ -134,7 +134,7 @@ attributes #2 = { nounwind } !8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !9, file: !9, line: 6, type: !5, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !DIFile(filename: "test.h", directory: "/") !10 = !{!11} -!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !14, line: 1) +!11 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !12, entity: !14, file: !1, line: 1) !12 = !DILexicalBlockFile(scope: !13, file: !9, discriminator: 0) !13 = distinct !DILexicalBlock(scope: !4, file: !1, line: 3) !14 = !DINamespace(name: "N", scope: 
null) diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll index a8278c9dcf83..c94572f7d23c 100644 --- a/test/DebugInfo/X86/pr19307.ll +++ b/test/DebugInfo/X86/pr19307.ll @@ -105,26 +105,26 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n !19 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", line: 65, file: !20, scope: !10, baseType: !8) !20 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc") !21 = !{!22, !26, !29, !33, !38, !41} -!22 = !DIImportedEntity(tag: DW_TAG_imported_module, line: 57, scope: !23, entity: !25) +!22 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !1, line: 57, scope: !23, entity: !25) !23 = !DINamespace(name: "__gnu_debug", scope: null) !24 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", directory: "/llvm_cmake_gcc") !25 = !DINamespace(name: "__debug", scope: !10) -!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 66, scope: !10, entity: !27) +!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 66, scope: !10, entity: !27) !27 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", line: 106, file: !5, baseType: !28) !28 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", line: 95, file: !5, baseType: !4) -!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 141, scope: !10, entity: !30) +!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 141, scope: !10, entity: !30) !30 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", line: 141, file: !31, baseType: !32) !31 = !DIFile(filename: "/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc") !32 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned) -!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 42, scope: !34, entity: !36) +!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 42, scope: !34, entity: !36) !34 = !DINamespace(name: "__gnu_cxx", scope: null) !35 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", directory: "/llvm_cmake_gcc") !36 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", line: 155, file: !11, scope: !10, baseType: !37) !37 = !DIBasicType(tag: DW_TAG_base_type, name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) -!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 43, scope: !34, entity: !39) +!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 43, scope: !34, entity: !39) !39 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", line: 156, file: !11, scope: !10, baseType: !40) !40 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed) -!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 55, scope: !10, entity: !6) +!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 55, scope: !10, entity: !6) !42 = !{i32 2, !"Dwarf Version", i32 4} !43 = !{i32 2, !"Debug Info Version", i32 3} !44 = !{!"clang version 3.5.0 (209308)"} diff --git a/test/DllTool/coff-exports.def b/test/DllTool/coff-exports.def new file mode 100644 index 000000000000..0226886a523c --- /dev/null +++ b/test/DllTool/coff-exports.def @@ -0,0 +1,13 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj 
-coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-import-file +; CHECK: Type: code +; CHECK: Name type: name +; CHECK: Symbol: __imp_TestFunction +; CHECK: Symbol: TestFunction diff --git a/test/DllTool/coff-weak-exports.def b/test/DllTool/coff-weak-exports.def new file mode 100644 index 000000000000..511d947d8395 --- /dev/null +++ b/test/DllTool/coff-weak-exports.def @@ -0,0 +1,19 @@ +; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a +; RUN: llvm-readobj -coff-exports %t.a | FileCheck %s + +LIBRARY test.dll +EXPORTS +TestFunction==AltTestFunction + +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit +; CHECK: File: test.dll +; CHECK: Format: COFF-x86-64 +; CHECK: Arch: x86_64 +; CHECK: AddressSize: 64bit diff --git a/test/DllTool/lit.local.cfg b/test/DllTool/lit.local.cfg new file mode 100644 index 000000000000..482608486d21 --- /dev/null +++ b/test/DllTool/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.def'] diff --git a/test/FileCheck/regex-scope.txt b/test/FileCheck/regex-scope.txt index e77f3f6513a8..989f422c6bcd 100644 --- a/test/FileCheck/regex-scope.txt +++ b/test/FileCheck/regex-scope.txt @@ -1,4 +1,4 @@ -// RUN: FileCheck -check-prefix CHECK -input-file %s %s +// RUN: FileCheck -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,LOCAL -input-file %s %s // RUN: FileCheck -check-prefixes CHECK,GLOBAL --enable-var-scope -input-file %s %s diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 9827e7a6792b..5eb5388c87a8 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -170,6 +170,26 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { ; CHECK: __asan_memcpy ; CHECK: ret void +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable sanitize_address { + ; This is a canary test to make sure that these don't get lowered into calls that don't + ; have the element-atomic property. Eventually, asan will have to be enhanced to lower + ; these properly. 
+ ; CHECK-LABEL: memintr_element_atomic_test + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1) + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1) + ret void +} + + ; CHECK-LABEL: @test_swifterror ; CHECK-NOT: __asan_report_load ; CHECK: ret void diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll new file mode 100644 index 000000000000..32610ce3b815 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/stack-poisoning-byval-args.ll @@ -0,0 +1,48 @@ +; This check verifies that arguments passed by value get redzones. +; RUN: opt < %s -asan -asan-realign-stack=32 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { [8 x i32] } + +declare i32 @bar(%struct.A*) + +; Test behavior for named argument with explicit alignment. The memcpy and +; alloca alignments should match the explicit alignment of 64. +define void @foo(%struct.A* byval align 64 %a) sanitize_address { +entry: +; CHECK-LABEL: foo +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 64 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %a +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 64 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %a) + ret void +} + +; Test behavior for unnamed argument without explicit alignment. In this case, +; the first argument is referenced by the identifier %0 and the ABI requires a +; minimum alignment of 4 bytes since struct.A contains i32s which have 4-byte +; alignment. However, the alloca alignment will be 32 since that is the value +; passed via the -asan-realign-stack option, which is greater than 4. 
+define void @baz(%struct.A* byval) sanitize_address { +entry: +; CHECK-LABEL: baz +; CHECK: call i64 @__asan_stack_malloc +; CHECK: alloca i8, i64 {{.*}} align 32 +; CHECK: [[copyPtr:%[^ \t]+]] = inttoptr i64 %{{[^ \t]+}} to %struct.A* +; CHECK: [[copyBytePtr:%[^ \t]+]] = bitcast %struct.A* [[copyPtr]] +; CHECK: [[aBytePtr:%[^ \t]+]] = bitcast %struct.A* %0 +; CHECK: call void @llvm.memcpy{{[^%]+}}[[copyBytePtr]]{{[^%]+}}[[aBytePtr]],{{[^,]+}}, i32 4 +; CHECK: call i32 @bar(%struct.A* [[copyPtr]]) +; CHECK: ret void + + %call = call i32 @bar(%struct.A* %0) + ret void +} diff --git a/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll new file mode 100644 index 000000000000..9b6e3db9050f --- /dev/null +++ b/test/Instrumentation/DataFlowSanitizer/unordered_atomic_mem_intrins.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] intrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that dfsan handles these intrinsics properly once they have been +;; added to that class hierarchy. + +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @test_memcpy(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memcpy + ; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memmove(i8* nocapture, i8* nocapture) { + ; CHECK-LABEL: dfs$test_memmove + ; CHECK: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 16, i32 1) + ret void +} + +define void @test_memset(i8* nocapture) { + ; CHECK-LABEL: dfs$test_memset + ; CHECK: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ; CHECK: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %0, i8 88, i64 16, i32 1) + ret void +} diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll index 3457cfc7e278..344ad86e99e4 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_basic.ll @@ -233,6 +233,39 @@ entry: ; CHECK: ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls.
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll index 1c5978e52864..22c8d5c59a16 100644 --- a/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll +++ b/test/Instrumentation/EfficiencySanitizer/working_set_slow.ll @@ -250,6 +250,38 @@ entry: ; CHECK: ret void } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Ensure that esan doesn't convert element atomic memory intrinsics to +; calls. 
+ +declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @elementAtomic_memCpyTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memCpyTest + ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memMoveTest(i8* nocapture %x, i8* nocapture %y) { + ; CHECK-LABEL: elementAtomic_memMoveTest + ; CHECK-NEXT: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 16, i32 1) + ret void +} + +define void @elementAtomic_memSetTest(i8* nocapture %x) { + ; CHECK-LABEL: elementAtomic_memSetTest + ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ; CHECK-NEXT: ret void + tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 77, i64 16, i32 1) + ret void +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Top-level: diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll index ffb239a15256..47912b5b6901 100644 --- a/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -238,6 +238,41 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, ; CHECK: call i8* @__msan_memmove ; CHECK: ret void +;; ------------ +;; Placeholder tests that will fail once element atomic @llvm.mem[cpy|move|set] intrinsics have +;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to +;; verify that MSAN handles these intrinsics properly once they have been +;; added to that class hierarchy.
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind +declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind + +define void @atomic_memcpy(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memcpy + ; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memmove(i8* nocapture %x, i8* nocapture %y) nounwind { + ; CHECK-LABEL: atomic_memmove + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1) + ret void +} + +define void @atomic_memset(i8* nocapture %x) nounwind { + ; CHECK-LABEL: atomic_memset + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1) + ret void +} + +;; ------------ + ; Check that we propagate shadow for "select" diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll new file mode 100644 index 000000000000..badf07db2c15 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on a non-x86_64 arch +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = "x86_32-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void @__sanitizer_cov_trace_cmp1(i8, i8) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16, i16) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32, i32) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll new file mode 100644 index 000000000000..16689f9831d8 --- /dev/null +++ b/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_64.ll @@ -0,0 +1,22 @@ +; Test -sanitizer-coverage-trace-compares=1 API declarations on x86_64 +; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-trace-compares=1 -S | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" +define i32 @foo() #0 { +entry: + ret i32 0 +} + +; CHECK: declare void @__sanitizer_cov_trace_pc_indir(i64) +; CHECK: declare void 
@__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_cmp8(i64, i64) +; CHECK: declare void @__sanitizer_cov_trace_div4(i32 zeroext) +; CHECK: declare void @__sanitizer_cov_trace_div8(i64) +; CHECK: declare void @__sanitizer_cov_trace_gep(i64) +; CHECK: declare void @__sanitizer_cov_trace_switch(i64, i64*) +; CHECK: declare void @__sanitizer_cov_trace_pc() +; CHECK: declare void @__sanitizer_cov_trace_pc_guard(i32*) +; CHECK: declare void @__sanitizer_cov_trace_pc_guard_init(i32*, i32*) +; CHECK-NOT: declare diff --git a/test/Linker/pr26037.ll b/test/Linker/pr26037.ll index 0e6da17e9fb7..da771669574f 100644 --- a/test/Linker/pr26037.ll +++ b/test/Linker/pr26037.ll @@ -44,13 +44,13 @@ entry: !7 = !{null} !8 = distinct !DISubprogram(name: "b", linkageName: "_ZN1A1bEv", scope: !5, file: !1, line: 8, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !9 = !{!10, !16} -!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, line: 8) +!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !8, entity: !4, file: !1, line: 8) !11 = !{i32 2, !"Dwarf Version", i32 4} !12 = !{i32 2, !"Debug Info Version", i32 3} !13 = !{!"clang version 3.8.0 (trunk 256934) (llvm/trunk 256936)"} !14 = !DILocation(line: 7, column: 12, scope: !4) !15 = !DILocation(line: 8, column: 24, scope: !8) -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !5, file: !1, line: 9, type: !6, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/MC/AArch64/coff-relocations.s b/test/MC/AArch64/coff-relocations.s new file mode 100644 index 000000000000..221ecfd4cd41 --- /dev/null +++ b/test/MC/AArch64/coff-relocations.s @@ -0,0 +1,52 @@ +; RUN: llvm-mc -triple aarch64-windows -filetype obj -o - %s | \ +; RUN: llvm-readobj -r - | FileCheck %s + +; IMAGE_REL_ARM64_ADDR32 +.Linfo_foo: + .asciz "foo" + .long foo + +; IMAGE_REL_ARM64_ADDR32NB +.long func@IMGREL + +; IMAGE_REL_ARM64_ADDR64 +.globl struc +struc: + .quad arr + +; IMAGE_REL_ARM64_BRANCH26 +b target + +; IMAGE_REL_ARM64_PAGEBASE_REL21 +adrp x0, foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12A +add x0, x0, :lo12:foo + +; IMAGE_REL_ARM64_PAGEOFFSET_12L +ldr x0, [x0, :lo12:foo] + +; IMAGE_REL_ARM64_SECREL +.secrel32 .Linfo_bar +.Linfo_bar: + +; IMAGE_REL_ARM64_SECTION +.secidx func + + +; CHECK: Format: COFF-ARM64 +; CHECK: Arch: aarch64 +; CHECK: AddressSize: 64bit +; CHECK: Relocations [ +; CHECK: Section (1) .text { +; CHECK: 0x4 IMAGE_REL_ARM64_ADDR32 foo +; CHECK: 0x8 IMAGE_REL_ARM64_ADDR32NB func +; CHECK: 0xC IMAGE_REL_ARM64_ADDR64 arr +; CHECK: 0x14 IMAGE_REL_ARM64_BRANCH26 target +; CHECK: 0x18 IMAGE_REL_ARM64_PAGEBASE_REL21 foo +; CHECK: 0x1C IMAGE_REL_ARM64_PAGEOFFSET_12A foo +; CHECK: 0x20 IMAGE_REL_ARM64_PAGEOFFSET_12L foo +; CHECK: 0x24 IMAGE_REL_ARM64_SECREL .text +; CHECK: 0x28 IMAGE_REL_ARM64_SECTION func +; CHECK: } +; CHECK: ] 
diff --git a/test/MC/AArch64/invalid-instructions-spellcheck.s b/test/MC/AArch64/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..8acb285ac9a6 --- /dev/null +++ b/test/MC/AArch64/invalid-instructions-spellcheck.s @@ -0,0 +1,37 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -mattr=-neon -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-NEON + +// This tests the mnemonic spell checker. + +// First check what happens when an instruction is omitted: + + w1, w2, w3 + +// CHECK: error: unknown token in expression +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ +// CHECK-NEXT: error: invalid operand +// CHECK-NEXT: w1, w2, w3 +// CHECK-NEXT: ^ + +// We don't want to see a suggestion here; the edit distance is too large to +// give sensible suggestions: + + addddddddd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic +// CHECK-NEXT: addddddddd w1, w2, w3 +// CHECK-NEXT: ^ + + addd w1, w2, w3 + +// CHECK: error: unrecognized instruction mnemonic, did you mean: add, addp, adds, addv, fadd, madd? +// CHECK-NEXT: addd w1, w2, w3 +// CHECK-NEXT: ^ + +// Instructions 'addv' and 'addp' are only available when NEON is enabled, so we +// don't want to see them here: + +// CHECK-NO-NEON: error: unrecognized instruction mnemonic, did you mean: add, adds, fadd, madd? +// CHECK-NO-NEON-NEXT: addd w1, w2, w3 +// CHECK-NO-NEON-NEXT: ^ diff --git a/test/MC/AMDGPU/gfx9_asm_all.s b/test/MC/AMDGPU/gfx9_asm_all.s index 56484a37bdce..40bc2f8e159a 100644 --- a/test/MC/AMDGPU/gfx9_asm_all.s +++ b/test/MC/AMDGPU/gfx9_asm_all.s @@ -105392,3 +105392,354 @@ v_mad_mixlo_f16 v5, |v1|, |v2|, |v3| v_mad_mixlo_f16 v5, v1, v2, v3 clamp // CHECK: [0x05,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_i16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, m0, v3 +// CHECK: 
[0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_i16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_i16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_i16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_i16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_i16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_i16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_i16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_i16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] + +v_pk_mad_u16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, 
flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] + +v_pk_mad_u16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] + +v_pk_mad_u16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] + +v_pk_mad_u16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] + +v_pk_mad_u16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] + +v_pk_mad_u16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] + +v_pk_mad_u16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] + +v_pk_mad_u16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] + +v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] + +v_pk_mad_u16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] + +v_pk_sub_u16 v5, v1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v255, v1, v2 +// CHECK: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v255, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] + +v_pk_sub_u16 v5, s1, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] + +v_pk_sub_u16 v5, s101, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] + +v_pk_sub_u16 v5, flat_scratch_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] + +v_pk_sub_u16 v5, vcc_hi, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] + +v_pk_sub_u16 v5, m0, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_lo, v2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] + +v_pk_sub_u16 v5, exec_hi, v2 +// 
CHECK: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] + +v_pk_sub_u16 v5, v1, v255 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] + +v_pk_sub_u16 v5, v1, s2 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] + +v_pk_sub_u16 v5, v1, s101 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] + +v_pk_sub_u16 v5, v1, flat_scratch_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] + +v_pk_sub_u16 v5, v1, vcc_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] + +v_pk_sub_u16 v5, v1, m0 +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_lo +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] + +v_pk_sub_u16 v5, v1, exec_hi +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] +// CHECK: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] +// CHECK: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] +// CHECK: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] + +v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] +// CHECK: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] + +v_pk_sub_u16 v5, v1, v2 clamp +// CHECK: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] diff --git a/test/MC/AMDGPU/vop3-errs.s b/test/MC/AMDGPU/vop3-errs.s index 7ba577049af3..855dd0b5de08 100644 --- a/test/MC/AMDGPU/vop3-errs.s +++ b/test/MC/AMDGPU/vop3-errs.s @@ -1,35 +1,47 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX67 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=fiji -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX89 --check-prefix=GCN v_add_f32_e64 v0, v1 -// CHECK: error: too few operands for instruction +// GCN: error: too few operands for instruction v_div_scale_f32 v24, vcc, v22, 1.1, v22 -// CHECK: error: invalid operand for instruction +// GCN: error: invalid operand for instruction v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] -// CHECK: error: instruction not supported on this GPU +// GFX67: error: instruction not supported on this GPU +// GFX89: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[0:1], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[1:2], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[2:3], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all 
sources v_mqsad_pk_u16_u8 v[3:4], v[0:1], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[4:5], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[5:6], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[8:9], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources v_mqsad_pk_u16_u8 v[9:10], v[1:2], v9, v[4:5] -// CHECK: error: destination must be different than all sources +// GCN: error: destination must be different than all sources + +v_cmp_eq_f32_e64 vcc, v0, v1 mul:2 +// GCN: error: invalid operand for instruction + +v_cmp_le_f64_e64 vcc, v0, v1 mul:4 +// GCN: error: invalid operand for instruction + +v_cvt_u32_f32_e64 v0, v1 div:2 +// GCN: error: invalid operand for instruction \ No newline at end of file diff --git a/test/MC/ARM/virtexts-thumb.s b/test/MC/ARM/virtexts-thumb.s index d911e1dfb1d7..0ea0bafe5a86 100644 --- a/test/MC/ARM/virtexts-thumb.s +++ b/test/MC/ARM/virtexts-thumb.s @@ -50,7 +50,7 @@ # CHECK-THUMB: [0xde,0xf3,0x00,0x8f] # SUBS PC, LR, #0 should have the same encoding as ERET. -# The conditional forms can't be tested becuse the ARM assembler parser doesn't +# The conditional forms can't be tested because the ARM assembler parser doesn't # accept SUBS PC, LR, #, only the unconditional form is allowed. This # is due to the way that the custom parser handles optional operands; see the # FIXME in ARM/AsmParser/ARMAsmParser.cpp.
diff --git a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt index 63374300504c..188534706eb5 100644 --- a/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt +++ b/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt @@ -88934,3 +88934,408 @@ # CHECK: v_pk_sub_i16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18] 0x05,0x80,0x83,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, flat_scratch_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x80,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c] +0x05,0x40,0x80,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v255 ; encoding: 
[0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_i16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_i16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, exec_hi ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x80,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x60,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x78,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x80,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x80,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_i16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x80,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v255, v1, v2, v3 ; encoding: [0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0xff,0x40,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v255, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0xff,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s1, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, s101, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x65,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, flat_scratch_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x66,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, 
flat_scratch_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x67,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6a,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, vcc_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x6b,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, m0, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7c,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_lo, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7e,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, exec_hi, v2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c] +0x05,0x40,0x89,0xd3,0x7f,0x04,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v255, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xff,0x0f,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s2, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, s101, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcb,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, flat_scratch_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xcf,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd5,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, vcc_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xd7,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, m0, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xf9,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_lo, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xfd,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, exec_hi, v3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c] +0x05,0x40,0x89,0xd3,0x01,0xff,0x0c,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v255 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x1f + +# CHECK: v_pk_mad_u16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x18 + +# CHECK: v_pk_mad_u16 v5, v1, v2, s101 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x96,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9a,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, flat_scratch_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0x9e,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xaa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, vcc_hi ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xae,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, m0 ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xf2,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_lo ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfa,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, exec_hi ; encoding: 
[0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19] +0x05,0x40,0x89,0xd3,0x01,0x05,0xfe,0x19 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x48,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x50,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x60,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x78,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x0c + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14] +0x05,0x00,0x89,0xd3,0x01,0x05,0x0e,0x14 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04] +0x05,0x40,0x89,0xd3,0x01,0x05,0x0e,0x04 + +# CHECK: v_pk_mad_u16 v5, v1, v2, v3 clamp ; encoding: [0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0xc0,0x89,0xd3,0x01,0x05,0x0e,0x1c + +# CHECK: v_pk_sub_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v255, v1, v2 ; encoding: [0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18] +0xff,0x00,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v255, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18] +0x05,0x00,0x8b,0xd3,0xff,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s1, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, s101, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x65,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x66,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, flat_scratch_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x67,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x6a,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x6b,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, m0, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7c,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_lo, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7e,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, exec_hi, v2 ; encoding: [0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18] +0x05,0x00,0x8b,0xd3,0x7f,0x04,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v255 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x03,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s2 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, s101 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcb,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_lo ; encoding: 
[0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, flat_scratch_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xcf,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd5,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xd7,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, m0 ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xf9,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_lo ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xfd,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, exec_hi ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18] +0x05,0x00,0x8b,0xd3,0x01,0xff,0x00,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,0] ; encoding: [0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x08,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[0,1] ; encoding: [0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x10,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel:[1,1] ; encoding: [0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x18,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x00 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[1,0] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x08 + +# CHECK: v_pk_sub_u16 v5, v1, v2 op_sel_hi:[0,1] ; encoding: [0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10] +0x05,0x00,0x8b,0xd3,0x01,0x05,0x02,0x10 + +# CHECK: v_pk_sub_u16 v5, v1, v2 clamp ; encoding: [0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18] +0x05,0x80,0x8b,0xd3,0x01,0x05,0x02,0x18 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[2:3], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x02,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[4:5], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x04,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], s[100:101], v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x64,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], flat_scratch, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x66,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], vcc, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x6a,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], exec, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x7e,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -1, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xc1,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], 0.5, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04] +0xfc,0x00,0xe7,0xd1,0xf0,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], -4.0, v2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04] 
+0xfc,0x00,0xe7,0xd1,0xf7,0x04,0x0e,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s2, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], s101, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcb,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], flat_scratch_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xcf,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd5,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], vcc_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xd7,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], m0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xf9,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_lo, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xfd,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], exec_hi, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xff,0x0c,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x01,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -1, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0x83,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], 0.5, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xe1,0x0d,0x04 + +# CHECK: v_mqsad_u32_u8 v[252:255], v[1:2], -4.0, v[3:6] ; encoding: [0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04] +0xfc,0x00,0xe7,0xd1,0x01,0xef,0x0d,0x04 diff --git a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt index 62e7092086aa..7025354d6847 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt @@ -10,23 +10,4 @@ 0x08 0x10 0x65 0x7c # CHECK: fork $2, $3, $5 0x09 0x00 0x80 0x7c # CHECK: yield $4 0x09 0x20 0xa0 0x7c # CHECK: yield $4, $5 -0x02 0x20 0x05 0x41 # CHECK: mftr $4, $5, 0, 2, 0 -0x20 0x20 0x05 0x41 # CHECK: mftr $4, $5, 1, 0, 0 -0x21 0x20 0x00 0x41 # CHECK: mftr $4, $zero, 1, 1, 0 -0x21 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 1, 0 -0x22 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 0 -0x32 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 1 -0x23 0x20 0x1a 0x41 # CHECK: mftr $4, $26, 1, 3, 0 -0x23 0x20 0x1f 0x41 # CHECK: mftr $4, $ra, 1, 3, 0 -0x24 0x20 0x0e 0x41 # CHECK: mftr $4, $14, 1, 4, 0 -0x25 0x20 0x0f 0x41 # CHECK: mftr $4, $15, 1, 5, 0 -0x02 0x28 0x84 0x41 # CHECK: mttr $4, $5, 0, 2, 0 -0x20 0x28 0x84 0x41 # CHECK: mttr $4, $5, 1, 0, 0 -0x21 0x00 0x84 0x41 # CHECK: mttr $4, $zero, 1, 1, 0 -0x21 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 1, 0 -0x22 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 0 -0x32 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 1 -0x23 0xd0 0x84 0x41 # CHECK: mttr $4, $26, 1, 3, 0 -0x23 0xf8 0x84 0x41 # CHECK: mttr $4, $ra, 1, 3, 0 -0x24 0x70 0x84 0x41 # CHECK: mttr $4, $14, 1, 4, 0 -0x25 0x78 0x84 0x41 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git 
a/test/MC/Disassembler/Mips/mt/valid-r2.txt b/test/MC/Disassembler/Mips/mt/valid-r2.txt index 4786d8b5591f..17c42c0614a5 100644 --- a/test/MC/Disassembler/Mips/mt/valid-r2.txt +++ b/test/MC/Disassembler/Mips/mt/valid-r2.txt @@ -10,23 +10,4 @@ 0x7c 0x65 0x10 0x08 # CHECK: fork $2, $3, $5 0x7c 0x80 0x00 0x09 # CHECK: yield $4 0x7c 0xa0 0x20 0x09 # CHECK: yield $4, $5 -0x41 0x05 0x20 0x02 # CHECK: mftr $4, $5, 0, 2, 0 -0x41 0x05 0x20 0x20 # CHECK: mftr $4, $5, 1, 0, 0 -0x41 0x00 0x20 0x21 # CHECK: mftr $4, $zero, 1, 1, 0 -0x41 0x0a 0x20 0x21 # CHECK: mftr $4, $10, 1, 1, 0 -0x41 0x0a 0x20 0x22 # CHECK: mftr $4, $10, 1, 2, 0 -0x41 0x0a 0x20 0x32 # CHECK: mftr $4, $10, 1, 2, 1 -0x41 0x1a 0x20 0x23 # CHECK: mftr $4, $26, 1, 3, 0 -0x41 0x1f 0x20 0x23 # CHECK: mftr $4, $ra, 1, 3, 0 -0x41 0x0e 0x20 0x24 # CHECK: mftr $4, $14, 1, 4, 0 -0x41 0x0f 0x20 0x25 # CHECK: mftr $4, $15, 1, 5, 0 -0x41 0x84 0x28 0x02 # CHECK: mttr $4, $5, 0, 2, 0 -0x41 0x84 0x28 0x20 # CHECK: mttr $4, $5, 1, 0, 0 -0x41 0x84 0x00 0x21 # CHECK: mttr $4, $zero, 1, 1, 0 -0x41 0x84 0x50 0x21 # CHECK: mttr $4, $10, 1, 1, 0 -0x41 0x84 0x50 0x22 # CHECK: mttr $4, $10, 1, 2, 0 -0x41 0x84 0x50 0x32 # CHECK: mttr $4, $10, 1, 2, 1 -0x41 0x84 0xd0 0x23 # CHECK: mttr $4, $26, 1, 3, 0 -0x41 0x84 0xf8 0x23 # CHECK: mttr $4, $ra, 1, 3, 0 -0x41 0x84 0x70 0x24 # CHECK: mttr $4, $14, 1, 4, 0 -0x41 0x84 0x78 0x25 # CHECK: mttr $4, $15, 1, 5, 0 + diff --git a/test/MC/Disassembler/SystemZ/insns-z14.txt b/test/MC/Disassembler/SystemZ/insns-z14.txt new file mode 100644 index 000000000000..c73b50c1c2fb --- /dev/null +++ b/test/MC/Disassembler/SystemZ/insns-z14.txt @@ -0,0 +1,3253 @@ +# Test z14 instructions that don't have PC-relative operands. +# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z14 \ +# RUN: | FileCheck %s + +# CHECK: agh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x38 + +# CHECK: agh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x38 + +# CHECK: agh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x38 + +# CHECK: agh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x38 + +# CHECK: agh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x38 + +# CHECK: agh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x38 + +# CHECK: agh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x38 + +# CHECK: agh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x38 + +# CHECK: agh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x38 + +# CHECK: agh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x38 + +# CHECK: bi -524288 +0xe3 0xf0 0x00 0x00 0x80 0x47 + +# CHECK: bi -1 +0xe3 0xf0 0x0f 0xff 0xff 0x47 + +# CHECK: bi 0 +0xe3 0xf0 0x00 0x00 0x00 0x47 + +# CHECK: bi 1 +0xe3 0xf0 0x00 0x01 0x00 0x47 + +# CHECK: bi 524287 +0xe3 0xf0 0x0f 0xff 0x7f 0x47 + +# CHECK: bi 0(%r1) +0xe3 0xf0 0x10 0x00 0x00 0x47 + +# CHECK: bi 0(%r15) +0xe3 0xf0 0xf0 0x00 0x00 0x47 + +# CHECK: bi 524287(%r1,%r15) +0xe3 0xf1 0xff 0xff 0x7f 0x47 + +# CHECK: bi 524287(%r15,%r1) +0xe3 0xff 0x1f 0xff 0x7f 0x47 + +# CHECK: bic 0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x47 + +# CHECK: bic 0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x47 + +# CHECK: bic 0, 0 +0xe3 0x00 0x00 0x00 0x00 0x47 + +# CHECK: bic 0, 1 +0xe3 0x00 0x00 0x01 0x00 0x47 + +# CHECK: bic 0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x47 + +# CHECK: bic 0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x47 + +# CHECK: bic 0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x47 + +# CHECK: bic 0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x47 + +# CHECK: bic 0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x47 + +# CHECK: bio 0(%r15) +0xe3 0x10 0xf0 0x00 0x00 0x47 + +# CHECK: bih 0(%r15) +0xe3 0x20 0xf0 0x00 0x00 0x47 + +# CHECK: binle 0(%r15) +0xe3 0x30 0xf0 
0x00 0x00 0x47 + +# CHECK: bil 0(%r15) +0xe3 0x40 0xf0 0x00 0x00 0x47 + +# CHECK: binhe 0(%r15) +0xe3 0x50 0xf0 0x00 0x00 0x47 + +# CHECK: bilh 0(%r15) +0xe3 0x60 0xf0 0x00 0x00 0x47 + +# CHECK: bine 0(%r15) +0xe3 0x70 0xf0 0x00 0x00 0x47 + +# CHECK: bie 0(%r15) +0xe3 0x80 0xf0 0x00 0x00 0x47 + +# CHECK: binlh 0(%r15) +0xe3 0x90 0xf0 0x00 0x00 0x47 + +# CHECK: bihe 0(%r15) +0xe3 0xa0 0xf0 0x00 0x00 0x47 + +# CHECK: binl 0(%r15) +0xe3 0xb0 0xf0 0x00 0x00 0x47 + +# CHECK: bile 0(%r15) +0xe3 0xc0 0xf0 0x00 0x00 0x47 + +# CHECK: binh 0(%r15) +0xe3 0xd0 0xf0 0x00 0x00 0x47 + +# CHECK: bino 0(%r15) +0xe3 0xe0 0xf0 0x00 0x00 0x47 + +# CHECK: irbm %r0, %r0 +0xb9 0xac 0x00 0x00 + +# CHECK: irbm %r0, %r15 +0xb9 0xac 0x00 0x0f + +# CHECK: irbm %r15, %r0 +0xb9 0xac 0x00 0xf0 + +# CHECK: irbm %r7, %r8 +0xb9 0xac 0x00 0x78 + +# CHECK: irbm %r15, %r15 +0xb9 0xac 0x00 0xff + +# CHECK: kma %r2, %r2, %r2 +0xb9 0x29 0x20 0x22 + +# CHECK: kma %r2, %r8, %r14 +0xb9 0x29 0x80 0x2e + +# CHECK: kma %r14, %r8, %r2 +0xb9 0x29 0x80 0xe2 + +# CHECK: kma %r6, %r8, %r10 +0xb9 0x29 0x80 0x6a + +# CHECK: lgg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4c + +# CHECK: lgg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4c + +# CHECK: lgg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4c + +# CHECK: lgg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4c + +# CHECK: lgg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4c + +# CHECK: lgg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4c + +# CHECK: lgg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4c + +# CHECK: lgg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4c + +# CHECK: lgg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x4c + +# CHECK: lgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x4d + +# CHECK: lgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x4d + +# CHECK: lgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x4d + +# CHECK: lgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x4d + +# CHECK: lgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x4d + +# CHECK: lgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x4d + +# CHECK: lgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x4d + +# CHECK: llgfsg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x48 + +# CHECK: llgfsg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x48 + +# CHECK: llgfsg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x48 + +# CHECK: llgfsg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x48 + +# CHECK: llgfsg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x48 + +# CHECK: llgfsg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x48 + +# CHECK: llgfsg %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x48 + +# CHECK: mg %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x84 + +# CHECK: mg %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x84 + +# CHECK: mg %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x84 + +# CHECK: mg %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x84 + +# CHECK: mg %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x84 + +# CHECK: mg %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x84 + +# CHECK: mg %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x84 + +# CHECK: mg %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x84 + +# CHECK: mg %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x84 + +# CHECK: mg %r14, 0 +0xe3 0xe0 0x00 0x00 0x00 0x84 + +# CHECK: mgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x3c + +# CHECK: mgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x3c + +# CHECK: 
mgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x3c + +# CHECK: mgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x3c + +# CHECK: mgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x3c + +# CHECK: mgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: mgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x3c + +# CHECK: mgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x3c + +# CHECK: mgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x3c + +# CHECK: mgrk %r0, %r0, %r0 +0xb9 0xec 0x00 0x00 + +# CHECK: mgrk %r0, %r0, %r15 +0xb9 0xec 0xf0 0x00 + +# CHECK: mgrk %r0, %r15, %r0 +0xb9 0xec 0x00 0x0f + +# CHECK: mgrk %r14, %r0, %r0 +0xb9 0xec 0x00 0xe0 + +# CHECK: mgrk %r6, %r8, %r9 +0xb9 0xec 0x90 0x68 + +# CHECK: msc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x53 + +# CHECK: msc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x53 + +# CHECK: msc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x53 + +# CHECK: msc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x53 + +# CHECK: msc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x53 + +# CHECK: msc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x53 + +# CHECK: msc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x53 + +# CHECK: msc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x53 + +# CHECK: msc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x53 + +# CHECK: msc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x53 + +# CHECK: msgc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x83 + +# CHECK: msgc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x83 + +# CHECK: msgc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x83 + +# CHECK: msgc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x83 + +# CHECK: msgc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x83 + +# CHECK: msgc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x83 + +# CHECK: msgc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x83 + +# CHECK: msgc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x83 + +# CHECK: msgc %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x83 + +# CHECK: msrkc %r0, %r0, %r0 +0xb9 0xfd 0x00 0x00 + +# CHECK: msrkc %r0, %r0, %r15 +0xb9 0xfd 0xf0 0x00 + +# CHECK: msrkc %r0, %r15, %r0 +0xb9 0xfd 0x00 0x0f + +# CHECK: msrkc %r15, %r0, %r0 +0xb9 0xfd 0x00 0xf0 + +# CHECK: msrkc %r7, %r8, %r9 +0xb9 0xfd 0x90 0x78 + +# CHECK: msgrkc %r0, %r0, %r0 +0xb9 0xed 0x00 0x00 + +# CHECK: msgrkc %r0, %r0, %r15 +0xb9 0xed 0xf0 0x00 + +# CHECK: msgrkc %r0, %r15, %r0 +0xb9 0xed 0x00 0x0f + +# CHECK: msgrkc %r15, %r0, %r0 +0xb9 0xed 0x00 0xf0 + +# CHECK: msgrkc %r7, %r8, %r9 +0xb9 0xed 0x90 0x78 + +# CHECK: sgh %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x39 + +# CHECK: sgh %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x39 + +# CHECK: sgh %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x39 + +# CHECK: sgh %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x39 + +# CHECK: sgh %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x39 + +# CHECK: sgh %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x39 + +# CHECK: sgh %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x39 + +# CHECK: sgh %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x39 + +# CHECK: sgh %r15, 0 +0xe3 0xf0 0x00 0x00 0x00 0x39 + +# CHECK: stgsc %r0, -524288 +0xe3 0x00 0x00 0x00 0x80 0x49 + +# CHECK: stgsc %r0, -1 +0xe3 0x00 0x0f 0xff 0xff 0x49 + +# CHECK: stgsc %r0, 0 +0xe3 0x00 0x00 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 1 +0xe3 0x00 0x00 0x01 0x00 0x49 + +# CHECK: stgsc %r0, 524287 +0xe3 0x00 0x0f 0xff 0x7f 0x49 + +# CHECK: stgsc %r0, 0(%r1) +0xe3 0x00 0x10 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 0(%r15) +0xe3 0x00 0xf0 0x00 0x00 0x49 + +# CHECK: stgsc %r0, 524287(%r1,%r15) +0xe3 0x01 0xff 0xff 0x7f 0x49 + +# 
CHECK: stgsc %r0, 524287(%r15,%r1) +0xe3 0x0f 0x1f 0xff 0x7f 0x49 + +# CHECK: vap %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x71 + +# CHECK: vap %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x71 + +# CHECK: vap %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x71 + +# CHECK: vap %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x71 + +# CHECK: vap %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x71 + +# CHECK: vap %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x71 + +# CHECK: vbperm %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v15 +0xe7 0x00 0xf0 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x85 + +# CHECK: vbperm %v0, %v15, %v0 +0xe7 0x0f 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x85 + +# CHECK: vbperm %v15, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x85 + +# CHECK: vbperm %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x85 + +# CHECK: vbperm %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x85 + +# CHECK: vcp %v0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x77 + +# CHECK: vcp %v15, %v0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x77 + +# CHECK: vcp %v31, %v0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x77 + +# CHECK: vcp %v0, %v15, 0 +0xe6 0x00 0xf0 0x00 0x00 0x77 + +# CHECK: vcp %v0, %v31, 0 +0xe6 0x00 0xf0 0x00 0x02 0x77 + +# CHECK: vcp %v3, %v18, 4 +0xe6 0x03 0x20 0x40 0x02 0x77 + +# CHECK: vcvb %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x50 + +# CHECK: vcvb %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vcvb %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vcvb %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x50 + +# CHECK: vcvbg %r0, %v0, 0 +0xe6 0x00 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x52 + +# CHECK: vcvbg %r15, %v0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x52 + +# CHECK: vcvbg %r0, %v31, 0 +0xe6 0x0f 0x00 0x00 0x04 0x52 + +# CHECK: vcvbg %r3, %v18, 4 +0xe6 0x32 0x00 0x40 0x04 0x52 + +# CHECK: vcvd %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x58 + +# CHECK: vcvd %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x58 + +# CHECK: vcvd %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x58 + +# CHECK: vcvd %v31, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x58 + +# CHECK: vcvd %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x58 + +# CHECK: vcvdg %v0, %r0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5a + +# CHECK: vcvdg %v0, %r0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5a + +# CHECK: vcvdg %v0, %r15, 0, 0 +0xe6 0x0f 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v15, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x5a + +# CHECK: vcvdg %v31, %r0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x5a + +# CHECK: vcvdg %v18, %r9, 52, 11 +0xe6 0x29 0x00 0xb3 0x48 0x5a + +# CHECK: vdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7a + +# CHECK: vdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7a + +# CHECK: vdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7a + +# CHECK: vdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7a + +# CHECK: vdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7a + +# CHECK: vdp 
%v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7a + +# CHECK: vfasb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe3 + +# CHECK: vfasb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe3 + +# CHECK: vfasb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe3 + +# CHECK: vfasb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe3 + +# CHECK: vfasb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe3 + +# CHECK: vfcesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe8 + +# CHECK: vfcesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe8 + +# CHECK: vfcesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe8 + +# CHECK: vfcesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe8 + +# CHECK: vfcesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xe8 + +# CHECK: vfcesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xe8 + +# CHECK: vfcesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xe8 + +# CHECK: vfcesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xe8 + +# CHECK: vfcesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xe8 + +# CHECK: vfchsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xeb + +# CHECK: vfchsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xeb + +# CHECK: vfchsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xeb + +# CHECK: vfchsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xeb + +# CHECK: vfchsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xeb + +# CHECK: vfchsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xeb + +# CHECK: vfchsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xeb + +# CHECK: vfchsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xeb + +# CHECK: vfchsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xeb + +# CHECK: vfchsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xeb + +# CHECK: vfchesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xea + +# CHECK: vfchesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xea + +# CHECK: vfchesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xea + +# CHECK: vfchesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xea + +# CHECK: vfchesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xea + +# CHECK: vfchesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xea + +# CHECK: vfchesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x10 0x22 0xea + +# CHECK: vfchesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x10 0x24 0xea + +# CHECK: vfchesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xea + +# CHECK: vfchesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x10 0x2a 0xea + +# CHECK: vfdsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe5 + +# CHECK: vfdsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe5 + +# CHECK: vfdsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe5 + +# CHECK: vfdsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe5 + +# CHECK: vfdsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe5 + +# CHECK: vfisb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x04 0x20 0xc7 + +# CHECK: vfisb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x07 0x20 0xc7 + +# CHECK: vfisb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xc7 + +# CHECK: vfisb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xc7 + +# CHECK: vfisb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xa4 0x24 0xc7 + +# CHECK: vfkedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xe8 + +# CHECK: vfkedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xe8 + +# CHECK: vfkedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xe8 + +# CHECK: vfkedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xe8 + +# CHECK: vfkedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xe8 + +# CHECK: vfkedbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xe8 + +# CHECK: vfkedbs %v0, %v0, %v31 
+0xe7 0x00 0xf0 0x14 0x32 0xe8 + +# CHECK: vfkedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xe8 + +# CHECK: vfkedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xe8 + +# CHECK: vfkedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xe8 + +# CHECK: vfkesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xe8 + +# CHECK: vfkesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xe8 + +# CHECK: vfkesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xe8 + +# CHECK: vfkesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xe8 + +# CHECK: vfkesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xe8 + +# CHECK: vfkesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xe8 + +# CHECK: vfkesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xe8 + +# CHECK: vfkesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xe8 + +# CHECK: vfkesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xe8 + +# CHECK: vfkhdb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xeb + +# CHECK: vfkhdb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xeb + +# CHECK: vfkhdb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xeb + +# CHECK: vfkhdb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xeb + +# CHECK: vfkhdb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xeb + +# CHECK: vfkhdbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xeb + +# CHECK: vfkhdbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xeb + +# CHECK: vfkhdbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xeb + +# CHECK: vfkhdbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xeb + +# CHECK: vfkhsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xeb + +# CHECK: vfkhsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xeb + +# CHECK: vfkhsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xeb + +# CHECK: vfkhsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xeb + +# CHECK: vfkhsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xeb + +# CHECK: vfkhsbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xeb + +# CHECK: vfkhsbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xeb + +# CHECK: vfkhsbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xeb + +# CHECK: vfkhsbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xeb + +# CHECK: vfkhsbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xeb + +# CHECK: vfkhedb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x30 0xea + +# CHECK: vfkhedb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x32 0xea + +# CHECK: vfkhedb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x34 0xea + +# CHECK: vfkhedb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x38 0xea + +# CHECK: vfkhedb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x3a 0xea + +# CHECK: vfkhedbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x30 0xea + +# CHECK: vfkhedbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x32 0xea + +# CHECK: vfkhedbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x34 0xea + +# CHECK: vfkhedbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x38 0xea + +# CHECK: vfkhedbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x3a 0xea + +# CHECK: vfkhesb %v0, %v0, %v0 +0xe7 0x00 0x00 0x04 0x20 0xea + +# CHECK: vfkhesb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x04 0x22 0xea + +# CHECK: vfkhesb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x04 0x24 0xea + +# CHECK: vfkhesb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x04 0x28 0xea + +# CHECK: vfkhesb %v18, %v3, %v20 +0xe7 0x23 0x40 0x04 0x2a 0xea + +# CHECK: vfkhesbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x14 0x20 0xea + +# CHECK: vfkhesbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x14 0x22 0xea + +# CHECK: vfkhesbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x14 0x24 0xea + +# CHECK: vfkhesbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x14 0x28 0xea + +# CHECK: vfkhesbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x14 0x2a 0xea + +# CHECK: vfpsosb %v0, %v0, 
3 +0xe7 0x00 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v0, 15 +0xe7 0x00 0x00 0xf0 0x20 0xcc + +# CHECK: vfpsosb %v0, %v15, 3 +0xe7 0x0f 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v0, %v31, 3 +0xe7 0x0f 0x00 0x30 0x24 0xcc + +# CHECK: vfpsosb %v15, %v0, 3 +0xe7 0xf0 0x00 0x30 0x20 0xcc + +# CHECK: vfpsosb %v31, %v0, 3 +0xe7 0xf0 0x00 0x30 0x28 0xcc + +# CHECK: vfpsosb %v14, %v17, 7 +0xe7 0xe1 0x00 0x70 0x24 0xcc + +# CHECK: vflcsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcc + +# CHECK: vflcsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xcc + +# CHECK: vflcsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xcc + +# CHECK: vflcsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcc + +# CHECK: vflnsb %v0, %v0 +0xe7 0x00 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v15 +0xe7 0x0f 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v0, %v31 +0xe7 0x0f 0x00 0x10 0x24 0xcc + +# CHECK: vflnsb %v15, %v0 +0xe7 0xf0 0x00 0x10 0x20 0xcc + +# CHECK: vflnsb %v31, %v0 +0xe7 0xf0 0x00 0x10 0x28 0xcc + +# CHECK: vflnsb %v14, %v17 +0xe7 0xe1 0x00 0x10 0x24 0xcc + +# CHECK: vflpsb %v0, %v0 +0xe7 0x00 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v15 +0xe7 0x0f 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v0, %v31 +0xe7 0x0f 0x00 0x20 0x24 0xcc + +# CHECK: vflpsb %v15, %v0 +0xe7 0xf0 0x00 0x20 0x20 0xcc + +# CHECK: vflpsb %v31, %v0 +0xe7 0xf0 0x00 0x20 0x28 0xcc + +# CHECK: vflpsb %v14, %v17 +0xe7 0xe1 0x00 0x20 0x24 0xcc + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xef + +# CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xef + +# CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xef + +# CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xef + +# CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xef + +# CHECK: vfmaxdb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xef + +# CHECK: vfmaxdb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xef + +# CHECK: vfmaxdb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xef + +# CHECK: vfmaxdb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xef + +# CHECK: vfmaxsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xef + +# CHECK: vfmaxsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xef + +# CHECK: vfmaxsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xef + +# CHECK: vfmaxsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xef + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 +0xe7 0x00 0x00 0x40 0x00 0xee + +# CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xee + +# CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xee + +# CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xee + +# CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 +0xe7 0x23 0x40 0xc9 0xba 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 
0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x30 0xee + +# CHECK: vfmindb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x32 0xee + +# CHECK: vfmindb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x34 0xee + +# CHECK: vfmindb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x38 0xee + +# CHECK: vfmindb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x3a 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x40 0x20 0xee + +# CHECK: vfminsb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x00 0x22 0xee + +# CHECK: vfminsb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x00 0x24 0xee + +# CHECK: vfminsb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0xee + +# CHECK: vfminsb %v18, %v3, %v20, 12 +0xe7 0x23 0x40 0xc0 0x2a 0xee + +# CHECK: vfmasb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x8f + +# CHECK: vfmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8f + +# CHECK: vfmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8f + +# CHECK: vfmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8f + +# CHECK: vfmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8f + +# CHECK: vfmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8f + +# CHECK: vfmsb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe7 + +# CHECK: vfmsb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe7 + +# CHECK: vfmsb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe7 + +# CHECK: vfmsb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe7 + +# CHECK: vfmsb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe7 + +# CHECK: vfmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x8e + +# CHECK: vfmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x8e + +# CHECK: vfmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x8e + +# CHECK: vfmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x8e + +# CHECK: vfmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x8e + +# CHECK: vfmssb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x8e + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9f + +# CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9f + +# CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9f + +# CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9f + +# CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9f + +# CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9f + +# CHECK: vfnmadb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9f + +# CHECK: vfnmadb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9f + +# CHECK: vfnmadb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 0x00 0x04 0x9f + +# CHECK: vfnmadb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9f + +# CHECK: vfnmadb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9f + +# CHECK: vfnmasb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9f + +# CHECK: vfnmasb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9f + +# CHECK: vfnmasb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9f + +# CHECK: vfnmasb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9f + +# CHECK: vfnmasb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9f + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 +0xe7 0x00 0x0f 0x00 0x00 0x9e + +# CHECK: 
vfnms %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x00 0x0f 0x00 0x9e + +# CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0x9e + +# CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0x9e + +# CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0x9e + +# CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0x9e + +# CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 +0xe7 0xd1 0x5b 0x09 0x97 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v0 +0xe7 0x00 0x03 0x00 0x00 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v0, %v31 +0xe7 0x00 0x03 0x00 0xf1 0x9e + +# CHECK: vfnmsdb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf3 0x00 0x02 0x9e + +# CHECK: vfnmsdb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x03 0x00 0x04 0x9e + +# CHECK: vfnmsdb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x03 0x00 0x08 0x9e + +# CHECK: vfnmsdb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x00 0x97 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v0 +0xe7 0x00 0x02 0x00 0x00 0x9e + +# CHECK: vfnmssb %v0, %v0, %v0, %v31 +0xe7 0x00 0x02 0x00 0xf1 0x9e + +# CHECK: vfnmssb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf2 0x00 0x02 0x9e + +# CHECK: vfnmssb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x02 0x00 0x04 0x9e + +# CHECK: vfnmssb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x02 0x00 0x08 0x9e + +# CHECK: vfnmssb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x00 0x97 0x9e + +# CHECK: vfssb %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xe2 + +# CHECK: vfssb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x22 0xe2 + +# CHECK: vfssb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x24 0xe2 + +# CHECK: vfssb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xe2 + +# CHECK: vfssb %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x2a 0xe2 + +# CHECK: vfsqsb %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xce + +# CHECK: vfsqsb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0xce + +# CHECK: vfsqsb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0xce + +# CHECK: vfsqsb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xce + +# CHECK: vftcisb %v0, %v0, 0 +0xe7 0x00 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf0 0x20 0x4a + +# CHECK: vftcisb %v0, %v15, 0 +0xe7 0x0f 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v0, %v31, 0 +0xe7 0x0f 0x00 0x00 0x24 0x4a + +# CHECK: vftcisb %v15, %v0, 0 +0xe7 0xf0 0x00 0x00 0x20 0x4a + +# CHECK: vftcisb %v31, %v0, 0 +0xe7 0xf0 0x00 0x00 0x28 0x4a + +# CHECK: vftcisb %v4, %v21, 1656 +0xe7 0x45 0x67 0x80 0x24 0x4a + +# CHECK: vlip %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v0, 0, 15 +0xe6 0x00 0x00 0x00 0xf0 0x49 + +# CHECK: vlip %v0, 65535, 0 +0xe6 0x00 0xff 0xff 0x00 0x49 + +# CHECK: vlip %v15, 0, 0 +0xe6 0xf0 0x00 0x00 0x00 0x49 + +# CHECK: vlip %v31, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x49 + +# CHECK: vlip %v17, 4660, 7 +0xe6 0x10 0x12 0x34 0x78 0x49 + +# CHECK: vllezlf %v0, 0 +0xe7 0x00 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 4095 +0xe7 0x00 0x0f 0xff 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15) +0xe7 0x00 0xf0 0x00 0x60 0x04 + +# CHECK: vllezlf %v0, 0(%r15,%r1) +0xe7 0x0f 0x10 0x00 0x60 0x04 + +# CHECK: vllezlf %v15, 0 +0xe7 0xf0 0x00 0x00 0x60 0x04 + +# CHECK: vllezlf %v31, 0 +0xe7 0xf0 0x00 0x00 0x68 0x04 + +# CHECK: vllezlf %v18, 1383(%r3,%r4) +0xe7 0x23 0x45 0x67 0x68 0x04 + +# CHECK: vlrl %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x35 + +# CHECK: vlrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x35 + +# CHECK: vlrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x35 + +# CHECK: vlrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x35 + +# CHECK: vlrl %v31, 
0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x35 + +# CHECK: vlrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x35 + +# CHECK: vlrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x37 + +# CHECK: vlrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x37 + +# CHECK: vlrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x37 + +# CHECK: vlrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x37 + +# CHECK: vlrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x37 + +# CHECK: vlrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x37 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 +0xe7 0x00 0x0f 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v0, 0, 12 +0xe7 0x00 0x00 0xc0 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 +0xe7 0x00 0x00 0x00 0xf0 0xb8 + +# CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 +0xe7 0x00 0x00 0x00 0xf1 0xb8 + +# CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 +0xe7 0x00 0xf0 0x00 0x02 0xb8 + +# CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 +0xe7 0x0f 0x00 0x00 0x04 0xb8 + +# CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x00 0xb8 + +# CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 +0xe7 0xf0 0x00 0x00 0x08 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 +0xe7 0x23 0x40 0x40 0x5a 0xb8 + +# CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 +0xe7 0x23 0x4b 0x80 0x5a 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 0 +0xe7 0x00 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v0, 12 +0xe7 0x00 0x03 0xc0 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v15, 0 +0xe7 0x00 0x03 0x00 0xf0 0xb8 + +# CHECK: vmslg %v0, %v0, %v0, %v31, 0 +0xe7 0x00 0x03 0x00 0xf1 0xb8 + +# CHECK: vmslg %v0, %v0, %v15, %v0, 0 +0xe7 0x00 0xf3 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v0, %v31, %v0, 0 +0xe7 0x00 0xf3 0x00 0x02 0xb8 + +# CHECK: vmslg %v0, %v15, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v0, %v31, %v0, %v0, 0 +0xe7 0x0f 0x03 0x00 0x04 0xb8 + +# CHECK: vmslg %v15, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x00 0xb8 + +# CHECK: vmslg %v31, %v0, %v0, %v0, 0 +0xe7 0xf0 0x03 0x00 0x08 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 4 +0xe7 0x23 0x43 0x40 0x5a 0xb8 + +# CHECK: vmslg %v18, %v3, %v20, %v5, 8 +0xe7 0x23 0x43 0x80 0x5a 0xb8 + +# CHECK: vmp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x78 + +# CHECK: vmp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x78 + +# CHECK: vmp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x78 + +# CHECK: vmp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x78 + +# CHECK: vmp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x78 + +# CHECK: vmp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x78 + +# CHECK: vmsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x79 + +# CHECK: vmsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x79 + +# CHECK: vmsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x79 + +# CHECK: vmsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x79 + +# CHECK: vmsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x79 + +# CHECK: vmsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x79 + +# CHECK: vnn %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6e + +# CHECK: vnn %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6e + +# CHECK: vnn %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6e + +# CHECK: vnn 
%v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6e + +# CHECK: vnn %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6e + +# CHECK: vnx %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6c + +# CHECK: vnx %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6c + +# CHECK: vnx %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6c + +# CHECK: vnx %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6c + +# CHECK: vnx %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6c + +# CHECK: voc %v0, %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x6f + +# CHECK: voc %v0, %v0, %v31 +0xe7 0x00 0xf0 0x00 0x02 0x6f + +# CHECK: voc %v0, %v31, %v0 +0xe7 0x0f 0x00 0x00 0x04 0x6f + +# CHECK: voc %v31, %v0, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x6f + +# CHECK: voc %v18, %v3, %v20 +0xe7 0x23 0x40 0x00 0x0a 0x6f + +# CHECK: vpkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x34 + +# CHECK: vpkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x34 + +# CHECK: vpkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x34 + +# CHECK: vpkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x34 + +# CHECK: vpkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x34 + +# CHECK: vpkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x34 + +# CHECK: vpopctb %v0, %v0 +0xe7 0x00 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x04 0x50 + +# CHECK: vpopctb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x00 0x50 + +# CHECK: vpopctb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x08 0x50 + +# CHECK: vpopctb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x04 0x50 + +# CHECK: vpopctf %v0, %v0 +0xe7 0x00 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v15 +0xe7 0x0f 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0x50 + +# CHECK: vpopctf %v15, %v0 +0xe7 0xf0 0x00 0x00 0x20 0x50 + +# CHECK: vpopctf %v31, %v0 +0xe7 0xf0 0x00 0x00 0x28 0x50 + +# CHECK: vpopctf %v14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0x50 + +# CHECK: vpopctg %v0, %v0 +0xe7 0x00 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v15 +0xe7 0x0f 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v0, %v31 +0xe7 0x0f 0x00 0x00 0x34 0x50 + +# CHECK: vpopctg %v15, %v0 +0xe7 0xf0 0x00 0x00 0x30 0x50 + +# CHECK: vpopctg %v31, %v0 +0xe7 0xf0 0x00 0x00 0x38 0x50 + +# CHECK: vpopctg %v14, %v17 +0xe7 0xe1 0x00 0x00 0x34 0x50 + +# CHECK: vpopcth %v0, %v0 +0xe7 0x00 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v15 +0xe7 0x0f 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v0, %v31 +0xe7 0x0f 0x00 0x00 0x14 0x50 + +# CHECK: vpopcth %v15, %v0 +0xe7 0xf0 0x00 0x00 0x10 0x50 + +# CHECK: vpopcth %v31, %v0 +0xe7 0xf0 0x00 0x00 0x18 0x50 + +# CHECK: vpopcth %v14, %v17 +0xe7 0xe1 0x00 0x00 0x14 0x50 + +# CHECK: vpsop %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x5b + +# CHECK: vpsop %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x5b + +# CHECK: vpsop %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x5b + +# CHECK: vpsop %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x5b + +# CHECK: vpsop %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x5b + +# CHECK: vrp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7b + +# CHECK: vrp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7b + +# CHECK: vrp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7b + +# CHECK: vrp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7b + +# CHECK: vrp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7b + +# CHECK: vrp %v13, %v17, %v21, 121, 11 
+0xe6 0xd1 0x50 0xb7 0x96 0x7b + +# CHECK: vsdp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x7e + +# CHECK: vsdp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x7e + +# CHECK: vsdp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x7e + +# CHECK: vsdp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x7e + +# CHECK: vsdp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x7e + +# CHECK: vsdp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x7e + +# CHECK: vsp %v0, %v0, %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x73 + +# CHECK: vsp %v0, %v0, %v0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x73 + +# CHECK: vsp %v0, %v0, %v0, 255, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x73 + +# CHECK: vsp %v0, %v0, %v31, 0, 0 +0xe6 0x00 0xf0 0x00 0x02 0x73 + +# CHECK: vsp %v0, %v31, %v0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x73 + +# CHECK: vsp %v31, %v0, %v0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x73 + +# CHECK: vsp %v13, %v17, %v21, 121, 11 +0xe6 0xd1 0x50 0xb7 0x96 0x73 + +# CHECK: vsrp %v0, %v0, 0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 0, 15 +0xe6 0x00 0x00 0xf0 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 0, 255, 0 +0xe6 0x00 0xff 0x00 0x00 0x59 + +# CHECK: vsrp %v0, %v0, 255, 0, 0 +0xe6 0x00 0x00 0x0f 0xf0 0x59 + +# CHECK: vsrp %v0, %v31, 0, 0, 0 +0xe6 0x0f 0x00 0x00 0x04 0x59 + +# CHECK: vsrp %v31, %v0, 0, 0, 0 +0xe6 0xf0 0x00 0x00 0x08 0x59 + +# CHECK: vsrp %v13, %v17, 52, 121, 11 +0xe6 0xd1 0x79 0xb3 0x44 0x59 + +# CHECK: vstrl %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3d + +# CHECK: vstrl %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3d + +# CHECK: vstrl %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3d + +# CHECK: vstrl %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3d + +# CHECK: vstrl %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3d + +# CHECK: vstrl %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3d + +# CHECK: vstrlr %v0, %r0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 4095 +0xe6 0x00 0x0f 0xff 0x00 0x3f + +# CHECK: vstrlr %v0, %r0, 0(%r15) +0xe6 0x00 0xf0 0x00 0x00 0x3f + +# CHECK: vstrlr %v0, %r15, 0 +0xe6 0x0f 0x00 0x00 0x00 0x3f + +# CHECK: vstrlr %v15, %r0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3f + +# CHECK: vstrlr %v31, %r0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3f + +# CHECK: vstrlr %v18, %r3, 1383(%r4) +0xe6 0x03 0x45 0x67 0x21 0x3f + +# CHECK: vtp %v0 +0xe6 0x00 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v15 +0xe6 0x0f 0x00 0x00 0x00 0x5f + +# CHECK: vtp %v31 +0xe6 0x0f 0x00 0x00 0x04 0x5f + +# CHECK: vupkz %v0, 0, 0 +0xe6 0x00 0x00 0x00 0x00 0x3c + +# CHECK: vupkz %v0, 4095, 0 +0xe6 0x00 0x0f 0xff 0x00 0x3c + +# CHECK: vupkz %v0, 0(%r15), 0 +0xe6 0x00 0xf0 0x00 0x00 0x3c + +# CHECK: vupkz %v0, 0, 255 +0xe6 0xff 0x00 0x00 0x00 0x3c + +# CHECK: vupkz %v15, 0, 0 +0xe6 0x00 0x00 0x00 0xf0 0x3c + +# CHECK: vupkz %v31, 0, 0 +0xe6 0x00 0x00 0x00 0xf1 0x3c + +# CHECK: vupkz %v18, 1383(%r4), 3 +0xe6 0x03 0x45 0x67 0x21 0x3c + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe3 + +# CHECK: wfasb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe3 + +# CHECK: wfasb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe3 + +# CHECK: wfasb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe3 + +# CHECK: wfasb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe3 + +# CHECK: wfaxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe3 + +# CHECK: wfaxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe3 + +# CHECK: wfaxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe3 + +# CHECK: wfaxb %v31, 
%v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe3 + +# CHECK: wfaxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe3 + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %f0, %v31 +0xe7 0x0f 0x00 0x00 0x24 0xcb + +# CHECK: wfcsb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xcb + +# CHECK: wfcsb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xcb + +# CHECK: wfcsb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xcb + +# CHECK: wfcxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xcb + +# CHECK: wfcxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xcb + +# CHECK: wfcxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xcb + +# CHECK: wfcxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xcb + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe8 + +# CHECK: wfcesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe8 + +# CHECK: wfcesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe8 + +# CHECK: wfcesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe8 + +# CHECK: wfcesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xe8 + +# CHECK: wfcesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xe8 + +# CHECK: wfcesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xe8 + +# CHECK: wfcesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xe8 + +# CHECK: wfcesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xe8 + +# CHECK: wfcexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe8 + +# CHECK: wfcexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe8 + +# CHECK: wfcexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe8 + +# CHECK: wfcexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe8 + +# CHECK: wfcexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xe8 + +# CHECK: wfcexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xe8 + +# CHECK: wfcexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xe8 + +# CHECK: wfcexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xe8 + +# CHECK: wfcexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xe8 + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xeb + +# CHECK: wfchsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xeb + +# CHECK: wfchsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xeb + +# CHECK: wfchsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xeb + +# CHECK: wfchsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xeb + +# CHECK: wfchsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xeb + +# CHECK: wfchsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xeb + +# CHECK: wfchsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xeb + +# CHECK: wfchsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xeb + +# CHECK: wfchxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xeb + +# CHECK: wfchxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xeb + +# CHECK: wfchxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xeb + +# CHECK: wfchxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xeb + +# CHECK: wfchxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xeb + +# CHECK: wfchxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xeb + +# CHECK: wfchxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xeb + +# 
CHECK: wfchxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xeb + +# CHECK: wfchxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xeb + +# CHECK: wfchxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xeb + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xea + +# CHECK: wfchesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xea + +# CHECK: wfchesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xea + +# CHECK: wfchesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xea + +# CHECK: wfchesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xea + +# CHECK: wfchesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x18 0x22 0xea + +# CHECK: wfchesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x18 0x24 0xea + +# CHECK: wfchesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xea + +# CHECK: wfchesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x18 0x2a 0xea + +# CHECK: wfchexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xea + +# CHECK: wfchexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xea + +# CHECK: wfchexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xea + +# CHECK: wfchexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xea + +# CHECK: wfchexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xea + +# CHECK: wfchexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xea + +# CHECK: wfchexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x18 0x42 0xea + +# CHECK: wfchexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x18 0x44 0xea + +# CHECK: wfchexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xea + +# CHECK: wfchexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x18 0x4a 0xea + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe5 + +# CHECK: wfdsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe5 + +# CHECK: wfdsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe5 + +# CHECK: wfdsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe5 + +# CHECK: wfdsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe5 + +# CHECK: wfdxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe5 + +# CHECK: wfdxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe5 + +# CHECK: wfdxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe5 + +# CHECK: wfdxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe5 + +# CHECK: wfdxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe5 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 0 +0xe7 0x00 0x00 0x08 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 4, 0 +0xe7 0x00 0x00 0x0c 0x20 0xc7 + +# CHECK: wfisb %f0, %f0, 7, 0 +0xe7 0x00 0x00 0x0f 0x20 0xc7 + +# CHECK: wfisb %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xc7 + +# CHECK: wfisb %v31, %f0, 0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xc7 + +# CHECK: wfisb %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x24 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc7 + +# CHECK: wfixb %v0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc7 + +# CHECK: wfixb %v0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc7 + +# CHECK: wfixb %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc7 + +# CHECK: wfixb %v14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc7 + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %f0 +0xe7 0x00 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %f15 +0xe7 0x0f 0x00 0x00 0x20 0xca + +# CHECK: wfksb %f0, %v31 +0xe7 0x0f 0x00 0x00 
0x24 0xca + +# CHECK: wfksb %f15, %f0 +0xe7 0xf0 0x00 0x00 0x20 0xca + +# CHECK: wfksb %v31, %f0 +0xe7 0xf0 0x00 0x00 0x28 0xca + +# CHECK: wfksb %f14, %v17 +0xe7 0xe1 0x00 0x00 0x24 0xca + +# CHECK: wfkxb %v0, %v0 +0xe7 0x00 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v15 +0xe7 0x0f 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v0, %v31 +0xe7 0x0f 0x00 0x00 0x44 0xca + +# CHECK: wfkxb %v15, %v0 +0xe7 0xf0 0x00 0x00 0x40 0xca + +# CHECK: wfkxb %v31, %v0 +0xe7 0xf0 0x00 0x00 0x48 0xca + +# CHECK: wfkxb %v14, %v17 +0xe7 0xe1 0x00 0x00 0x44 0xca + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xe8 + +# CHECK: wfkedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xe8 + +# CHECK: wfkedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xe8 + +# CHECK: wfkedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xe8 + +# CHECK: wfkedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xe8 + +# CHECK: wfkedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xe8 + +# CHECK: wfkedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xe8 + +# CHECK: wfkedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xe8 + +# CHECK: wfkedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xe8 + +# CHECK: wfkesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xe8 + +# CHECK: wfkesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xe8 + +# CHECK: wfkesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xe8 + +# CHECK: wfkesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xe8 + +# CHECK: wfkesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xe8 + +# CHECK: wfkesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xe8 + +# CHECK: wfkesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xe8 + +# CHECK: wfkesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xe8 + +# CHECK: wfkexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xe8 + +# CHECK: wfkexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xe8 + +# CHECK: wfkexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xe8 + +# CHECK: wfkexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xe8 + +# CHECK: wfkexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xe8 + +# CHECK: wfkexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xe8 + +# CHECK: wfkexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xe8 + +# CHECK: wfkexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xe8 + +# CHECK: wfkexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xe8 + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xeb + +# CHECK: wfkhdb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xeb + +# CHECK: wfkhdb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xeb + +# CHECK: wfkhdb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xeb + +# CHECK: wfkhdb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xeb + +# CHECK: wfkhdbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xeb + +# CHECK: wfkhdbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xeb + +# CHECK: wfkhdbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xeb + +# CHECK: wfkhdbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xeb + +# 
CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xeb + +# CHECK: wfkhsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xeb + +# CHECK: wfkhsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xeb + +# CHECK: wfkhsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xeb + +# CHECK: wfkhsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xeb + +# CHECK: wfkhsbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xeb + +# CHECK: wfkhsbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xeb + +# CHECK: wfkhsbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xeb + +# CHECK: wfkhsbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xeb + +# CHECK: wfkhxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xeb + +# CHECK: wfkhxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xeb + +# CHECK: wfkhxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xeb + +# CHECK: wfkhxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xeb + +# CHECK: wfkhxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xeb + +# CHECK: wfkhxbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xeb + +# CHECK: wfkhxbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 0xeb + +# CHECK: wfkhxbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xeb + +# CHECK: wfkhxbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xeb + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x30 0xea + +# CHECK: wfkhedb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x32 0xea + +# CHECK: wfkhedb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x34 0xea + +# CHECK: wfkhedb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x38 0xea + +# CHECK: wfkhedb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x3a 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x30 0xea + +# CHECK: wfkhedbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x32 0xea + +# CHECK: wfkhedbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x34 0xea + +# CHECK: wfkhedbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x38 0xea + +# CHECK: wfkhedbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x3a 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %f0 +0xe7 0x00 0x00 0x0c 0x20 0xea + +# CHECK: wfkhesb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x0c 0x22 0xea + +# CHECK: wfkhesb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x0c 0x24 0xea + +# CHECK: wfkhesb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x0c 0x28 0xea + +# CHECK: wfkhesb %v18, %f3, %v20 +0xe7 0x23 0x40 0x0c 0x2a 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %f0 +0xe7 0x00 0x00 0x1c 0x20 0xea + +# CHECK: wfkhesbs %f0, %f0, %v31 +0xe7 0x00 0xf0 0x1c 0x22 0xea + +# CHECK: wfkhesbs %f0, %v31, %f0 +0xe7 0x0f 0x00 0x1c 0x24 0xea + +# CHECK: wfkhesbs %v31, %f0, %f0 +0xe7 0xf0 0x00 0x1c 0x28 0xea + +# CHECK: wfkhesbs %v18, %f3, %v20 +0xe7 0x23 0x40 0x1c 0x2a 0xea + +# CHECK: wfkhexb %v0, %v0, %v0 +0xe7 0x00 0x00 0x0c 0x40 0xea + +# CHECK: wfkhexb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x0c 0x42 0xea + +# CHECK: wfkhexb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x0c 0x44 0xea + +# CHECK: wfkhexb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x0c 0x48 0xea + +# CHECK: wfkhexb %v18, %v3, %v20 +0xe7 0x23 0x40 0x0c 0x4a 0xea + +# CHECK: wfkhexbs %v0, %v0, %v0 +0xe7 0x00 0x00 0x1c 0x40 0xea + +# CHECK: wfkhexbs %v0, %v0, %v31 +0xe7 0x00 0xf0 0x1c 0x42 0xea + +# CHECK: wfkhexbs %v0, %v31, %v0 +0xe7 0x0f 0x00 0x1c 0x44 
0xea + +# CHECK: wfkhexbs %v31, %v0, %v0 +0xe7 0xf0 0x00 0x1c 0x48 0xea + +# CHECK: wfkhexbs %v18, %v3, %v20 +0xe7 0x23 0x40 0x1c 0x4a 0xea + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 3 +0xe7 0x00 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %f0, 15 +0xe7 0x00 0x00 0xf8 0x20 0xcc + +# CHECK: wfpsosb %f0, %f15, 3 +0xe7 0x0f 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %f0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x24 0xcc + +# CHECK: wfpsosb %f15, %f0, 3 +0xe7 0xf0 0x00 0x38 0x20 0xcc + +# CHECK: wfpsosb %v31, %f0, 3 +0xe7 0xf0 0x00 0x38 0x28 0xcc + +# CHECK: wfpsosb %f14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x24 0xcc + +# CHECK: wfpsoxb %v0, %v0, 3 +0xe7 0x00 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v15, 3 +0xe7 0x0f 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v0, %v31, 3 +0xe7 0x0f 0x00 0x38 0x44 0xcc + +# CHECK: wfpsoxb %v15, %v0, 3 +0xe7 0xf0 0x00 0x38 0x40 0xcc + +# CHECK: wfpsoxb %v31, %v0, 3 +0xe7 0xf0 0x00 0x38 0x48 0xcc + +# CHECK: wfpsoxb %v14, %v17, 7 +0xe7 0xe1 0x00 0x78 0x44 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xcc + +# CHECK: wflcsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xcc + +# CHECK: wflcsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xcc + +# CHECK: wflcsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xcc + +# CHECK: wflcxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xcc + +# CHECK: wflcxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xcc + +# CHECK: wflcxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xcc + +# CHECK: wflcxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f0 +0xe7 0x00 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %f15 +0xe7 0x0f 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %f0, %v31 +0xe7 0x0f 0x00 0x18 0x24 0xcc + +# CHECK: wflnsb %f15, %f0 +0xe7 0xf0 0x00 0x18 0x20 0xcc + +# CHECK: wflnsb %v31, %f0 +0xe7 0xf0 0x00 0x18 0x28 0xcc + +# CHECK: wflnsb %f14, %v17 +0xe7 0xe1 0x00 0x18 0x24 0xcc + +# CHECK: wflnxb %v0, %v0 +0xe7 0x00 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v15 +0xe7 0x0f 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v0, %v31 +0xe7 0x0f 0x00 0x18 0x44 0xcc + +# CHECK: wflnxb %v15, %v0 +0xe7 0xf0 0x00 0x18 0x40 0xcc + +# CHECK: wflnxb %v31, %v0 +0xe7 0xf0 0x00 0x18 0x48 0xcc + +# CHECK: wflnxb %v14, %v17 +0xe7 0xe1 0x00 0x18 0x44 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %f0 +0xe7 0x00 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %f15 +0xe7 0x0f 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %f0, %v31 +0xe7 0x0f 0x00 0x28 0x24 0xcc + +# CHECK: wflpsb %f15, %f0 +0xe7 0xf0 0x00 0x28 0x20 0xcc + +# CHECK: wflpsb %v31, %f0 +0xe7 0xf0 0x00 0x28 0x28 0xcc + +# CHECK: wflpsb %f14, %v17 +0xe7 0xe1 0x00 0x28 0x24 0xcc + +# CHECK: wflpxb %v0, %v0 +0xe7 0x00 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v15 +0xe7 0x0f 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v0, %v31 +0xe7 0x0f 0x00 0x28 0x44 0xcc + +# CHECK: wflpxb %v15, %v0 +0xe7 0xf0 0x00 0x28 0x40 0xcc + +# CHECK: wflpxb %v31, %v0 +0xe7 0xf0 0x00 0x28 0x48 0xcc + +# CHECK: wflpxb %v14, %v17 +0xe7 0xe1 0x00 0x28 0x44 0xcc + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %f0 +0xe7 0x00 0x00 0x08 0x30 0xc4 + 
+# CHECK: wflld %v0, %f15 +0xe7 0x0f 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v0, %v31 +0xe7 0x0f 0x00 0x08 0x34 0xc4 + +# CHECK: wflld %v15, %f0 +0xe7 0xf0 0x00 0x08 0x30 0xc4 + +# CHECK: wflld %v31, %f0 +0xe7 0xf0 0x00 0x08 0x38 0xc4 + +# CHECK: wflld %v14, %v17 +0xe7 0xe1 0x00 0x08 0x34 0xc4 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 0 +0xe7 0x00 0x00 0x08 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 0, 15 +0xe7 0x00 0x00 0xf8 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 4, 0 +0xe7 0x00 0x00 0x0c 0x40 0xc5 + +# CHECK: wflrx %f0, %v0, 7, 0 +0xe7 0x00 0x00 0x0f 0x40 0xc5 + +# CHECK: wflrx %f0, %v31, 0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xc5 + +# CHECK: wflrx %v31, %v0, 0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xc5 + +# CHECK: wflrx %f14, %v17, 4, 10 +0xe7 0xe1 0x00 0xac 0x44 0xc5 + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xef + +# CHECK: wfmaxdb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xef + +# CHECK: wfmaxdb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xef + +# CHECK: wfmaxdb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xef + +# CHECK: wfmaxdb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xef + +# CHECK: wfmaxsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xef + +# CHECK: wfmaxsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xef + +# CHECK: wfmaxsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xef + +# CHECK: wfmaxsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xef + +# CHECK: wfmaxxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xef + +# CHECK: wfmaxxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xef + +# CHECK: wfmaxxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xef + +# CHECK: wfmaxxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x4a 0xef + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x30 0xee + +# CHECK: wfmindb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x32 0xee + +# CHECK: wfmindb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x34 0xee + +# CHECK: wfmindb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x38 0xee + +# CHECK: wfmindb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x3a 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %f0, 4 +0xe7 0x00 0x00 0x48 0x20 0xee + +# CHECK: wfminsb %f0, %f0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x22 0xee + +# CHECK: wfminsb %f0, %v31, %f0, 0 +0xe7 0x0f 0x00 0x08 0x24 0xee + +# CHECK: wfminsb %v31, %f0, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0xee + +# CHECK: wfminsb %v18, %f3, %v20, 11 +0xe7 0x23 0x40 0xb8 0x2a 0xee + +# CHECK: wfminxb %v0, %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0xee + +# CHECK: wfminxb %v0, %v0, %v0, 4 +0xe7 0x00 0x00 0x48 0x40 0xee + +# CHECK: wfminxb %v0, %v0, %v31, 0 +0xe7 0x00 0xf0 0x08 0x42 0xee + +# CHECK: wfminxb %v0, %v31, %v0, 0 +0xe7 0x0f 0x00 0x08 0x44 0xee + +# CHECK: wfminxb %v31, %v0, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0xee + +# CHECK: wfminxb %v18, %v3, %v20, 11 +0xe7 0x23 0x40 
0xb8 0x4a 0xee + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8f + +# CHECK: wfmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8f + +# CHECK: wfmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8f + +# CHECK: wfmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8f + +# CHECK: wfmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8f + +# CHECK: wfmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8f + +# CHECK: wfmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8f + +# CHECK: wfmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8f + +# CHECK: wfmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8f + +# CHECK: wfmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8f + +# CHECK: wfmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8f + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe7 + +# CHECK: wfmsb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe7 + +# CHECK: wfmsb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe7 + +# CHECK: wfmsb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe7 + +# CHECK: wfmsb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe7 + +# CHECK: wfmxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe7 + +# CHECK: wfmxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe7 + +# CHECK: wfmxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe7 + +# CHECK: wfmxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe7 + +# CHECK: wfmxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe7 + +# CHECK: wfmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x8e + +# CHECK: wfmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x8e + +# CHECK: wfmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x8e + +# CHECK: wfmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x8e + +# CHECK: wfmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x8e + +# CHECK: wfmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x8e + +# CHECK: wfmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x8e + +# CHECK: wfmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x8e + +# CHECK: wfmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x8e + +# CHECK: wfmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x8e + +# CHECK: wfmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x8e + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9f + +# CHECK: wfnmadb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9f + +# CHECK: wfnmadb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9f + +# CHECK: wfnmadb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9f + +# CHECK: wfnmadb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9f + +# CHECK: wfnmadb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9f + +# CHECK: wfnmasb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9f + +# CHECK: wfnmasb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9f + +# CHECK: wfnmasb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9f + +# CHECK: wfnmasb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9f + +# CHECK: wfnmasb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9f + +# CHECK: 
wfnmaxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x9f + +# CHECK: wfnmaxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9f + +# CHECK: wfnmaxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9f + +# CHECK: wfnmaxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9f + +# CHECK: wfnmaxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9f + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %f0 +0xe7 0x00 0x03 0x08 0x00 0x9e + +# CHECK: wfnmsdb %f0, %f0, %f0, %v31 +0xe7 0x00 0x03 0x08 0xf1 0x9e + +# CHECK: wfnmsdb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf3 0x08 0x02 0x9e + +# CHECK: wfnmsdb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x03 0x08 0x04 0x9e + +# CHECK: wfnmsdb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x03 0x08 0x08 0x9e + +# CHECK: wfnmsdb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x53 0x08 0x97 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %f0 +0xe7 0x00 0x02 0x08 0x00 0x9e + +# CHECK: wfnmssb %f0, %f0, %f0, %v31 +0xe7 0x00 0x02 0x08 0xf1 0x9e + +# CHECK: wfnmssb %f0, %f0, %v31, %f0 +0xe7 0x00 0xf2 0x08 0x02 0x9e + +# CHECK: wfnmssb %f0, %v31, %f0, %f0 +0xe7 0x0f 0x02 0x08 0x04 0x9e + +# CHECK: wfnmssb %v31, %f0, %f0, %f0 +0xe7 0xf0 0x02 0x08 0x08 0x9e + +# CHECK: wfnmssb %f13, %v17, %v21, %v25 +0xe7 0xd1 0x52 0x08 0x97 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v0 +0xe7 0x00 0x04 0x08 0x00 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v0, %v31 +0xe7 0x00 0x04 0x08 0xf1 0x9e + +# CHECK: wfnmsxb %v0, %v0, %v31, %v0 +0xe7 0x00 0xf4 0x08 0x02 0x9e + +# CHECK: wfnmsxb %v0, %v31, %v0, %v0 +0xe7 0x0f 0x04 0x08 0x04 0x9e + +# CHECK: wfnmsxb %v31, %v0, %v0, %v0 +0xe7 0xf0 0x04 0x08 0x08 0x9e + +# CHECK: wfnmsxb %v13, %v17, %v21, %v25 +0xe7 0xd1 0x54 0x08 0x97 0x9e + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xe2 + +# CHECK: wfssb %f0, %f0, %v31 +0xe7 0x00 0xf0 0x08 0x22 0xe2 + +# CHECK: wfssb %f0, %v31, %f0 +0xe7 0x0f 0x00 0x08 0x24 0xe2 + +# CHECK: wfssb %v31, %f0, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xe2 + +# CHECK: wfssb %v18, %f3, %v20 +0xe7 0x23 0x40 0x08 0x2a 0xe2 + +# CHECK: wfsxb %v0, %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xe2 + +# CHECK: wfsxb %v0, %v0, %v31 +0xe7 0x00 0xf0 0x08 0x42 0xe2 + +# CHECK: wfsxb %v0, %v31, %v0 +0xe7 0x0f 0x00 0x08 0x44 0xe2 + +# CHECK: wfsxb %v31, %v0, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xe2 + +# CHECK: wfsxb %v18, %v3, %v20 +0xe7 0x23 0x40 0x08 0x4a 0xe2 + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f0 +0xe7 0x00 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %f15 +0xe7 0x0f 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %f0, %v31 +0xe7 0x0f 0x00 0x08 0x24 0xce + +# CHECK: wfsqsb %f15, %f0 +0xe7 0xf0 0x00 0x08 0x20 0xce + +# CHECK: wfsqsb %v31, %f0 +0xe7 0xf0 0x00 0x08 0x28 0xce + +# CHECK: wfsqsb %f14, %v17 +0xe7 0xe1 0x00 0x08 0x24 0xce + +# CHECK: wfsqxb %v0, %v0 +0xe7 0x00 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v15 +0xe7 0x0f 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v0, %v31 +0xe7 0x0f 0x00 0x08 0x44 0xce + +# CHECK: wfsqxb %v15, %v0 +0xe7 0xf0 0x00 0x08 0x40 0xce + +# CHECK: wfsqxb %v31, %v0 +0xe7 0xf0 0x00 0x08 0x48 0xce + +# CHECK: wfsqxb %v14, %v17 +0xe7 0xe1 0x00 0x08 0x44 0xce + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 0 +0xe7 0x00 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %f0, 4095 +0xe7 0x00 0xff 0xf8 0x20 0x4a + +# CHECK: wftcisb %f0, %f15, 0 +0xe7 
0x0f 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %f0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x24 0x4a + +# CHECK: wftcisb %f15, %f0, 0 +0xe7 0xf0 0x00 0x08 0x20 0x4a + +# CHECK: wftcisb %v31, %f0, 0 +0xe7 0xf0 0x00 0x08 0x28 0x4a + +# CHECK: wftcisb %f4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x24 0x4a + +# CHECK: wftcixb %v0, %v0, 0 +0xe7 0x00 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v0, 4095 +0xe7 0x00 0xff 0xf8 0x40 0x4a + +# CHECK: wftcixb %v0, %v15, 0 +0xe7 0x0f 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v0, %v31, 0 +0xe7 0x0f 0x00 0x08 0x44 0x4a + +# CHECK: wftcixb %v15, %v0, 0 +0xe7 0xf0 0x00 0x08 0x40 0x4a + +# CHECK: wftcixb %v31, %v0, 0 +0xe7 0xf0 0x00 0x08 0x48 0x4a + +# CHECK: wftcixb %v4, %v21, 1656 +0xe7 0x45 0x67 0x88 0x44 0x4a + diff --git a/test/MC/Mips/mt/invalid-wrong-error.s b/test/MC/Mips/mt/invalid-wrong-error.s deleted file mode 100644 index 0247089b70ae..000000000000 --- a/test/MC/Mips/mt/invalid-wrong-error.s +++ /dev/null @@ -1,3 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt < %s 2>&1 | FileCheck %s - mftr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list - mttr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/invalid.s b/test/MC/Mips/mt/invalid.s index d4055c4a50f4..5a145a7e0850 100644 --- a/test/MC/Mips/mt/invalid.s +++ b/test/MC/Mips/mt/invalid.s @@ -1,27 +1,13 @@ # RUN: not llvm-mc -arch=mips -mcpu=mips32 -mattr=+mt < %s 2>&1 | FileCheck %s - dmt 4 # CHECK: error: invalid operand for instruction - dmt $4, $5 # CHECK: error: invalid operand for instruction - dmt $5, 0($4) # CHECK: error: invalid operand for instruction - emt 4 # CHECK: error: invalid operand for instruction - emt $4, $5 # CHECK: error: invalid operand for instruction - emt $5, 0($5) # CHECK: error: invalid operand for instruction - dvpe 4 # CHECK: error: invalid operand for instruction - dvpe $4, $5 # CHECK: error: invalid operand for instruction - dvpe $5, 0($4) # CHECK: error: invalid operand for instruction - evpe 4 # CHECK: error: invalid operand for instruction - evpe $4, $5 # CHECK: error: invalid operand for instruction - evpe $5, 0($5) # CHECK: error: invalid operand for instruction - mftr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mftr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mftr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate - mftr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mftr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction - mttr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate - mttr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate - mttr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate + dmt 4 # CHECK: error: invalid operand for instruction + dmt $4, $5 # CHECK: error: invalid operand for instruction + dmt $5, 0($4) # CHECK: error: invalid operand for instruction + emt 4 # CHECK: error: invalid operand for instruction + emt $4, $5 # CHECK: error: invalid operand for instruction + emt $5, 0($5) # CHECK: error: 
invalid operand for instruction + dvpe 4 # CHECK: error: invalid operand for instruction + dvpe $4, $5 # CHECK: error: invalid operand for instruction + dvpe $5, 0($4) # CHECK: error: invalid operand for instruction + evpe 4 # CHECK: error: invalid operand for instruction + evpe $4, $5 # CHECK: error: invalid operand for instruction + evpe $5, 0($5) # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s deleted file mode 100644 index 4e872412e6ef..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s +++ /dev/null @@ -1,18 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - -# The integrated assembler produces a wrong or misleading error message. - - mftc0 0($4), $5 # CHECK: error: unexpected token in argument list - mftc0 0($4), $5, 1 # CHECK: error: unexpected token in argument list - mftgpr 0($4), $5 # CHECK: error: unexpected token in argument list - mftlo 0($3) # CHECK: error: unexpected token in argument list - mftlo 0($3), $ac1 # CHECK: error: unexpected token in argument list - mfthi 0($3) # CHECK: error: unexpected token in argument list - mfthi 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftacx 0($3) # CHECK: error: unexpected token in argument list - mftacx 0($3), $ac1 # CHECK: error: unexpected token in argument list - mftdsp 0($4) # CHECK: error: unexpected token in argument list - mftc1 0($4), $f4 # CHECK: error: unexpected token in argument list - mfthc1 0($4), $f4 # CHECK: error: unexpected token in argument list - cftc1 0($4), $f8 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s deleted file mode 100644 index 06ae8c72e654..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s +++ /dev/null @@ -1,23 +0,0 @@ -# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ -# RUN: 2>&1 | FileCheck %s - - mftc0 $4, 0($5) # CHECK: error: invalid operand for instruction - mftc0 $4, 0($5), 1 # CHECK: error: invalid operand for instruction - mftc0 $4, $5, -1 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, 9 # CHECK: error: expected 3-bit unsigned immediate - mftc0 $4, $5, $6 # CHECK: error: expected 3-bit unsigned immediate - mftgpr $4, 0($5) # CHECK: error: invalid operand for instruction - mftgpr $4, $5, $6 # CHECK: error: invalid operand for instruction - mftlo $3, 0($ac1) # CHECK: error: invalid operand for instruction - mftlo $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mfthi $3, 0($ac1) # CHECK: error: invalid operand for instruction - mfthi $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftacx $3, 0($ac1) # CHECK: error: invalid operand for instruction - mftacx $4, $ac1, $4 # CHECK: error: invalid operand for instruction - mftdsp $4, $5 # CHECK: error: invalid operand for instruction - mftdsp $4, $f5 # CHECK: error: invalid operand for instruction - mftdsp $4, $ac0 # CHECK: error: invalid operand for instruction - mftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - mfthc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction - cftc1 $4, $f4, $5 # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases.s b/test/MC/Mips/mt/mftr-mttr-aliases.s deleted file 
mode 100644 index 92ed9f9281f2..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-aliases.s +++ /dev/null @@ -1,47 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# Check the various aliases of the m[ft]tr instruction. - - mftc0 $4, $5 # CHECK: mftr $4, $5, 0, 0, 0 # encoding: [0x41,0x05,0x20,0x00] - mftc0 $6, $7, 1 # CHECK: mftr $6, $7, 0, 1, 0 # encoding: [0x41,0x07,0x30,0x01] - mftgpr $5, $9 # CHECK: mftr $5, $9, 1, 0, 0 # encoding: [0x41,0x09,0x28,0x20] - mftlo $3 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac0 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] - mftlo $3, $ac1 # CHECK: mftr $3, $4, 1, 1, 0 # encoding: [0x41,0x04,0x18,0x21] - mftlo $3, $ac2 # CHECK: mftr $3, $8, 1, 1, 0 # encoding: [0x41,0x08,0x18,0x21] - mftlo $3, $ac3 # CHECK: mftr $3, $12, 1, 1, 0 # encoding: [0x41,0x0c,0x18,0x21] - mfthi $3, $ac0 # CHECK: mftr $3, $1, 1, 1, 0 # encoding: [0x41,0x01,0x18,0x21] - mfthi $3, $ac1 # CHECK: mftr $3, $5, 1, 1, 0 # encoding: [0x41,0x05,0x18,0x21] - mfthi $3, $ac2 # CHECK: mftr $3, $9, 1, 1, 0 # encoding: [0x41,0x09,0x18,0x21] - mfthi $3, $ac3 # CHECK: mftr $3, $13, 1, 1, 0 # encoding: [0x41,0x0d,0x18,0x21] - mftacx $3, $ac0 # CHECK: mftr $3, $2, 1, 1, 0 # encoding: [0x41,0x02,0x18,0x21] - mftacx $3, $ac1 # CHECK: mftr $3, $6, 1, 1, 0 # encoding: [0x41,0x06,0x18,0x21] - mftacx $3, $ac2 # CHECK: mftr $3, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x18,0x21] - mftacx $3, $ac3 # CHECK: mftr $3, $14, 1, 1, 0 # encoding: [0x41,0x0e,0x18,0x21] - mftdsp $4 # CHECK: mftr $4, $16, 1, 1, 0 # encoding: [0x41,0x10,0x20,0x21] - mftc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 0 # encoding: [0x41,0x05,0x20,0x22] - mfthc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 1 # encoding: [0x41,0x05,0x20,0x32] - cftc1 $4, $f9 # CHECK: mftr $4, $9, 1, 3, 0 # encoding: [0x41,0x09,0x20,0x23] - - mttc0 $4, $5 # CHECK: mttr $4, $5, 0, 0, 0 # encoding: [0x41,0x84,0x28,0x00] - mttc0 $6, $7, 1 # CHECK: mttr $6, $7, 0, 1, 0 # encoding: [0x41,0x86,0x38,0x01] - mttgpr $5, $9 # CHECK: mttr $5, $9, 1, 0, 0 # encoding: [0x41,0x85,0x48,0x20] - mttlo $3 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac0 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] - mttlo $3, $ac1 # CHECK: mttr $3, $4, 1, 1, 0 # encoding: [0x41,0x83,0x20,0x21] - mttlo $3, $ac2 # CHECK: mttr $3, $8, 1, 1, 0 # encoding: [0x41,0x83,0x40,0x21] - mttlo $3, $ac3 # CHECK: mttr $3, $12, 1, 1, 0 # encoding: [0x41,0x83,0x60,0x21] - mtthi $3 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac0 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] - mtthi $3, $ac1 # CHECK: mttr $3, $5, 1, 1, 0 # encoding: [0x41,0x83,0x28,0x21] - mtthi $3, $ac2 # CHECK: mttr $3, $9, 1, 1, 0 # encoding: [0x41,0x83,0x48,0x21] - mtthi $3, $ac3 # CHECK: mttr $3, $13, 1, 1, 0 # encoding: [0x41,0x83,0x68,0x21] - mttacx $3 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac0 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] - mttacx $3, $ac1 # CHECK: mttr $3, $6, 1, 1, 0 # encoding: [0x41,0x83,0x30,0x21] - mttacx $3, $ac2 # CHECK: mttr $3, $10, 1, 1, 0 # encoding: [0x41,0x83,0x50,0x21] - mttacx $3, $ac3 # CHECK: mttr $3, $14, 1, 1, 0 # encoding: [0x41,0x83,0x70,0x21] - mttdsp $4 # CHECK: mttr $4, $16, 1, 1, 0 # encoding: [0x41,0x84,0x80,0x21] - mttc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 0 # encoding: [0x41,0x84,0x28,0x22] - mtthc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 1 # 
encoding: [0x41,0x84,0x28,0x32] - cttc1 $4, $f9 # CHECK: mttr $4, $9, 1, 3, 0 # encoding: [0x41,0x84,0x48,0x23] diff --git a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s deleted file mode 100644 index c40e81bfc7d7..000000000000 --- a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s - -# The selector value and register values here are marked as reserved in the -# documentation, but GAS accepts them without warning. - mftr $31, $31, 1, 1, 0 # CHECK: mftr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x1f,0xf8,0x21] - mttr $31, $31, 1, 1, 0 # CHECK: mttr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x9f,0xf8,0x21] - mftr $31, $13, 1, 6, 0 # CHECK: mftr $ra, $13, 1, 6, 0 # encoding: [0x41,0x0d,0xf8,0x26] - mttr $31, $13, 1, 6, 0 # CHECK: mttr $ra, $13, 1, 6, 0 # encoding: [0x41,0x9f,0x68,0x26] diff --git a/test/MC/Mips/mt/valid.s b/test/MC/Mips/mt/valid.s index 9fa07870a61f..ab1179d05c6a 100644 --- a/test/MC/Mips/mt/valid.s +++ b/test/MC/Mips/mt/valid.s @@ -1,33 +1,13 @@ # RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ # RUN: | FileCheck %s - dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] - dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1] - emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] - emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] - dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] - dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] - evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] - evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] - fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] - yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] - yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] - mftr $4, $5, 0, 2, 0 # CHECK: mftr $4, $5, 0, 2, 0 # encoding: [0x41,0x05,0x20,0x02] - mftr $4, $5, 1, 0, 0 # CHECK: mftr $4, $5, 1, 0, 0 # encoding: [0x41,0x05,0x20,0x20] - mftr $4, $0, 1, 1, 0 # CHECK: mftr $4, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x20,0x21] - mftr $4, $10, 1, 1, 0 # CHECK: mftr $4, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x20,0x21] - mftr $4, $10, 1, 2, 0 # CHECK: mftr $4, $10, 1, 2, 0 # encoding: [0x41,0x0a,0x20,0x22] - mftr $4, $10, 1, 2, 1 # CHECK: mftr $4, $10, 1, 2, 1 # encoding: [0x41,0x0a,0x20,0x32] - mftr $4, $26, 1, 3, 0 # CHECK: mftr $4, $26, 1, 3, 0 # encoding: [0x41,0x1a,0x20,0x23] - mftr $4, $31, 1, 3, 0 # CHECK: mftr $4, $ra, 1, 3, 0 # encoding: [0x41,0x1f,0x20,0x23] - mftr $4, $14, 1, 4, 0 # CHECK: mftr $4, $14, 1, 4, 0 # encoding: [0x41,0x0e,0x20,0x24] - mftr $4, $15, 1, 5, 0 # CHECK: mftr $4, $15, 1, 5, 0 # encoding: [0x41,0x0f,0x20,0x25] - mttr $4, $5, 0, 2, 0 # CHECK: mttr $4, $5, 0, 2, 0 # encoding: [0x41,0x84,0x28,0x02] - mttr $4, $5, 1, 0, 0 # CHECK: mttr $4, $5, 1, 0, 0 # encoding: [0x41,0x84,0x28,0x20] - mttr $4, $0, 1, 1, 0 # CHECK: mttr $4, $zero, 1, 1, 0 # encoding: [0x41,0x84,0x00,0x21] - mttr $4, $10, 1, 1, 0 # CHECK: mttr $4, $10, 1, 1, 0 # encoding: [0x41,0x84,0x50,0x21] - mttr $4, $10, 1, 2, 0 # CHECK: mttr $4, $10, 1, 2, 0 # encoding: [0x41,0x84,0x50,0x22] - mttr $4, $10, 1, 2, 1 # CHECK: mttr $4, $10, 1, 2, 1 # encoding: [0x41,0x84,0x50,0x32] - mttr $4, $26, 1, 3, 0 # CHECK: mttr $4, $26, 1, 3, 0 # encoding: [0x41,0x84,0xd0,0x23] - mttr $4, $31, 1, 3, 0 # CHECK: mttr $4, $ra, 1, 3, 0 # encoding: [0x41,0x84,0xf8,0x23] - mttr $4, $14, 1, 4, 0 # CHECK: mttr $4, $14, 1, 4, 0 # encoding: 
[0x41,0x84,0x70,0x24] - mttr $4, $15, 1, 5, 0 # CHECK: mttr $4, $15, 1, 5, 0 # encoding: [0x41,0x84,0x78,0x25] + dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] + dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1] + emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] + emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] + dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] + dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] + evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] + evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] + fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] + yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] + yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] diff --git a/test/MC/SystemZ/insn-bad-z13.s b/test/MC/SystemZ/insn-bad-z13.s index e9fac44aa883..0ccdd11cbe97 100644 --- a/test/MC/SystemZ/insn-bad-z13.s +++ b/test/MC/SystemZ/insn-bad-z13.s @@ -4,6 +4,19 @@ # RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch11 < %s 2> %t # RUN: FileCheck < %t %s +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: agh %r0, 0 + + agh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bi 0 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: bic 0, 0 + + bi 0 + bic 0, 0 + #CHECK: error: invalid operand #CHECK: cdpt %f0, 0(1), -1 #CHECK: error: invalid operand @@ -150,6 +163,16 @@ cxpt %f0, 0(-), 0 cxpt %f15, 0(1), 0 +#CHECK: error: instruction requires: insert-reference-bits-multiple +#CHECK: irbm %r0, %r0 + + irbm %r0, %r0 + +#CHECK: error: instruction requires: message-security-assist-extension8 +#CHECK: kma %r2, %r4, %r6 + + kma %r2, %r4, %r6 + #CHECK: error: invalid operand #CHECK: lcbb %r0, 0, -1 #CHECK: error: invalid operand @@ -167,6 +190,21 @@ lcbb %r0, 4096, 0 lcbb %r0, 0(%v1,%r2), 0 +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgg %r0, 0 + + lgg %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: lgsc %r0, 0 + + lgsc %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: llgfsg %r0, 0 + + llgfsg %r0, 0 + #CHECK: error: invalid operand #CHECK: llzrgf %r0, -524289 #CHECK: error: invalid operand @@ -249,6 +287,41 @@ lzrg %r0, -524289 lzrg %r0, 524288 +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mg %r0, 0 + + mg %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgh %r0, 0 + + mgh %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: mgrk %r0, %r0, %r0 + + mgrk %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msc %r0, 0 + + msc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgc %r0, 0 + + msgc %r0, 0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msrkc %r0, %r0, %r0 + + msrkc %r0, %r0, %r0 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: msgrkc %r0, %r0, %r0 + + msgrkc %r0, %r0, %r0 + #CHECK: error: invalid register pair #CHECK: ppno %r1, %r2 #CHECK: error: invalid register pair @@ -257,6 +330,21 @@ ppno %r1, %r2 ppno %r2, %r1 +#CHECK: error: instruction requires: message-security-assist-extension7 +#CHECK: prno %r2, %r4 + + prno %r2, %r4 + +#CHECK: error: instruction requires: miscellaneous-extensions-2 +#CHECK: sgh %r0, 0 + + sgh %r0, 0 + +#CHECK: error: instruction requires: guarded-storage +#CHECK: stgsc %r0, 0 + + stgsc %r0, 0 + 
#CHECK: error: invalid operand #CHECK: stocfh %r0, 0, -1 #CHECK: error: invalid operand @@ -274,6 +362,16 @@ stocfh %r0, 524288, 1 stocfh %r0, 0(%r1,%r2), 1 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vap %v0, %v0, %v0, 0, 0 + + vap %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vbperm %v0, %v0, %v0 + + vbperm %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vcdg %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -410,6 +508,35 @@ vclgdb %v0, %v0, -1, 0 vclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcp %v0, %v0, 0 + + vcp %v0, %v0, 0 + +#CHECK: vcvb %r0, %v0, 0 + + vcvb %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvbg %r0, %v0, 0 + + vcvbg %r0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvd %v0, %r0, 0, 0 + + vcvd %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vcvdg %v0, %r0, 0, 0 + + vcvdg %v0, %r0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vdp %v0, %v0, %v0, 0, 0 + + vdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: verim %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -828,6 +955,40 @@ vfaezhs %v0, %v0 vfaezhs %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfasb %v0, %v0, %v0 + + vfasb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfcesbs %v0, %v0, %v0 + + vfcesb %v0, %v0, %v0 + vfcesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchsbs %v0, %v0, %v0 + + vfchsb %v0, %v0, %v0 + vfchsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfchesbs %v0, %v0, %v0 + + vfchesb %v0, %v0, %v0 + vfchesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfdsb %v0, %v0, %v0 + + vfdsb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vfee %v0, %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1130,6 +1291,152 @@ vfidb %v0, %v0, -1, 0 vfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfisb %v0, %v0, 0, 0 + + vfisb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkesbs %v0, %v0, %v0 + + vfkedb %v0, %v0, %v0 + vfkedbs %v0, %v0, %v0 + vfkesb %v0, %v0, %v0 + vfkesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhsbs %v0, %v0, %v0 + + vfkhdb %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v0 + vfkhsb %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedb 
%v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfkhesbs %v0, %v0, %v0 + + vfkhedb %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v0 + vfkhesb %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflpsb %v0, %v0 + + vfpsosb %v0, %v0, 0 + vflcsb %v0, %v0 + vflnsb %v0, %v0 + vflpsb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfll %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflls %v0, %v0 + + vfll %v0, %v0, 0, 0 + vflls %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflr %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vflrd %v0, %v0, 0, 0 + + vflr %v0, %v0, 0, 0, 0 + vflrd %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmaxsb %v0, %v0, %v0, 0 + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmaxdb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfminsb %v0, %v0, %v0, 0 + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmindb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmasb %v0, %v0, %v0, %v0 + + vfmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmsb %v0, %v0, %v0 + + vfmsb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfmssb %v0, %v0, %v0, %v0 + + vfmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmasb %v0, %v0, %v0, %v0 + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnmadb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfnmssb %v0, %v0, %v0, %v0 + + vfnms %v0, %v0, %v0, %v0, 0, 0 + vfnmsdb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfssb %v0, %v0, %v0 + + vfssb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vfsqsb %v0, %v0 + + vfsqsb %v0, %v0 + #CHECK: error: invalid operand #CHECK: vftci %v0, %v0, 0, 0, -1 #CHECK: error: invalid operand @@ -1158,6 +1465,11 @@ vftcidb %v0, %v0, -1 vftcidb %v0, %v0, 4096 +#CHECK: 
error: instruction requires: vector-enhancements-1 +#CHECK: vftcisb %v0, %v0, 0 + + vftcisb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: vgbm %v0, -1 #CHECK: error: invalid operand @@ -1615,6 +1927,11 @@ vlgvh %r0, %v0, 4096 vlgvh %r0, %v0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlip %v0, 0, 0 + + vlip %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vll %v0, %r0, -1 #CHECK: error: invalid operand @@ -1687,6 +2004,11 @@ vllezh %v0, 4096 vllezh %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vllezlf %v0, 0 + + vllezlf %v0, 0 + #CHECK: error: invalid operand #CHECK: vlm %v0, %v0, -1 #CHECK: error: invalid operand @@ -1756,6 +2078,16 @@ vlreph %v0, 4096 vlreph %v0, 0(%v1,%r2) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrl %v0, 0, 0 + + vlrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vlrlr %v0, %r0, 0 + + vlrlr %v0, %r0, 0 + #CHECK: error: invalid operand #CHECK: vlvg %v0, %r0, 0, -1 #CHECK: error: invalid operand @@ -1817,6 +2149,39 @@ vlvgh %v0, %r0, 4096 vlvgh %v0, %r0, 0(%r0) +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vmp %v0, %v0, %v0, 0, 0 + + vmp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmslg %v0, %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vmsp %v0, %v0, %v0, 0, 0 + + vmsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnn %v0, %v0, %v0 + + vnn %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vnx %v0, %v0, %v0 + + vnx %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: voc %v0, %v0, %v0 + + voc %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: vpdi %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1825,6 +2190,30 @@ vpdi %v0, %v0, %v0, -1 vpdi %v0, %v0, %v0, 16 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpkz %v0, 0, 0 + + vpkz %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctf %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopctg %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: vpopcth %v0, %v0 + + vpopctb %v0, %v0 + vpopctf %v0, %v0 + vpopctg %v0, %v0 + vpopcth %v0, %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vpsop %v0, %v0, 0, 0, 0 + + vpsop %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vrep %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -1917,6 +2306,11 @@ vrepih %v0, -32769 vrepih %v0, 32768 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vrp %v0, %v0, %v0, 0, 0 + + vrp %v0, %v0, %v0, 0, 0 + #CHECK: error: vector index required #CHECK: vscef %v0, 0(%r1), 0 #CHECK: error: vector index required @@ -1957,6 +2351,11 @@ vsceg %v0, -1(%v0,%r1), 0 vsceg %v0, 4096(%v0,%r1), 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsdp %v0, %v0, %v0, 0, 0 + + vsdp %v0, %v0, %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: vsldb %v0, %v0, %v0, -1 #CHECK: error: invalid operand @@ -1965,6 +2364,16 @@ vsldb %v0, %v0, %v0, -1 vsldb %v0, %v0, %v0, 256 +#CHECK: 
error: instruction requires: vector-packed-decimal +#CHECK: vsp %v0, %v0, %v0, 0, 0 + + vsp %v0, %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vsrp %v0, %v0, 0, 0, 0 + + vsrp %v0, %v0, 0, 0, 0 + #CHECK: error: invalid operand #CHECK: vst %v0, -1 #CHECK: error: invalid operand @@ -2251,6 +2660,26 @@ vstrczhs %v0, %v0, %v0 vstrczhs %v0, %v0, %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrl %v0, 0, 0 + + vstrl %v0, 0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vstrlr %v0, %r0, 0 + + vstrlr %v0, %r0, 0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vtp %v0 + + vtp %v0 + +#CHECK: error: instruction requires: vector-packed-decimal +#CHECK: vupkz %v0, 0, 0 + + vupkz %v0, 0, 0 + #CHECK: error: invalid operand #CHECK: wcdgb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2307,6 +2736,72 @@ wclgdb %v0, %v0, -1, 0 wclgdb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfasb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfaxb %v0, %v0, %v0 + + wfasb %v0, %v0, %v0 + wfaxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcxb %v0, %v0 + + wfcsb %v0, %v0 + wfcxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfcexbs %v0, %v0, %v0 + + wfcesb %v0, %v0, %v0 + wfcesbs %v0, %v0, %v0 + wfcexb %v0, %v0, %v0 + wfcexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchxbs %v0, %v0, %v0 + + wfchsb %v0, %v0, %v0 + wfchsbs %v0, %v0, %v0 + wfchxb %v0, %v0, %v0 + wfchxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfchexbs %v0, %v0, %v0 + + wfchesb %v0, %v0, %v0 + wfchesbs %v0, %v0, %v0 + wfchexb %v0, %v0, %v0 + wfchexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfdxb %v0, %v0, %v0 + + wfdsb %v0, %v0, %v0 + wfdxb %v0, %v0, %v0 + #CHECK: error: invalid operand #CHECK: wfidb %v0, %v0, 0, -1 #CHECK: error: invalid operand @@ -2321,6 +2816,208 @@ wfidb %v0, %v0, -1, 0 wfidb %v0, %v0, 16, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfisb %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfixb %v0, %v0, 0, 0 + + wfisb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfksb %v0, %v0 +#CHECK: error: instruction requires: 
vector-enhancements-1 +#CHECK: wfkxb %v0, %v0 + + wfksb %v0, %v0 + wfkxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkexbs %v0, %v0, %v0 + + wfkedb %v0, %v0, %v0 + wfkedbs %v0, %v0, %v0 + wfkesb %v0, %v0, %v0 + wfkesbs %v0, %v0, %v0 + wfkexb %v0, %v0, %v0 + wfkexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhdbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhsbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhxbs %v0, %v0, %v0 + + wfkhdb %v0, %v0, %v0 + wfkhdbs %v0, %v0, %v0 + wfkhsb %v0, %v0, %v0 + wfkhsbs %v0, %v0, %v0 + wfkhxb %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhedbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhesbs %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfkhexbs %v0, %v0, %v0 + + wfkhedb %v0, %v0, %v0 + wfkhedbs %v0, %v0, %v0 + wfkhesb %v0, %v0, %v0 + wfkhesbs %v0, %v0, %v0 + wfkhexb %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsosb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfpsoxb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflcsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflcxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflnxb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflpxb %v0, %v0 + + wfpsosb %v0, %v0, 0 + wfpsoxb %v0, %v0, 0 + wflcsb %v0, %v0 + wflcxb %v0, %v0 + wflnsb %v0, %v0 + wflnxb %v0, %v0 + wflpsb %v0, %v0 + wflpxb %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflls %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflld %v0, %v0 + + wflls %v0, %v0 + wflld %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrd %v0, %v0, 0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wflrx %v0, %v0, 0, 0 + + wflrd %v0, %v0, 0, 0 + wflrx %v0, %v0, 0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxdb %v0, %v0, %v0, 0 +#CHECK: error: 
instruction requires: vector-enhancements-1 +#CHECK: wfmaxsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxxb %v0, %v0, %v0, 0 + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxsb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmindb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminsb %v0, %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfminxb %v0, %v0, %v0, 0 + + wfmindb %v0, %v0, %v0, 0 + wfminsb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmaxb %v0, %v0, %v0, %v0 + + wfmasb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmxb %v0, %v0, %v0 + + wfmsb %v0, %v0, %v0 + wfmxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfmsxb %v0, %v0, %v0, %v0 + + wfmssb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmadb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmasb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 + + wfnmadb %v0, %v0, %v0, %v0 + wfnmasb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsdb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmssb %v0, %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmssb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfssb %v0, %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsxb %v0, %v0, %v0 + + wfssb %v0, %v0, %v0 + wfsxb %v0, %v0, %v0 + +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqsb %v0, %v0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wfsqxb %v0, %v0 + + wfsqsb %v0, %v0 + wfsqxb %v0, %v0 + #CHECK: error: invalid operand #CHECK: wftcidb %v0, %v0, -1 #CHECK: error: invalid operand @@ -2329,6 +3026,14 @@ wftcidb %v0, %v0, -1 wftcidb %v0, %v0, 4096 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcisb %v0, %v0, 0 +#CHECK: error: instruction requires: vector-enhancements-1 +#CHECK: wftcixb %v0, %v0, 0 + + wftcisb %v0, %v0, 0 + wftcixb %v0, %v0, 0 + #CHECK: error: invalid operand #CHECK: wledb %v0, %v0, 0, -1 #CHECK: error: invalid operand diff --git a/test/MC/SystemZ/insn-bad-z14.s b/test/MC/SystemZ/insn-bad-z14.s new file mode 100644 index 000000000000..8bc736a7a1a4 --- /dev/null +++ b/test/MC/SystemZ/insn-bad-z14.s @@ -0,0 +1,752 @@ +# For z14 only. 
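+#
+# Each block below pairs the expected diagnostics with the offending inputs:
+# "not" inverts llvm-mc's exit status, and the captured stderr is matched
+# against the expectations. A hypothetical one-liner to reproduce a single
+# diagnostic, assuming an llvm-mc build is on PATH:
+#   echo 'bi 524288' | llvm-mc -triple s390x-linux-gnu -mcpu=z14
+# This is rejected with "error: invalid operand", since bi takes a signed
+# 20-bit displacement (-524288..524287).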
+# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=z14 < %s 2> %t +# RUN: FileCheck < %t %s +# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch12 < %s 2> %t +# RUN: FileCheck < %t %s + +#CHECK: error: invalid operand +#CHECK: bi -524289 +#CHECK: error: invalid operand +#CHECK: bi 524288 + + bi -524289 + bi 524288 + +#CHECK: error: invalid operand +#CHECK: bic -1, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 16, 0(%r1) +#CHECK: error: invalid operand +#CHECK: bic 0, -524289 +#CHECK: error: invalid operand +#CHECK: bic 0, 524288 + + bic -1, 0(%r1) + bic 16, 0(%r1) + bic 0, -524289 + bic 0, 524288 + +#CHECK: error: invalid operand +#CHECK: agh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: agh %r0, 524288 + + agh %r0, -524289 + agh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: kma %r1, %r2, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r1, %r4 +#CHECK: error: invalid register pair +#CHECK: kma %r2, %r4, %r1 + + kma %r1, %r2, %r4 + kma %r2, %r1, %r4 + kma %r2, %r4, %r1 + +#CHECK: error: invalid operand +#CHECK: lgg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgg %r0, 524288 + + lgg %r0, -524289 + lgg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: lgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: lgsc %r0, 524288 + + lgsc %r0, -524289 + lgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: llgfsg %r0, 524288 + + llgfsg %r0, -524289 + llgfsg %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: mg %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mg %r0, 524288 +#CHECK: error: invalid register pair +#CHECK: mg %r1, 0 + + mg %r0, -524289 + mg %r0, 524288 + mg %r1, 0 + +#CHECK: error: invalid operand +#CHECK: mgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: mgh %r0, 524288 + + mgh %r0, -524289 + mgh %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: mgrk %r1, %r0, %r0 + + mgrk %r1, %r0, %r0 + +#CHECK: error: invalid operand +#CHECK: msc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msc %r0, 524288 + + msc %r0, -524289 + msc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: msgc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: msgc %r0, 524288 + + msgc %r0, -524289 + msgc %r0, 524288 + +#CHECK: error: invalid register pair +#CHECK: prno %r1, %r2 +#CHECK: error: invalid register pair +#CHECK: prno %r2, %r1 + + prno %r1, %r2 + prno %r2, %r1 + +#CHECK: error: invalid operand +#CHECK: sgh %r0, -524289 +#CHECK: error: invalid operand +#CHECK: sgh %r0, 524288 + + sgh %r0, -524289 + sgh %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: stgsc %r0, -524289 +#CHECK: error: invalid operand +#CHECK: stgsc %r0, 524288 + + stgsc %r0, -524289 + stgsc %r0, 524288 + +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vap %v0, %v0, %v0, 256, 0 + + vap %v0, %v0, %v0, 0, -1 + vap %v0, %v0, %v0, 0, 16 + vap %v0, %v0, %v0, -1, 0 + vap %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcp %v0, %v0, 16 + + vcp %v0, %v0, -1 + vcp %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vcvb %r0, %v0, 16 + + vcvb %r0, %v0, -1 + vcvb %r0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvbg %r0, %v0, -1 +#CHECK: error: 
invalid operand +#CHECK: vcvbg %r0, %v0, 16 + + vcvbg %r0, %v0, -1 + vcvbg %r0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvd %r0, %v0, 256, 0 + + vcvd %r0, %v0, 0, -1 + vcvd %r0, %v0, 0, 16 + vcvd %r0, %v0, -1, 0 + vcvd %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vcvdg %r0, %v0, 256, 0 + + vcvdg %r0, %v0, 0, -1 + vcvdg %r0, %v0, 0, 16 + vcvdg %r0, %v0, -1, 0 + vcvdg %r0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vdp %v0, %v0, %v0, 256, 0 + + vdp %v0, %v0, %v0, 0, -1 + vdp %v0, %v0, %v0, 0, 16 + vdp %v0, %v0, %v0, -1, 0 + vdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfisb %v0, %v0, 16, 0 + + vfisb %v0, %v0, 0, -1 + vfisb %v0, %v0, 0, 16 + vfisb %v0, %v0, -1, 0 + vfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfll %v0, %v0, 16, 0 + + vfll %v0, %v0, 0, -1 + vfll %v0, %v0, 0, 16 + vfll %v0, %v0, -1, 0 + vfll %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vflr %v0, %v0, 16, 0, 0 + + vflr %v0, %v0, 0, 0, -1 + vflr %v0, %v0, 0, 0, 16 + vflr %v0, %v0, 0, -1, 0 + vflr %v0, %v0, 0, 16, 0 + vflr %v0, %v0, -1, 0, 0 + vflr %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vflrd %v0, %v0, 16, 0 + + vflrd %v0, %v0, 0, -1 + vflrd %v0, %v0, 0, 16 + vflrd %v0, %v0, -1, 0 + vflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmax %v0, %v0, %v0, 16, 0, 0 + + vfmax %v0, %v0, %v0, 0, 0, -1 + vfmax %v0, %v0, %v0, 0, 0, 16 + vfmax %v0, %v0, %v0, 0, -1, 0 + vfmax %v0, %v0, %v0, 0, 16, 0 + vfmax %v0, %v0, %v0, -1, 0, 0 + vfmax %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxdb %v0, %v0, %v0, 16 + + vfmaxdb %v0, %v0, %v0, -1 + 
vfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmaxsb %v0, %v0, %v0, 16 + + vfmaxsb %v0, %v0, %v0, -1 + vfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 0, 16, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vfmin %v0, %v0, %v0, 16, 0, 0 + + vfmin %v0, %v0, %v0, 0, 0, -1 + vfmin %v0, %v0, %v0, 0, 0, 16 + vfmin %v0, %v0, %v0, 0, -1, 0 + vfmin %v0, %v0, %v0, 0, 16, 0 + vfmin %v0, %v0, %v0, -1, 0, 0 + vfmin %v0, %v0, %v0, 16, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfmindb %v0, %v0, %v0, 16 + + vfmindb %v0, %v0, %v0, -1 + vfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vfminsb %v0, %v0, %v0, 16 + + vfminsb %v0, %v0, %v0, -1 + vfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnma %v0, %v0, %v0, %v0, 16, 0 + + vfnma %v0, %v0, %v0, %v0, 0, -1 + vfnma %v0, %v0, %v0, %v0, 0, 16 + vfnma %v0, %v0, %v0, %v0, -1, 0 + vfnma %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vfnms %v0, %v0, %v0, %v0, 16, 0 + + vfnms %v0, %v0, %v0, %v0, 0, -1 + vfnms %v0, %v0, %v0, %v0, 0, 16 + vfnms %v0, %v0, %v0, %v0, -1, 0 + vfnms %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vftcisb %v0, %v0, 4096 + + vftcisb %v0, %v0, -1 + vftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vlip %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlip %v0, 65536, 0 + + vlip %v0, 0, -1 + vlip %v0, 0, 16 + vlip %v0, -1, 0 + vlip %v0, 65536, 0 + +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, -1 +#CHECK: error: invalid operand +#CHECK: vllezlf %v0, 4096 +#CHECK: error: invalid use of vector addressing +#CHECK: vllezlf %v0, 0(%v1,%r2) + + vllezlf %v0, -1 + vllezlf %v0, 4096 + vllezlf %v0, 0(%v1,%r2) + +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vlrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vlrl %v0, 0(%r0), 0 + + vlrl %v0, 0, -1 + vlrl %v0, 0, 256 + vlrl %v0, -1, 0 + vlrl %v0, 4096, 0 + vlrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vlrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vlrlr %v0, %r0, 0(%r0) + + vlrlr %v0, %r0, -1 + vlrlr %v0, %r0, 4096 + vlrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand 
+#CHECK: vmp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmp %v0, %v0, %v0, 256, 0 + + vmp %v0, %v0, %v0, 0, -1 + vmp %v0, %v0, %v0, 0, 16 + vmp %v0, %v0, %v0, -1, 0 + vmp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmsp %v0, %v0, %v0, 256, 0 + + vmsp %v0, %v0, %v0, 0, -1 + vmsp %v0, %v0, %v0, 0, 16 + vmsp %v0, %v0, %v0, -1, 0 + vmsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vmsl %v0, %v0, %v0, %v0, 16, 0 + + vmsl %v0, %v0, %v0, %v0, 0, -1 + vmsl %v0, %v0, %v0, %v0, 0, 16 + vmsl %v0, %v0, %v0, %v0, -1, 0 + vmsl %v0, %v0, %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: vmslg %v0, %v0, %v0, %v0, 16 + + vmslg %v0, %v0, %v0, %v0, -1 + vmslg %v0, %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpkz %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vpkz %v0, 0(%r0), 0 + + vpkz %v0, 0, -1 + vpkz %v0, 0, 256 + vpkz %v0, -1, 0 + vpkz %v0, 4096, 0 + vpkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vpsop %v0, %v0, 256, 0, 0 + + vpsop %v0, %v0, 0, 0, -1 + vpsop %v0, %v0, 0, 0, 16 + vpsop %v0, %v0, 0, -1, 0 + vpsop %v0, %v0, 0, 256, 0 + vpsop %v0, %v0, -1, 0, 0 + vpsop %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vrp %v0, %v0, %v0, 256, 0 + + vrp %v0, %v0, %v0, 0, -1 + vrp %v0, %v0, %v0, 0, 16 + vrp %v0, %v0, %v0, -1, 0 + vrp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsdp %v0, %v0, %v0, 256, 0 + + vsdp %v0, %v0, %v0, 0, -1 + vsdp %v0, %v0, %v0, 0, 16 + vsdp %v0, %v0, %v0, -1, 0 + vsdp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsp %v0, %v0, %v0, 256, 0 + + vsp %v0, %v0, %v0, 0, -1 + vsp %v0, %v0, %v0, 0, 16 + vsp %v0, %v0, %v0, -1, 0 + vsp %v0, %v0, %v0, 256, 0 + +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, -1 +#CHECK: error: 
invalid operand +#CHECK: vsrp %v0, %v0, 0, 0, 16 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 0, 256, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, -1, 0, 0 +#CHECK: error: invalid operand +#CHECK: vsrp %v0, %v0, 256, 0, 0 + + vsrp %v0, %v0, 0, 0, -1 + vsrp %v0, %v0, 0, 0, 16 + vsrp %v0, %v0, 0, -1, 0 + vsrp %v0, %v0, 0, 256, 0 + vsrp %v0, %v0, -1, 0, 0 + vsrp %v0, %v0, 256, 0, 0 + +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vstrl %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vstrl %v0, 0(%r0), 0 + + vstrl %v0, 0, -1 + vstrl %v0, 0, 256 + vstrl %v0, -1, 0 + vstrl %v0, 4096, 0 + vstrl %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, -1 +#CHECK: error: invalid operand +#CHECK: vstrlr %v0, %r0, 4096 +#CHECK: error: %r0 used in an address +#CHECK: vstrlr %v0, %r0, 0(%r0) + + vstrlr %v0, %r0, -1 + vstrlr %v0, %r0, 4096 + vstrlr %v0, %r0, 0(%r0) + +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 0, 256 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: vupkz %v0, 4096, 0 +#CHECK: error: %r0 used in an address +#CHECK: vupkz %v0, 0(%r0), 0 + + vupkz %v0, 0, -1 + vupkz %v0, 0, 256 + vupkz %v0, -1, 0 + vupkz %v0, 4096, 0 + vupkz %v0, 0(%r0), 0 + +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfisb %v0, %v0, 16, 0 + + wfisb %v0, %v0, 0, -1 + wfisb %v0, %v0, 0, 16 + wfisb %v0, %v0, -1, 0 + wfisb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wfixb %v0, %v0, 16, 0 + + wfixb %v0, %v0, 0, -1 + wfixb %v0, %v0, 0, 16 + wfixb %v0, %v0, -1, 0 + wfixb %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrd %v0, %v0, 16, 0 + + wflrd %v0, %v0, 0, -1 + wflrd %v0, %v0, 0, 16 + wflrd %v0, %v0, -1, 0 + wflrd %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, -1 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 0, 16 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, -1, 0 +#CHECK: error: invalid operand +#CHECK: wflrx %v0, %v0, 16, 0 + + wflrx %v0, %v0, 0, -1 + wflrx %v0, %v0, 0, 16 + wflrx %v0, %v0, -1, 0 + wflrx %v0, %v0, 16, 0 + +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxdb %v0, %v0, %v0, 16 + + wfmaxdb %v0, %v0, %v0, -1 + wfmaxdb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxsb %v0, %v0, %v0, 16 + + wfmaxsb %v0, %v0, %v0, -1 + wfmaxsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmaxxb %v0, %v0, %v0, 16 + + wfmaxxb %v0, %v0, %v0, -1 + wfmaxxb %v0, %v0, %v0, 16 + 
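+# (The surrounding blocks follow the usual boundary-value convention: each
+# immediate field is probed one step outside its legal range on either side,
+# e.g. -1 and 16 for a 4-bit mask field, -1 and 4096 for a 12-bit unsigned
+# displacement, and -524289/524288 for a signed 20-bit displacement.)
+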
+#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfmindb %v0, %v0, %v0, 16 + + wfmindb %v0, %v0, %v0, -1 + wfmindb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminsb %v0, %v0, %v0, 16 + + wfminsb %v0, %v0, %v0, -1 + wfminsb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wfminxb %v0, %v0, %v0, 16 + + wfminxb %v0, %v0, %v0, -1 + wfminxb %v0, %v0, %v0, 16 + +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcisb %v0, %v0, 4096 + + wftcisb %v0, %v0, -1 + wftcisb %v0, %v0, 4096 + +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, -1 +#CHECK: error: invalid operand +#CHECK: wftcixb %v0, %v0, 4096 + + wftcixb %v0, %v0, -1 + wftcixb %v0, %v0, 4096 + diff --git a/test/MC/SystemZ/insn-good-z14.s b/test/MC/SystemZ/insn-good-z14.s new file mode 100644 index 000000000000..1fcdcb4ccab0 --- /dev/null +++ b/test/MC/SystemZ/insn-good-z14.s @@ -0,0 +1,2674 @@ +# For z14 and above. +# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z14 -show-encoding %s \ +# RUN: | FileCheck %s +# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch12 -show-encoding %s \ +# RUN: | FileCheck %s + +#CHECK: agh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x38] +#CHECK: agh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x38] +#CHECK: agh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x38] +#CHECK: agh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x38] +#CHECK: agh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x38] +#CHECK: agh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x38] +#CHECK: agh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x38] +#CHECK: agh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x38] +#CHECK: agh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x38] +#CHECK: agh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x38] + + agh %r0, -524288 + agh %r0, -1 + agh %r0, 0 + agh %r0, 1 + agh %r0, 524287 + agh %r0, 0(%r1) + agh %r0, 0(%r15) + agh %r0, 524287(%r1,%r15) + agh %r0, 524287(%r15,%r1) + agh %r15, 0 + +#CHECK: bi -524288 # encoding: [0xe3,0xf0,0x00,0x00,0x80,0x47] +#CHECK: bi -1 # encoding: [0xe3,0xf0,0x0f,0xff,0xff,0x47] +#CHECK: bi 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] +#CHECK: bi 1 # encoding: [0xe3,0xf0,0x00,0x01,0x00,0x47] +#CHECK: bi 524287 # encoding: [0xe3,0xf0,0x0f,0xff,0x7f,0x47] +#CHECK: bi 0(%r1) # encoding: [0xe3,0xf0,0x10,0x00,0x00,0x47] +#CHECK: bi 0(%r15) # encoding: [0xe3,0xf0,0xf0,0x00,0x00,0x47] +#CHECK: bi 524287(%r1,%r15) # encoding: [0xe3,0xf1,0xff,0xff,0x7f,0x47] +#CHECK: bi 524287(%r15,%r1) # encoding: [0xe3,0xff,0x1f,0xff,0x7f,0x47] + + bi -524288 + bi -1 + bi 0 + bi 1 + bi 524287 + bi 0(%r1) + bi 0(%r15) + bi 524287(%r1,%r15) + bi 524287(%r15,%r1) + +#CHECK: bic 0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x47] +#CHECK: bic 0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x47] +#CHECK: bic 0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x47] +#CHECK: bic 0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x47] +#CHECK: bic 0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x47] +#CHECK: bic 0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x47] +#CHECK: bic 0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x47] +#CHECK: bic 0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x47] +#CHECK: bic 0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x47] +#CHECK: 
bic 15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x47] + + bic 0, -524288 + bic 0, -1 + bic 0, 0 + bic 0, 1 + bic 0, 524287 + bic 0, 0(%r1) + bic 0, 0(%r15) + bic 0, 524287(%r1,%r15) + bic 0, 524287(%r15,%r1) + bic 15, 0 + +#CHECK: bic 1, 0(%r7) # encoding: [0xe3,0x10,0x70,0x00,0x00,0x47] +#CHECK: bio 0(%r15) # encoding: [0xe3,0x10,0xf0,0x00,0x00,0x47] + + bic 1, 0(%r7) + bio 0(%r15) + +#CHECK: bic 2, 0(%r7) # encoding: [0xe3,0x20,0x70,0x00,0x00,0x47] +#CHECK: bih 0(%r15) # encoding: [0xe3,0x20,0xf0,0x00,0x00,0x47] + + bic 2, 0(%r7) + bih 0(%r15) + +#CHECK: bic 3, 0(%r7) # encoding: [0xe3,0x30,0x70,0x00,0x00,0x47] +#CHECK: binle 0(%r15) # encoding: [0xe3,0x30,0xf0,0x00,0x00,0x47] + + bic 3, 0(%r7) + binle 0(%r15) + +#CHECK: bic 4, 0(%r7) # encoding: [0xe3,0x40,0x70,0x00,0x00,0x47] +#CHECK: bil 0(%r15) # encoding: [0xe3,0x40,0xf0,0x00,0x00,0x47] + + bic 4, 0(%r7) + bil 0(%r15) + +#CHECK: bic 5, 0(%r7) # encoding: [0xe3,0x50,0x70,0x00,0x00,0x47] +#CHECK: binhe 0(%r15) # encoding: [0xe3,0x50,0xf0,0x00,0x00,0x47] + + bic 5, 0(%r7) + binhe 0(%r15) + +#CHECK: bic 6, 0(%r7) # encoding: [0xe3,0x60,0x70,0x00,0x00,0x47] +#CHECK: bilh 0(%r15) # encoding: [0xe3,0x60,0xf0,0x00,0x00,0x47] + + bic 6, 0(%r7) + bilh 0(%r15) + +#CHECK: bic 7, 0(%r7) # encoding: [0xe3,0x70,0x70,0x00,0x00,0x47] +#CHECK: bine 0(%r15) # encoding: [0xe3,0x70,0xf0,0x00,0x00,0x47] + + bic 7, 0(%r7) + bine 0(%r15) + +#CHECK: bic 8, 0(%r7) # encoding: [0xe3,0x80,0x70,0x00,0x00,0x47] +#CHECK: bie 0(%r15) # encoding: [0xe3,0x80,0xf0,0x00,0x00,0x47] + + bic 8, 0(%r7) + bie 0(%r15) + +#CHECK: bic 9, 0(%r7) # encoding: [0xe3,0x90,0x70,0x00,0x00,0x47] +#CHECK: binlh 0(%r15) # encoding: [0xe3,0x90,0xf0,0x00,0x00,0x47] + + bic 9, 0(%r7) + binlh 0(%r15) + +#CHECK: bic 10, 0(%r7) # encoding: [0xe3,0xa0,0x70,0x00,0x00,0x47] +#CHECK: bihe 0(%r15) # encoding: [0xe3,0xa0,0xf0,0x00,0x00,0x47] + + bic 10, 0(%r7) + bihe 0(%r15) + +#CHECK: bic 11, 0(%r7) # encoding: [0xe3,0xb0,0x70,0x00,0x00,0x47] +#CHECK: binl 0(%r15) # encoding: [0xe3,0xb0,0xf0,0x00,0x00,0x47] + + bic 11, 0(%r7) + binl 0(%r15) + +#CHECK: bic 12, 0(%r7) # encoding: [0xe3,0xc0,0x70,0x00,0x00,0x47] +#CHECK: bile 0(%r15) # encoding: [0xe3,0xc0,0xf0,0x00,0x00,0x47] + + bic 12, 0(%r7) + bile 0(%r15) + +#CHECK: bic 13, 0(%r7) # encoding: [0xe3,0xd0,0x70,0x00,0x00,0x47] +#CHECK: binh 0(%r15) # encoding: [0xe3,0xd0,0xf0,0x00,0x00,0x47] + + bic 13, 0(%r7) + binh 0(%r15) + +#CHECK: bic 14, 0(%r7) # encoding: [0xe3,0xe0,0x70,0x00,0x00,0x47] +#CHECK: bino 0(%r15) # encoding: [0xe3,0xe0,0xf0,0x00,0x00,0x47] + + bic 14, 0(%r7) + bino 0(%r15) + +#CHECK: irbm %r0, %r0 # encoding: [0xb9,0xac,0x00,0x00] +#CHECK: irbm %r0, %r15 # encoding: [0xb9,0xac,0x00,0x0f] +#CHECK: irbm %r15, %r0 # encoding: [0xb9,0xac,0x00,0xf0] +#CHECK: irbm %r7, %r8 # encoding: [0xb9,0xac,0x00,0x78] +#CHECK: irbm %r15, %r15 # encoding: [0xb9,0xac,0x00,0xff] + + irbm %r0,%r0 + irbm %r0,%r15 + irbm %r15,%r0 + irbm %r7,%r8 + irbm %r15,%r15 + +#CHECK: kma %r2, %r2, %r2 # encoding: [0xb9,0x29,0x20,0x22] +#CHECK: kma %r2, %r8, %r14 # encoding: [0xb9,0x29,0x80,0x2e] +#CHECK: kma %r14, %r8, %r2 # encoding: [0xb9,0x29,0x80,0xe2] +#CHECK: kma %r6, %r8, %r10 # encoding: [0xb9,0x29,0x80,0x6a] + + kma %r2, %r2, %r2 + kma %r2, %r8, %r14 + kma %r14, %r8, %r2 + kma %r6, %r8, %r10 + +#CHECK: lgg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4c] +#CHECK: lgg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4c] +#CHECK: lgg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4c] +#CHECK: lgg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4c] 
+#CHECK: lgg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4c] +#CHECK: lgg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4c] +#CHECK: lgg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4c] +#CHECK: lgg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4c] +#CHECK: lgg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x4c] + + lgg %r0, -524288 + lgg %r0, -1 + lgg %r0, 0 + lgg %r0, 1 + lgg %r0, 524287 + lgg %r0, 0(%r1) + lgg %r0, 0(%r15) + lgg %r0, 524287(%r1,%r15) + lgg %r0, 524287(%r15,%r1) + lgg %r15, 0 + +#CHECK: lgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x4d] +#CHECK: lgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x4d] +#CHECK: lgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x4d] +#CHECK: lgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x4d] +#CHECK: lgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x4d] +#CHECK: lgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x4d] + + lgsc %r0, -524288 + lgsc %r0, -1 + lgsc %r0, 0 + lgsc %r0, 1 + lgsc %r0, 524287 + lgsc %r0, 0(%r1) + lgsc %r0, 0(%r15) + lgsc %r0, 524287(%r1,%r15) + lgsc %r0, 524287(%r15,%r1) + +#CHECK: llgfsg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x48] +#CHECK: llgfsg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x48] +#CHECK: llgfsg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x48] +#CHECK: llgfsg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x48] +#CHECK: llgfsg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x48] +#CHECK: llgfsg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x48] +#CHECK: llgfsg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x48] + + llgfsg %r0, -524288 + llgfsg %r0, -1 + llgfsg %r0, 0 + llgfsg %r0, 1 + llgfsg %r0, 524287 + llgfsg %r0, 0(%r1) + llgfsg %r0, 0(%r15) + llgfsg %r0, 524287(%r1,%r15) + llgfsg %r0, 524287(%r15,%r1) + llgfsg %r15, 0 + +#CHECK: mg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x84] +#CHECK: mg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x84] +#CHECK: mg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x84] +#CHECK: mg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x84] +#CHECK: mg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x84] +#CHECK: mg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x84] +#CHECK: mg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x84] +#CHECK: mg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x84] +#CHECK: mg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x84] +#CHECK: mg %r14, 0 # encoding: [0xe3,0xe0,0x00,0x00,0x00,0x84] + + mg %r0, -524288 + mg %r0, -1 + mg %r0, 0 + mg %r0, 1 + mg %r0, 524287 + mg %r0, 0(%r1) + mg %r0, 0(%r15) + mg %r0, 524287(%r1,%r15) + mg %r0, 524287(%r15,%r1) + mg %r14, 0 + +#CHECK: mgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3c] +#CHECK: mgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3c] +#CHECK: mgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3c] +#CHECK: mgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3c] +#CHECK: mgh %r0, 524287 # encoding: 
[0xe3,0x00,0x0f,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3c] +#CHECK: mgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: mgh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3c] +#CHECK: mgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3c] +#CHECK: mgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3c] + + mgh %r0, -524288 + mgh %r0, -1 + mgh %r0, 0 + mgh %r0, 1 + mgh %r0, 524287 + mgh %r0, 0(%r1) + mgh %r0, 0(%r15) + mgh %r0, 524287(%r1,%r15) + mgh %r0, 524287(%r15,%r1) + mgh %r15, 0 + +#CHECK: mgrk %r0, %r0, %r0 # encoding: [0xb9,0xec,0x00,0x00] +#CHECK: mgrk %r0, %r0, %r15 # encoding: [0xb9,0xec,0xf0,0x00] +#CHECK: mgrk %r0, %r15, %r0 # encoding: [0xb9,0xec,0x00,0x0f] +#CHECK: mgrk %r14, %r0, %r0 # encoding: [0xb9,0xec,0x00,0xe0] +#CHECK: mgrk %r6, %r8, %r9 # encoding: [0xb9,0xec,0x90,0x68] + + mgrk %r0,%r0,%r0 + mgrk %r0,%r0,%r15 + mgrk %r0,%r15,%r0 + mgrk %r14,%r0,%r0 + mgrk %r6,%r8,%r9 + +#CHECK: msc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x53] +#CHECK: msc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x53] +#CHECK: msc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x53] +#CHECK: msc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x53] +#CHECK: msc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x53] +#CHECK: msc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x53] +#CHECK: msc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x53] +#CHECK: msc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x53] +#CHECK: msc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x53] +#CHECK: msc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x53] + + msc %r0, -524288 + msc %r0, -1 + msc %r0, 0 + msc %r0, 1 + msc %r0, 524287 + msc %r0, 0(%r1) + msc %r0, 0(%r15) + msc %r0, 524287(%r1,%r15) + msc %r0, 524287(%r15,%r1) + msc %r15, 0 + +#CHECK: msgc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x83] +#CHECK: msgc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x83] +#CHECK: msgc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x83] +#CHECK: msgc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x83] +#CHECK: msgc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x83] +#CHECK: msgc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x83] +#CHECK: msgc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x83] +#CHECK: msgc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x83] +#CHECK: msgc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x83] +#CHECK: msgc %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x83] + + msgc %r0, -524288 + msgc %r0, -1 + msgc %r0, 0 + msgc %r0, 1 + msgc %r0, 524287 + msgc %r0, 0(%r1) + msgc %r0, 0(%r15) + msgc %r0, 524287(%r1,%r15) + msgc %r0, 524287(%r15,%r1) + msgc %r15, 0 + +#CHECK: msrkc %r0, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0x00] +#CHECK: msrkc %r0, %r0, %r15 # encoding: [0xb9,0xfd,0xf0,0x00] +#CHECK: msrkc %r0, %r15, %r0 # encoding: [0xb9,0xfd,0x00,0x0f] +#CHECK: msrkc %r15, %r0, %r0 # encoding: [0xb9,0xfd,0x00,0xf0] +#CHECK: msrkc %r7, %r8, %r9 # encoding: [0xb9,0xfd,0x90,0x78] + + msrkc %r0,%r0,%r0 + msrkc %r0,%r0,%r15 + msrkc %r0,%r15,%r0 + msrkc %r15,%r0,%r0 + msrkc %r7,%r8,%r9 + +#CHECK: msgrkc %r0, %r0, %r0 # encoding: [0xb9,0xed,0x00,0x00] +#CHECK: msgrkc %r0, %r0, %r15 # encoding: [0xb9,0xed,0xf0,0x00] +#CHECK: msgrkc %r0, %r15, %r0 # encoding: [0xb9,0xed,0x00,0x0f] +#CHECK: msgrkc %r15, %r0, %r0 # encoding: [0xb9,0xed,0x00,0xf0] +#CHECK: msgrkc %r7, %r8, %r9 # encoding: [0xb9,0xed,0x90,0x78] + + msgrkc %r0,%r0,%r0 + 
msgrkc %r0,%r0,%r15 + msgrkc %r0,%r15,%r0 + msgrkc %r15,%r0,%r0 + msgrkc %r7,%r8,%r9 + +#CHECK: prno %r2, %r2 # encoding: [0xb9,0x3c,0x00,0x22] +#CHECK: prno %r2, %r14 # encoding: [0xb9,0x3c,0x00,0x2e] +#CHECK: prno %r14, %r2 # encoding: [0xb9,0x3c,0x00,0xe2] +#CHECK: prno %r6, %r10 # encoding: [0xb9,0x3c,0x00,0x6a] + + prno %r2, %r2 + prno %r2, %r14 + prno %r14, %r2 + prno %r6, %r10 + +#CHECK: sgh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x39] +#CHECK: sgh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x39] +#CHECK: sgh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x39] +#CHECK: sgh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x39] +#CHECK: sgh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x39] +#CHECK: sgh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x39] +#CHECK: sgh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x39] +#CHECK: sgh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x39] +#CHECK: sgh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x39] +#CHECK: sgh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x39] + + sgh %r0, -524288 + sgh %r0, -1 + sgh %r0, 0 + sgh %r0, 1 + sgh %r0, 524287 + sgh %r0, 0(%r1) + sgh %r0, 0(%r15) + sgh %r0, 524287(%r1,%r15) + sgh %r0, 524287(%r15,%r1) + sgh %r15, 0 + +#CHECK: stgsc %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x49] +#CHECK: stgsc %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x49] +#CHECK: stgsc %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x49] +#CHECK: stgsc %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x49] +#CHECK: stgsc %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x49] +#CHECK: stgsc %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x49] +#CHECK: stgsc %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x49] +#CHECK: stgsc %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x49] + + stgsc %r0, -524288 + stgsc %r0, -1 + stgsc %r0, 0 + stgsc %r0, 1 + stgsc %r0, 524287 + stgsc %r0, 0(%r1) + stgsc %r0, 0(%r15) + stgsc %r0, 524287(%r1,%r15) + stgsc %r0, 524287(%r15,%r1) + +#CHECK: vap %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x71] +#CHECK: vap %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x71] +#CHECK: vap %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x71] +#CHECK: vap %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x71] +#CHECK: vap %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x71] +#CHECK: vap %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x71] + + vap %v0, %v0, %v0, 0, 0 + vap %v0, %v0, %v0, 0, 15 + vap %v0, %v0, %v0, 255, 0 + vap %v0, %v0, %v31, 0, 0 + vap %v0, %v31, %v0, 0, 0 + vap %v31, %v0, %v0, 0, 0 + vap %v13, %v17, %v21, 0x79, 11 + +#CHECK: vbperm %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x85] +#CHECK: vbperm %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x85] +#CHECK: vbperm %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x85] +#CHECK: vbperm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x85] +#CHECK: vbperm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x85] + + vbperm %v0, %v0, %v0 + vbperm %v0, %v0, %v15 + vbperm %v0, %v0, %v31 + vbperm %v0, %v15, 
%v0 + vbperm %v0, %v31, %v0 + vbperm %v15, %v0, %v0 + vbperm %v31, %v0, %v0 + vbperm %v18, %v3, %v20 + +#CHECK: vcp %v0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x77] +#CHECK: vcp %v0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x77] +#CHECK: vcp %v15, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x77] +#CHECK: vcp %v31, %v0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x77] +#CHECK: vcp %v0, %v15, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x77] +#CHECK: vcp %v0, %v31, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x77] +#CHECK: vcp %v3, %v18, 4 # encoding: [0xe6,0x03,0x20,0x40,0x02,0x77] + + vcp %v0, %v0, 0 + vcp %v0, %v0, 15 + vcp %v15, %v0, 0 + vcp %v31, %v0, 0 + vcp %v0, %v15, 0 + vcp %v0, %v31, 0 + vcp %v3, %v18, 4 + +#CHECK: vcvb %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x50] +#CHECK: vcvb %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vcvb %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vcvb %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x50] + + vcvb %r0, %v0, 0 + vcvb %r0, %v0, 15 + vcvb %r15, %v0, 0 + vcvb %r0, %v15, 0 + vcvb %r0, %v31, 0 + vcvb %r3, %v18, 4 + +#CHECK: vcvbg %r0, %v0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x52] +#CHECK: vcvbg %r15, %v0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x52] +#CHECK: vcvbg %r0, %v31, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x52] +#CHECK: vcvbg %r3, %v18, 4 # encoding: [0xe6,0x32,0x00,0x40,0x04,0x52] + + vcvbg %r0, %v0, 0 + vcvbg %r0, %v0, 15 + vcvbg %r15, %v0, 0 + vcvbg %r0, %v15, 0 + vcvbg %r0, %v31, 0 + vcvbg %r3, %v18, 4 + +#CHECK: vcvd %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x58] +#CHECK: vcvd %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x58] +#CHECK: vcvd %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x58] +#CHECK: vcvd %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x58] +#CHECK: vcvd %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x58] + + vcvd %v0, %r0, 0, 0 + vcvd %v0, %r0, 0, 15 + vcvd %v0, %r0, 255, 0 + vcvd %v0, %r15, 0, 0 + vcvd %v15, %r0, 0, 0 + vcvd %v31, %r0, 0, 0 + vcvd %v18, %r9, 0x34, 11 + +#CHECK: vcvdg %v0, %r0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5a] +#CHECK: vcvdg %v0, %r0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5a] +#CHECK: vcvdg %v0, %r15, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v15, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x5a] +#CHECK: vcvdg %v31, %r0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5a] +#CHECK: vcvdg %v18, %r9, 52, 11 # encoding: [0xe6,0x29,0x00,0xb3,0x48,0x5a] + + vcvdg %v0, %r0, 0, 0 + vcvdg %v0, %r0, 0, 15 + vcvdg %v0, %r0, 255, 0 + vcvdg %v0, %r15, 0, 0 + vcvdg %v15, %r0, 0, 0 + vcvdg %v31, %r0, 0, 0 + vcvdg %v18, %r9, 0x34, 11 + +#CHECK: vdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7a] +#CHECK: vdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7a] +#CHECK: vdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7a] +#CHECK: vdp 
%v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7a] +#CHECK: vdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7a] +#CHECK: vdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7a] + + vdp %v0, %v0, %v0, 0, 0 + vdp %v0, %v0, %v0, 0, 15 + vdp %v0, %v0, %v0, 255, 0 + vdp %v0, %v0, %v31, 0, 0 + vdp %v0, %v31, %v0, 0, 0 + vdp %v31, %v0, %v0, 0, 0 + vdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vfasb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe3] +#CHECK: vfasb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe3] +#CHECK: vfasb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe3] +#CHECK: vfasb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe3] +#CHECK: vfasb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe3] + + vfasb %v0, %v0, %v0 + vfasb %v0, %v0, %v31 + vfasb %v0, %v31, %v0 + vfasb %v31, %v0, %v0 + vfasb %v18, %v3, %v20 + +#CHECK: vfcesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe8] +#CHECK: vfcesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe8] +#CHECK: vfcesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe8] +#CHECK: vfcesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe8] +#CHECK: vfcesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe8] + + vfcesb %v0, %v0, %v0 + vfcesb %v0, %v0, %v31 + vfcesb %v0, %v31, %v0 + vfcesb %v31, %v0, %v0 + vfcesb %v18, %v3, %v20 + +#CHECK: vfcesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xe8] +#CHECK: vfcesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xe8] +#CHECK: vfcesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xe8] +#CHECK: vfcesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xe8] +#CHECK: vfcesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xe8] + + vfcesbs %v0, %v0, %v0 + vfcesbs %v0, %v0, %v31 + vfcesbs %v0, %v31, %v0 + vfcesbs %v31, %v0, %v0 + vfcesbs %v18, %v3, %v20 + +#CHECK: vfchsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xeb] +#CHECK: vfchsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xeb] +#CHECK: vfchsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xeb] +#CHECK: vfchsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xeb] +#CHECK: vfchsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xeb] + + vfchsb %v0, %v0, %v0 + vfchsb %v0, %v0, %v31 + vfchsb %v0, %v31, %v0 + vfchsb %v31, %v0, %v0 + vfchsb %v18, %v3, %v20 + +#CHECK: vfchsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xeb] +#CHECK: vfchsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x22,0xeb] +#CHECK: vfchsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xeb] +#CHECK: vfchsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xeb] +#CHECK: vfchsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xeb] + + vfchsbs %v0, %v0, %v0 + vfchsbs %v0, %v0, %v31 + vfchsbs %v0, %v31, %v0 + vfchsbs %v31, %v0, %v0 + vfchsbs %v18, %v3, %v20 + +#CHECK: vfchesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xea] +#CHECK: vfchesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xea] +#CHECK: vfchesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xea] +#CHECK: vfchesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xea] +#CHECK: vfchesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xea] + + vfchesb %v0, %v0, %v0 + vfchesb %v0, %v0, %v31 + vfchesb %v0, %v31, %v0 + vfchesb %v31, %v0, %v0 + vfchesb %v18, %v3, %v20 + +#CHECK: vfchesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xea] +#CHECK: vfchesbs %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0xf0,0x10,0x22,0xea] +#CHECK: vfchesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xea] +#CHECK: vfchesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xea] +#CHECK: vfchesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x2a,0xea] + + vfchesbs %v0, %v0, %v0 + vfchesbs %v0, %v0, %v31 + vfchesbs %v0, %v31, %v0 + vfchesbs %v31, %v0, %v0 + vfchesbs %v18, %v3, %v20 + +#CHECK: vfdsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe5] +#CHECK: vfdsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe5] +#CHECK: vfdsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe5] +#CHECK: vfdsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe5] +#CHECK: vfdsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe5] + + vfdsb %v0, %v0, %v0 + vfdsb %v0, %v0, %v31 + vfdsb %v0, %v31, %v0 + vfdsb %v31, %v0, %v0 + vfdsb %v18, %v3, %v20 + +#CHECK: vfisb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xc7] +#CHECK: vfisb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x07,0x20,0xc7] +#CHECK: vfisb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc7] +#CHECK: vfisb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc7] +#CHECK: vfisb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x24,0xc7] + + vfisb %v0, %v0, 0, 0 + vfisb %v0, %v0, 0, 15 + vfisb %v0, %v0, 4, 0 + vfisb %v0, %v0, 7, 0 + vfisb %v0, %v31, 0, 0 + vfisb %v31, %v0, 0, 0 + vfisb %v14, %v17, 4, 10 + +#CHECK: vfkedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xe8] +#CHECK: vfkedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xe8] +#CHECK: vfkedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xe8] +#CHECK: vfkedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xe8] +#CHECK: vfkedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xe8] + + vfkedb %v0, %v0, %v0 + vfkedb %v0, %v0, %v31 + vfkedb %v0, %v31, %v0 + vfkedb %v31, %v0, %v0 + vfkedb %v18, %v3, %v20 + +#CHECK: vfkedbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xe8] +#CHECK: vfkedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xe8] +#CHECK: vfkedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xe8] +#CHECK: vfkedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xe8] +#CHECK: vfkedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xe8] + + vfkedbs %v0, %v0, %v0 + vfkedbs %v0, %v0, %v31 + vfkedbs %v0, %v31, %v0 + vfkedbs %v31, %v0, %v0 + vfkedbs %v18, %v3, %v20 + +#CHECK: vfkesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xe8] +#CHECK: vfkesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xe8] +#CHECK: vfkesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xe8] +#CHECK: vfkesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xe8] +#CHECK: vfkesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xe8] + + vfkesb %v0, %v0, %v0 + vfkesb %v0, %v0, %v31 + vfkesb %v0, %v31, %v0 + vfkesb %v31, %v0, %v0 + vfkesb %v18, %v3, %v20 + +#CHECK: vfkesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xe8] +#CHECK: vfkesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xe8] +#CHECK: vfkesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xe8] +#CHECK: vfkesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xe8] +#CHECK: vfkesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xe8] + + vfkesbs %v0, %v0, %v0 + vfkesbs %v0, %v0, %v31 + vfkesbs %v0, %v31, %v0 
+ vfkesbs %v31, %v0, %v0 + vfkesbs %v18, %v3, %v20 + +#CHECK: vfkhdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xeb] +#CHECK: vfkhdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xeb] +#CHECK: vfkhdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xeb] +#CHECK: vfkhdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xeb] +#CHECK: vfkhdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xeb] + + vfkhdb %v0, %v0, %v0 + vfkhdb %v0, %v0, %v31 + vfkhdb %v0, %v31, %v0 + vfkhdb %v31, %v0, %v0 + vfkhdb %v18, %v3, %v20 + +#CHECK: vfkhdbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xeb] +#CHECK: vfkhdbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xeb] +#CHECK: vfkhdbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xeb] +#CHECK: vfkhdbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xeb] +#CHECK: vfkhdbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xeb] + + vfkhdbs %v0, %v0, %v0 + vfkhdbs %v0, %v0, %v31 + vfkhdbs %v0, %v31, %v0 + vfkhdbs %v31, %v0, %v0 + vfkhdbs %v18, %v3, %v20 + +#CHECK: vfkhsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xeb] +#CHECK: vfkhsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xeb] +#CHECK: vfkhsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xeb] +#CHECK: vfkhsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xeb] +#CHECK: vfkhsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xeb] + + vfkhsb %v0, %v0, %v0 + vfkhsb %v0, %v0, %v31 + vfkhsb %v0, %v31, %v0 + vfkhsb %v31, %v0, %v0 + vfkhsb %v18, %v3, %v20 + +#CHECK: vfkhsbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xeb] +#CHECK: vfkhsbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xeb] +#CHECK: vfkhsbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xeb] +#CHECK: vfkhsbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xeb] +#CHECK: vfkhsbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xeb] + + vfkhsbs %v0, %v0, %v0 + vfkhsbs %v0, %v0, %v31 + vfkhsbs %v0, %v31, %v0 + vfkhsbs %v31, %v0, %v0 + vfkhsbs %v18, %v3, %v20 + +#CHECK: vfkhedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xea] +#CHECK: vfkhedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x32,0xea] +#CHECK: vfkhedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x34,0xea] +#CHECK: vfkhedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x38,0xea] +#CHECK: vfkhedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x3a,0xea] + + vfkhedb %v0, %v0, %v0 + vfkhedb %v0, %v0, %v31 + vfkhedb %v0, %v31, %v0 + vfkhedb %v31, %v0, %v0 + vfkhedb %v18, %v3, %v20 + +#CHECK: vfkhedbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x30,0xea] +#CHECK: vfkhedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x32,0xea] +#CHECK: vfkhedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x34,0xea] +#CHECK: vfkhedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x38,0xea] +#CHECK: vfkhedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x3a,0xea] + + vfkhedbs %v0, %v0, %v0 + vfkhedbs %v0, %v0, %v31 + vfkhedbs %v0, %v31, %v0 + vfkhedbs %v31, %v0, %v0 + vfkhedbs %v18, %v3, %v20 + +#CHECK: vfkhesb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x04,0x20,0xea] +#CHECK: vfkhesb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x04,0x22,0xea] +#CHECK: vfkhesb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x04,0x24,0xea] +#CHECK: vfkhesb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x04,0x28,0xea] +#CHECK: vfkhesb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x04,0x2a,0xea] + + vfkhesb %v0, %v0, %v0 + vfkhesb %v0, %v0, %v31 + vfkhesb %v0, %v31, %v0 + 
vfkhesb %v31, %v0, %v0 + vfkhesb %v18, %v3, %v20 + +#CHECK: vfkhesbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x14,0x20,0xea] +#CHECK: vfkhesbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x14,0x22,0xea] +#CHECK: vfkhesbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x14,0x24,0xea] +#CHECK: vfkhesbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x14,0x28,0xea] +#CHECK: vfkhesbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x14,0x2a,0xea] + + vfkhesbs %v0, %v0, %v0 + vfkhesbs %v0, %v0, %v31 + vfkhesbs %v0, %v31, %v0 + vfkhesbs %v31, %v0, %v0 + vfkhesbs %v18, %v3, %v20 + +#CHECK: vfpsosb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x20,0xcc] +#CHECK: vfpsosb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x30,0x24,0xcc] +#CHECK: vfpsosb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x20,0xcc] +#CHECK: vfpsosb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x30,0x28,0xcc] +#CHECK: vfpsosb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x70,0x24,0xcc] + + vfpsosb %v0, %v0, 3 + vfpsosb %v0, %v0, 15 + vfpsosb %v0, %v15, 3 + vfpsosb %v0, %v31, 3 + vfpsosb %v15, %v0, 3 + vfpsosb %v31, %v0, 3 + vfpsosb %v14, %v17, 7 + +#CHECK: vflcsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcc] +#CHECK: vflcsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcc] +#CHECK: vflcsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xcc] +#CHECK: vflcsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcc] + + vflcsb %v0, %v0 + vflcsb %v0, %v15 + vflcsb %v0, %v31 + vflcsb %v15, %v0 + vflcsb %v31, %v0 + vflcsb %v14, %v17 + +#CHECK: vflnsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x10,0x24,0xcc] +#CHECK: vflnsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x20,0xcc] +#CHECK: vflnsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x28,0xcc] +#CHECK: vflnsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x10,0x24,0xcc] + + vflnsb %v0, %v0 + vflnsb %v0, %v15 + vflnsb %v0, %v31 + vflnsb %v15, %v0 + vflnsb %v31, %v0 + vflnsb %v14, %v17 + +#CHECK: vflpsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x20,0x24,0xcc] +#CHECK: vflpsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x20,0xcc] +#CHECK: vflpsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x28,0xcc] +#CHECK: vflpsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x20,0x24,0xcc] + + vflpsb %v0, %v0 + vflpsb %v0, %v15 + vflpsb %v0, %v31 + vflpsb %v15, %v0 + vflpsb %v31, %v0 + vflpsb %v14, %v17 + +#CHECK: vfll %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc4] +#CHECK: vfll %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xc4] +#CHECK: vfll %v0, %v15, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc4] +#CHECK: vfll %v15, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xc4] +#CHECK: vfll %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc4] +#CHECK: vfll %v14, %v17, 11, 9 # encoding: [0xe7,0xe1,0x00,0x09,0xb4,0xc4] + + vfll %v0, %v0, 0, 0 + vfll %v0, %v0, 15, 0 + vfll %v0, %v0, 0, 15 + vfll %v0, %v15, 0, 0 + 
vfll %v0, %v31, 0, 0 + vfll %v15, %v0, 0, 0 + vfll %v31, %v0, 0, 0 + vfll %v14, %v17, 11, 9 + +#CHECK: vflls %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc4] +#CHECK: vflls %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xc4] +#CHECK: vflls %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc4] +#CHECK: vflls %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xc4] + + vflls %v0, %v0 + vflls %v0, %v15 + vflls %v0, %v31 + vflls %v15, %v0 + vflls %v31, %v0 + vflls %v14, %v17 + +#CHECK: vflr %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc5] +#CHECK: vflr %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc5] +#CHECK: vflr %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x00,0xc5] +#CHECK: vflr %v0, %v0, 0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x00,0xc5] +#CHECK: vflr %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xc5] +#CHECK: vflr %v31, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xc5] +#CHECK: vflr %v14, %v17, 11, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0xb4,0xc5] + + vflr %v0, %v0, 0, 0, 0 + vflr %v0, %v0, 15, 0, 0 + vflr %v0, %v0, 0, 0, 15 + vflr %v0, %v0, 0, 4, 0 + vflr %v0, %v0, 0, 12, 0 + vflr %v0, %v31, 0, 0, 0 + vflr %v31, %v0, 0, 0, 0 + vflr %v14, %v17, 11, 4, 10 + +#CHECK: vflrd %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc5] +#CHECK: vflrd %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: vflrd %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc5] +#CHECK: vflrd %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc5] +#CHECK: vflrd %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc5] + + vflrd %v0, %v0, 0, 0 + vflrd %v0, %v0, 0, 15 + vflrd %v0, %v0, 4, 0 + vflrd %v0, %v0, 12, 0 + vflrd %v0, %v31, 0, 0 + vflrd %v31, %v0, 0, 0 + vflrd %v14, %v17, 4, 10 + +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xef] +#CHECK: vfmax %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xef] +#CHECK: vfmax %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xef] +#CHECK: vfmax %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xef] +#CHECK: vfmax %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xef] + + vfmax %v0, %v0, %v0, 0, 0, 0 + vfmax %v0, %v0, %v0, 15, 0, 0 + vfmax %v0, %v0, %v0, 0, 15, 0 + vfmax %v0, %v0, %v0, 0, 0, 4 + vfmax %v0, %v0, %v31, 0, 0, 0 + vfmax %v0, %v31, %v0, 0, 0, 0 + vfmax %v31, %v0, %v0, 0, 0, 0 + vfmax %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmaxdb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xef] +#CHECK: vfmaxdb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xef] +#CHECK: vfmaxdb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xef] +#CHECK: vfmaxdb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xef] +#CHECK: vfmaxdb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xef] + + 
vfmaxdb %v0, %v0, %v0, 0 + vfmaxdb %v0, %v0, %v0, 4 + vfmaxdb %v0, %v0, %v31, 0 + vfmaxdb %v0, %v31, %v0, 0 + vfmaxdb %v31, %v0, %v0, 0 + vfmaxdb %v18, %v3, %v20, 12 + +#CHECK: vfmaxsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xef] +#CHECK: vfmaxsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xef] +#CHECK: vfmaxsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xef] +#CHECK: vfmaxsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xef] +#CHECK: vfmaxsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xef] + + vfmaxsb %v0, %v0, %v0, 0 + vfmaxsb %v0, %v0, %v0, 4 + vfmaxsb %v0, %v0, %v31, 0 + vfmaxsb %v0, %v31, %v0, 0 + vfmaxsb %v31, %v0, %v0, 0 + vfmaxsb %v18, %v3, %v20, 12 + +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v0, 0, 0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x00,0xee] +#CHECK: vfmin %v0, %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xee] +#CHECK: vfmin %v0, %v31, %v0, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xee] +#CHECK: vfmin %v31, %v0, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xee] +#CHECK: vfmin %v18, %v3, %v20, 11, 9, 12 # encoding: [0xe7,0x23,0x40,0xc9,0xba,0xee] + + vfmin %v0, %v0, %v0, 0, 0, 0 + vfmin %v0, %v0, %v0, 15, 0, 0 + vfmin %v0, %v0, %v0, 0, 15, 0 + vfmin %v0, %v0, %v0, 0, 0, 4 + vfmin %v0, %v0, %v31, 0, 0, 0 + vfmin %v0, %v31, %v0, 0, 0, 0 + vfmin %v31, %v0, %v0, 0, 0, 0 + vfmin %v18, %v3, %v20, 11, 9, 12 + +#CHECK: vfmindb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x30,0xee] +#CHECK: vfmindb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xee] +#CHECK: vfmindb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xee] +#CHECK: vfmindb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xee] +#CHECK: vfmindb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x3a,0xee] + + vfmindb %v0, %v0, %v0, 0 + vfmindb %v0, %v0, %v0, 4 + vfmindb %v0, %v0, %v31, 0 + vfmindb %v0, %v31, %v0, 0 + vfmindb %v31, %v0, %v0, 0 + vfmindb %v18, %v3, %v20, 12 + +#CHECK: vfminsb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x40,0x20,0xee] +#CHECK: vfminsb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xee] +#CHECK: vfminsb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xee] +#CHECK: vfminsb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xee] +#CHECK: vfminsb %v18, %v3, %v20, 12 # encoding: [0xe7,0x23,0x40,0xc0,0x2a,0xee] + + vfminsb %v0, %v0, %v0, 0 + vfminsb %v0, %v0, %v0, 4 + vfminsb %v0, %v0, %v31, 0 + vfminsb %v0, %v31, %v0, 0 + vfminsb %v31, %v0, %v0, 0 + vfminsb %v18, %v3, %v20, 12 + +#CHECK: vfmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8f] +#CHECK: vfmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8f] +#CHECK: vfmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8f] +#CHECK: vfmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8f] +#CHECK: vfmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8f] +#CHECK: vfmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8f] + + vfmasb %v0, %v0, %v0, %v0 + vfmasb 
%v0, %v0, %v0, %v31 + vfmasb %v0, %v0, %v31, %v0 + vfmasb %v0, %v31, %v0, %v0 + vfmasb %v31, %v0, %v0, %v0 + vfmasb %v13, %v17, %v21, %v25 + +#CHECK: vfmsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe7] +#CHECK: vfmsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe7] +#CHECK: vfmsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe7] +#CHECK: vfmsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe7] +#CHECK: vfmsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe7] + + vfmsb %v0, %v0, %v0 + vfmsb %v0, %v0, %v31 + vfmsb %v0, %v31, %v0 + vfmsb %v31, %v0, %v0 + vfmsb %v18, %v3, %v20 + +#CHECK: vfmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8e] +#CHECK: vfmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8e] +#CHECK: vfmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8e] +#CHECK: vfmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8e] +#CHECK: vfmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8e] +#CHECK: vfmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x8e] + + vfmssb %v0, %v0, %v0, %v0 + vfmssb %v0, %v0, %v0, %v31 + vfmssb %v0, %v0, %v31, %v0 + vfmssb %v0, %v31, %v0, %v0 + vfmssb %v31, %v0, %v0, %v0 + vfmssb %v13, %v17, %v21, %v25 + +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9f] +#CHECK: vfnma %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9f] +#CHECK: vfnma %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9f] +#CHECK: vfnma %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9f] +#CHECK: vfnma %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9f] +#CHECK: vfnma %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9f] + + vfnma %v0, %v0, %v0, %v0, 0, 0 + vfnma %v0, %v0, %v0, %v0, 0, 15 + vfnma %v0, %v0, %v0, %v0, 15, 0 + vfnma %v0, %v0, %v0, %v31, 0, 0 + vfnma %v0, %v0, %v31, %v0, 0, 0 + vfnma %v0, %v31, %v0, %v0, 0, 0 + vfnma %v31, %v0, %v0, %v0, 0, 0 + vfnma %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmadb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9f] +#CHECK: vfnmadb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9f] +#CHECK: vfnmadb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9f] +#CHECK: vfnmadb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9f] +#CHECK: vfnmadb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9f] +#CHECK: vfnmadb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9f] + + vfnmadb %v0, %v0, %v0, %v0 + vfnmadb %v0, %v0, %v0, %v31 + vfnmadb %v0, %v0, %v31, %v0 + vfnmadb %v0, %v31, %v0, %v0 + vfnmadb %v31, %v0, %v0, %v0 + vfnmadb %v13, %v17, %v21, %v25 + +#CHECK: vfnmasb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9f] +#CHECK: vfnmasb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9f] +#CHECK: vfnmasb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9f] +#CHECK: vfnmasb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9f] +#CHECK: vfnmasb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9f] +#CHECK: vfnmasb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9f] + + vfnmasb %v0, %v0, %v0, %v0 + vfnmasb %v0, %v0, %v0, %v31 + vfnmasb %v0, %v0, %v31, %v0 + vfnmasb %v0, %v31, %v0, %v0 + vfnmasb %v31, 
%v0, %v0, %v0 + vfnmasb %v13, %v17, %v21, %v25 + +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x9e] +#CHECK: vfnms %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x9e] +#CHECK: vfnms %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x9e] +#CHECK: vfnms %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x9e] +#CHECK: vfnms %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x9e] +#CHECK: vfnms %v13, %v17, %v21, %v25, 9, 11 # encoding: [0xe7,0xd1,0x5b,0x09,0x97,0x9e] + + vfnms %v0, %v0, %v0, %v0, 0, 0 + vfnms %v0, %v0, %v0, %v0, 0, 15 + vfnms %v0, %v0, %v0, %v0, 15, 0 + vfnms %v0, %v0, %v0, %v31, 0, 0 + vfnms %v0, %v0, %v31, %v0, 0, 0 + vfnms %v0, %v31, %v0, %v0, 0, 0 + vfnms %v31, %v0, %v0, %v0, 0, 0 + vfnms %v13, %v17, %v21, %v25, 9, 11 + +#CHECK: vfnmsdb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0x9e] +#CHECK: vfnmsdb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x9e] +#CHECK: vfnmsdb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x9e] +#CHECK: vfnmsdb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x9e] +#CHECK: vfnmsdb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x9e] + + vfnmsdb %v0, %v0, %v0, %v0 + vfnmsdb %v0, %v0, %v0, %v31 + vfnmsdb %v0, %v0, %v31, %v0 + vfnmsdb %v0, %v31, %v0, %v0 + vfnmsdb %v31, %v0, %v0, %v0 + vfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: vfnmssb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x9e] +#CHECK: vfnmssb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x9e] +#CHECK: vfnmssb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x9e] +#CHECK: vfnmssb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x9e] +#CHECK: vfnmssb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x9e] +#CHECK: vfnmssb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0x9e] + + vfnmssb %v0, %v0, %v0, %v0 + vfnmssb %v0, %v0, %v0, %v31 + vfnmssb %v0, %v0, %v31, %v0 + vfnmssb %v0, %v31, %v0, %v0 + vfnmssb %v31, %v0, %v0, %v0 + vfnmssb %v13, %v17, %v21, %v25 + +#CHECK: vfssb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xe2] +#CHECK: vfssb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xe2] +#CHECK: vfssb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xe2] +#CHECK: vfssb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xe2] +#CHECK: vfssb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xe2] + + vfssb %v0, %v0, %v0 + vfssb %v0, %v0, %v31 + vfssb %v0, %v31, %v0 + vfssb %v31, %v0, %v0 + vfssb %v18, %v3, %v20 + +#CHECK: vfsqsb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xce] +#CHECK: vfsqsb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xce] +#CHECK: vfsqsb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xce] +#CHECK: vfsqsb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xce] + + vfsqsb %v0, %v0 + vfsqsb %v0, %v15 + vfsqsb %v0, %v31 + vfsqsb %v15, %v0 + vfsqsb %v31, %v0 + vfsqsb %v14, %v17 + +#CHECK: vftcisb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf0,0x20,0x4a] +#CHECK: vftcisb 
%v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x4a] +#CHECK: vftcisb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x4a] +#CHECK: vftcisb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x4a] +#CHECK: vftcisb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x80,0x24,0x4a] + + vftcisb %v0, %v0, 0 + vftcisb %v0, %v0, 4095 + vftcisb %v0, %v15, 0 + vftcisb %v0, %v31, 0 + vftcisb %v15, %v0, 0 + vftcisb %v31, %v0, 0 + vftcisb %v4, %v21, 0x678 + +#CHECK: vlip %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x49] +#CHECK: vlip %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x49] +#CHECK: vlip %v0, 65535, 0 # encoding: [0xe6,0x00,0xff,0xff,0x00,0x49] +#CHECK: vlip %v15, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x00,0x49] +#CHECK: vlip %v31, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x49] +#CHECK: vlip %v17, 4660, 7 # encoding: [0xe6,0x10,0x12,0x34,0x78,0x49] + + vlip %v0, 0, 0 + vlip %v0, 0, 15 + vlip %v0, 0xffff, 0 + vlip %v15, 0, 0 + vlip %v31, 0, 0 + vlip %v17, 0x1234, 7 + +#CHECK: vllezlf %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x60,0x04] +#CHECK: vllezlf %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x60,0x04] +#CHECK: vllezlf %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x60,0x04] +#CHECK: vllezlf %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x68,0x04] +#CHECK: vllezlf %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x68,0x04] + + vllezlf %v0, 0 + vllezlf %v0, 4095 + vllezlf %v0, 0(%r15) + vllezlf %v0, 0(%r15,%r1) + vllezlf %v15, 0 + vllezlf %v31, 0 + vllezlf %v18, 0x567(%r3,%r4) + +#CHECK: vlrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x35] +#CHECK: vlrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x35] +#CHECK: vlrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x35] +#CHECK: vlrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x35] +#CHECK: vlrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x35] +#CHECK: vlrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x35] + + vlrl %v0, 0, 0 + vlrl %v0, 4095, 0 + vlrl %v0, 0(%r15), 0 + vlrl %v0, 0, 255 + vlrl %v15, 0, 0 + vlrl %v31, 0, 0 + vlrl %v18, 1383(%r4), 3 + +#CHECK: vlrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x37] +#CHECK: vlrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x37] +#CHECK: vlrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x37] +#CHECK: vlrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x37] +#CHECK: vlrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x37] +#CHECK: vlrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x37] + + vlrlr %v0, %r0, 0 + vlrlr %v0, %r0, 4095 + vlrlr %v0, %r0, 0(%r15) + vlrlr %v0, %r15, 0 + vlrlr %v15, %r0, 0 + vlrlr %v31, %r0, 0 + vlrlr %v18, %r3, 1383(%r4) + +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v0, 0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xb8] +#CHECK: vmsl %v0, %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xb8] +#CHECK: vmsl %v0, %v0, %v15, %v0, 0, 0 # encoding: 
[0xe7,0x00,0xf0,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb8] +#CHECK: vmsl %v0, %v15, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v0, %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb8] +#CHECK: vmsl %v15, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xb8] +#CHECK: vmsl %v31, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 0, 4 # encoding: [0xe7,0x23,0x40,0x40,0x5a,0xb8] +#CHECK: vmsl %v18, %v3, %v20, %v5, 11, 8 # encoding: [0xe7,0x23,0x4b,0x80,0x5a,0xb8] + + vmsl %v0, %v0, %v0, %v0, 0, 0 + vmsl %v0, %v0, %v0, %v0, 15, 0 + vmsl %v0, %v0, %v0, %v0, 0, 12 + vmsl %v0, %v0, %v0, %v15, 0, 0 + vmsl %v0, %v0, %v0, %v31, 0, 0 + vmsl %v0, %v0, %v15, %v0, 0, 0 + vmsl %v0, %v0, %v31, %v0, 0, 0 + vmsl %v0, %v15, %v0, %v0, 0, 0 + vmsl %v0, %v31, %v0, %v0, 0, 0 + vmsl %v15, %v0, %v0, %v0, 0, 0 + vmsl %v31, %v0, %v0, %v0, 0, 0 + vmsl %v18, %v3, %v20, %v5, 0, 4 + vmsl %v18, %v3, %v20, %v5, 11, 8 + +#CHECK: vmslg %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x03,0xc0,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf0,0xb8] +#CHECK: vmslg %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0xb8] +#CHECK: vmslg %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0xb8] +#CHECK: vmslg %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0xb8] +#CHECK: vmslg %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x00,0xb8] +#CHECK: vmslg %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x43,0x40,0x5a,0xb8] +#CHECK: vmslg %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x43,0x80,0x5a,0xb8] + + vmslg %v0, %v0, %v0, %v0, 0 + vmslg %v0, %v0, %v0, %v0, 12 + vmslg %v0, %v0, %v0, %v15, 0 + vmslg %v0, %v0, %v0, %v31, 0 + vmslg %v0, %v0, %v15, %v0, 0 + vmslg %v0, %v0, %v31, %v0, 0 + vmslg %v0, %v15, %v0, %v0, 0 + vmslg %v0, %v31, %v0, %v0, 0 + vmslg %v15, %v0, %v0, %v0, 0 + vmslg %v31, %v0, %v0, %v0, 0 + vmslg %v18, %v3, %v20, %v5, 4 + vmslg %v18, %v3, %v20, %v5, 8 + +#CHECK: vmp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x78] +#CHECK: vmp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x78] +#CHECK: vmp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x78] +#CHECK: vmp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x78] +#CHECK: vmp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x78] +#CHECK: vmp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x78] + + vmp %v0, %v0, %v0, 0, 0 + vmp %v0, %v0, %v0, 0, 15 + vmp %v0, %v0, %v0, 255, 0 + vmp %v0, %v0, %v31, 0, 0 + vmp %v0, %v31, %v0, 0, 0 + vmp %v31, %v0, %v0, 0, 0 + vmp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vmsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x79] +#CHECK: vmsp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x79] +#CHECK: vmsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x79] +#CHECK: vmsp %v0, %v31, %v0, 0, 0 # encoding: 
[0xe6,0x0f,0x00,0x00,0x04,0x79] +#CHECK: vmsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x79] +#CHECK: vmsp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x79] + + vmsp %v0, %v0, %v0, 0, 0 + vmsp %v0, %v0, %v0, 0, 15 + vmsp %v0, %v0, %v0, 255, 0 + vmsp %v0, %v0, %v31, 0, 0 + vmsp %v0, %v31, %v0, 0, 0 + vmsp %v31, %v0, %v0, 0, 0 + vmsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vnn %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6e] +#CHECK: vnn %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6e] +#CHECK: vnn %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6e] +#CHECK: vnn %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6e] +#CHECK: vnn %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6e] + + vnn %v0, %v0, %v0 + vnn %v0, %v0, %v31 + vnn %v0, %v31, %v0 + vnn %v31, %v0, %v0 + vnn %v18, %v3, %v20 + +#CHECK: vnx %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6c] +#CHECK: vnx %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6c] +#CHECK: vnx %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6c] +#CHECK: vnx %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6c] +#CHECK: vnx %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6c] + + vnx %v0, %v0, %v0 + vnx %v0, %v0, %v31 + vnx %v0, %v31, %v0 + vnx %v31, %v0, %v0 + vnx %v18, %v3, %v20 + +#CHECK: voc %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6f] +#CHECK: voc %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6f] +#CHECK: voc %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6f] +#CHECK: voc %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6f] +#CHECK: voc %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6f] + + voc %v0, %v0, %v0 + voc %v0, %v0, %v31 + voc %v0, %v31, %v0 + voc %v31, %v0, %v0 + voc %v18, %v3, %v20 + +#CHECK: vpkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x34] +#CHECK: vpkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x34] +#CHECK: vpkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x34] +#CHECK: vpkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x34] +#CHECK: vpkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x34] +#CHECK: vpkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x34] + + vpkz %v0, 0, 0 + vpkz %v0, 4095, 0 + vpkz %v0, 0(%r15), 0 + vpkz %v0, 0, 255 + vpkz %v15, 0, 0 + vpkz %v31, 0, 0 + vpkz %v18, 1383(%r4), 3 + +#CHECK: vpopctb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x50] +#CHECK: vpopctb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x50] +#CHECK: vpopctb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x50] +#CHECK: vpopctb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x50] + + vpopctb %v0, %v0 + vpopctb %v0, %v15 + vpopctb %v0, %v31 + vpopctb %v15, %v0 + vpopctb %v31, %v0 + vpopctb %v14, %v17 + +#CHECK: vpopctf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x50] +#CHECK: vpopctf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x50] +#CHECK: vpopctf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x50] +#CHECK: vpopctf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x50] + + vpopctf %v0, %v0 + vpopctf %v0, %v15 + vpopctf %v0, %v31 + vpopctf %v15, %v0 + vpopctf %v31, %v0 + vpopctf %v14, %v17 + 
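+# Note: every vector instruction checked in this file reserves the low
+# nibble of byte 4 as the RXB field. A 4-bit register field can only name
+# %v0-%v15 directly; the four RXB bits supply the missing high bit for the
+# vector-register fields in instruction bits 8-11, 12-15, 16-19 and 32-35.
+# For example, vpopctf %v15, %v0 above encodes byte 4 as 0x20, while
+# vpopctf %v31, %v0 sets RXB bit 0 and encodes it as 0x28. The wf* scalar
+# forms tested further below likewise accept %f0-%f15 as synonyms for
+# %v0-%v15 (the floating-point registers overlap the low vector registers),
+# and the checked output prints those operands with the %f names.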
+#CHECK: vpopctg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x50] +#CHECK: vpopctg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x50] +#CHECK: vpopctg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x50] +#CHECK: vpopctg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0x50] + + vpopctg %v0, %v0 + vpopctg %v0, %v15 + vpopctg %v0, %v31 + vpopctg %v15, %v0 + vpopctg %v31, %v0 + vpopctg %v14, %v17 + +#CHECK: vpopcth %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x50] +#CHECK: vpopcth %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x50] +#CHECK: vpopcth %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x50] +#CHECK: vpopcth %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x50] + + vpopcth %v0, %v0 + vpopcth %v0, %v15 + vpopcth %v0, %v31 + vpopcth %v15, %v0 + vpopcth %v31, %v0 + vpopcth %v14, %v17 + +#CHECK: vpsop %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x5b] +#CHECK: vpsop %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x5b] +#CHECK: vpsop %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5b] +#CHECK: vpsop %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x5b] +#CHECK: vpsop %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x5b] + + vpsop %v0, %v0, 0, 0, 0 + vpsop %v0, %v0, 0, 0, 15 + vpsop %v0, %v0, 0, 255, 0 + vpsop %v0, %v0, 255, 0, 0 + vpsop %v0, %v31, 0, 0, 0 + vpsop %v31, %v0, 0, 0, 0 + vpsop %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vrp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7b] +#CHECK: vrp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7b] +#CHECK: vrp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7b] +#CHECK: vrp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7b] +#CHECK: vrp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7b] +#CHECK: vrp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7b] + + vrp %v0, %v0, %v0, 0, 0 + vrp %v0, %v0, %v0, 0, 15 + vrp %v0, %v0, %v0, 255, 0 + vrp %v0, %v0, %v31, 0, 0 + vrp %v0, %v31, %v0, 0, 0 + vrp %v31, %v0, %v0, 0, 0 + vrp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsdp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x7e] +#CHECK: vsdp %v0, %v0, %v0, 255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x7e] +#CHECK: vsdp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x7e] +#CHECK: vsdp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x7e] +#CHECK: vsdp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x7e] +#CHECK: vsdp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x7e] + + vsdp %v0, %v0, %v0, 0, 0 + vsdp %v0, %v0, %v0, 0, 15 + vsdp %v0, %v0, %v0, 255, 0 + vsdp %v0, %v0, %v31, 0, 0 + vsdp %v0, %v31, %v0, 0, 0 + vsdp %v31, %v0, %v0, 0, 0 + vsdp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsp %v0, %v0, %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x73] +#CHECK: vsp %v0, %v0, %v0, 
255, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x73] +#CHECK: vsp %v0, %v0, %v31, 0, 0 # encoding: [0xe6,0x00,0xf0,0x00,0x02,0x73] +#CHECK: vsp %v0, %v31, %v0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x73] +#CHECK: vsp %v31, %v0, %v0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x73] +#CHECK: vsp %v13, %v17, %v21, 121, 11 # encoding: [0xe6,0xd1,0x50,0xb7,0x96,0x73] + + vsp %v0, %v0, %v0, 0, 0 + vsp %v0, %v0, %v0, 0, 15 + vsp %v0, %v0, %v0, 255, 0 + vsp %v0, %v0, %v31, 0, 0 + vsp %v0, %v31, %v0, 0, 0 + vsp %v31, %v0, %v0, 0, 0 + vsp %v13, %v17, %v21, 0x79, 11 + +#CHECK: vsrp %v0, %v0, 0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 0, 15 # encoding: [0xe6,0x00,0x00,0xf0,0x00,0x59] +#CHECK: vsrp %v0, %v0, 0, 255, 0 # encoding: [0xe6,0x00,0xff,0x00,0x00,0x59] +#CHECK: vsrp %v0, %v0, 255, 0, 0 # encoding: [0xe6,0x00,0x00,0x0f,0xf0,0x59] +#CHECK: vsrp %v0, %v31, 0, 0, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x59] +#CHECK: vsrp %v31, %v0, 0, 0, 0 # encoding: [0xe6,0xf0,0x00,0x00,0x08,0x59] +#CHECK: vsrp %v13, %v17, 52, 121, 11 # encoding: [0xe6,0xd1,0x79,0xb3,0x44,0x59] + + vsrp %v0, %v0, 0, 0, 0 + vsrp %v0, %v0, 0, 0, 15 + vsrp %v0, %v0, 0, 255, 0 + vsrp %v0, %v0, 255, 0, 0 + vsrp %v0, %v31, 0, 0, 0 + vsrp %v31, %v0, 0, 0, 0 + vsrp %v13, %v17, 0x34, 0x79, 11 + +#CHECK: vstrl %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3d] +#CHECK: vstrl %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3d] +#CHECK: vstrl %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3d] +#CHECK: vstrl %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3d] +#CHECK: vstrl %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3d] +#CHECK: vstrl %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3d] + + vstrl %v0, 0, 0 + vstrl %v0, 4095, 0 + vstrl %v0, 0(%r15), 0 + vstrl %v0, 0, 255 + vstrl %v15, 0, 0 + vstrl %v31, 0, 0 + vstrl %v18, 1383(%r4), 3 + +#CHECK: vstrlr %v0, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 4095 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3f] +#CHECK: vstrlr %v0, %r0, 0(%r15) # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3f] +#CHECK: vstrlr %v0, %r15, 0 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x3f] +#CHECK: vstrlr %v15, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3f] +#CHECK: vstrlr %v31, %r0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3f] +#CHECK: vstrlr %v18, %r3, 1383(%r4) # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3f] + + vstrlr %v0, %r0, 0 + vstrlr %v0, %r0, 4095 + vstrlr %v0, %r0, 0(%r15) + vstrlr %v0, %r15, 0 + vstrlr %v15, %r0, 0 + vstrlr %v31, %r0, 0 + vstrlr %v18, %r3, 1383(%r4) + +#CHECK: vtp %v0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v15 # encoding: [0xe6,0x0f,0x00,0x00,0x00,0x5f] +#CHECK: vtp %v31 # encoding: [0xe6,0x0f,0x00,0x00,0x04,0x5f] + + vtp %v0 + vtp %v15 + vtp %v31 + +#CHECK: vupkz %v0, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 4095, 0 # encoding: [0xe6,0x00,0x0f,0xff,0x00,0x3c] +#CHECK: vupkz %v0, 0(%r15), 0 # encoding: [0xe6,0x00,0xf0,0x00,0x00,0x3c] +#CHECK: vupkz %v0, 0, 255 # encoding: [0xe6,0xff,0x00,0x00,0x00,0x3c] +#CHECK: vupkz %v15, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf0,0x3c] +#CHECK: vupkz %v31, 0, 0 # encoding: [0xe6,0x00,0x00,0x00,0xf1,0x3c] +#CHECK: vupkz %v18, 1383(%r4), 3 # encoding: [0xe6,0x03,0x45,0x67,0x21,0x3c] + + vupkz %v0, 0, 0 + vupkz %v0, 4095, 0 + vupkz %v0, 0(%r15), 0 + vupkz %v0, 0, 255 + vupkz %v15, 0, 0 + vupkz %v31, 0, 0 + vupkz %v18, 1383(%r4), 3 + +#CHECK: wfasb 
%f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe3] +#CHECK: wfasb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe3] +#CHECK: wfasb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe3] +#CHECK: wfasb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe3] +#CHECK: wfasb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe3] + + wfasb %v0, %v0, %v0 + wfasb %f0, %f0, %f0 + wfasb %v0, %v0, %v31 + wfasb %v0, %v31, %v0 + wfasb %v31, %v0, %v0 + wfasb %v18, %v3, %v20 + +#CHECK: wfaxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe3] +#CHECK: wfaxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe3] +#CHECK: wfaxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe3] +#CHECK: wfaxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe3] +#CHECK: wfaxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe3] + + wfaxb %v0, %v0, %v0 + wfaxb %v0, %v0, %v31 + wfaxb %v0, %v31, %v0 + wfaxb %v31, %v0, %v0 + wfaxb %v18, %v3, %v20 + +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xcb] +#CHECK: wfcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xcb] +#CHECK: wfcsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xcb] +#CHECK: wfcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xcb] + + wfcsb %v0, %v0 + wfcsb %f0, %f0 + wfcsb %v0, %v15 + wfcsb %v0, %v31 + wfcsb %v15, %v0 + wfcsb %v31, %v0 + wfcsb %v14, %v17 + +#CHECK: wfcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xcb] +#CHECK: wfcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xcb] +#CHECK: wfcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xcb] +#CHECK: wfcxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xcb] + + wfcxb %v0, %v0 + wfcxb %v0, %v15 + wfcxb %v0, %v31 + wfcxb %v15, %v0 + wfcxb %v31, %v0 + wfcxb %v14, %v17 + +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe8] +#CHECK: wfcesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe8] +#CHECK: wfcesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe8] +#CHECK: wfcesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe8] +#CHECK: wfcesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe8] + + wfcesb %v0, %v0, %v0 + wfcesb %f0, %f0, %f0 + wfcesb %v0, %v0, %v31 + wfcesb %v0, %v31, %v0 + wfcesb %v31, %v0, %v0 + wfcesb %v18, %v3, %v20 + +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xe8] +#CHECK: wfcesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xe8] +#CHECK: wfcesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xe8] +#CHECK: wfcesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xe8] +#CHECK: wfcesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xe8] + + wfcesbs %v0, %v0, %v0 + wfcesbs %f0, %f0, %f0 + wfcesbs %v0, %v0, %v31 + wfcesbs %v0, %v31, %v0 + wfcesbs %v31, %v0, %v0 + wfcesbs %v18, %v3, %v20 + +#CHECK: wfcexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe8] +#CHECK: wfcexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe8] 
+#CHECK: wfcexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe8] +#CHECK: wfcexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe8] +#CHECK: wfcexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe8] + + wfcexb %v0, %v0, %v0 + wfcexb %v0, %v0, %v31 + wfcexb %v0, %v31, %v0 + wfcexb %v31, %v0, %v0 + wfcexb %v18, %v3, %v20 + +#CHECK: wfcexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xe8] +#CHECK: wfcexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xe8] +#CHECK: wfcexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xe8] +#CHECK: wfcexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xe8] +#CHECK: wfcexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xe8] + + wfcexbs %v0, %v0, %v0 + wfcexbs %v0, %v0, %v31 + wfcexbs %v0, %v31, %v0 + wfcexbs %v31, %v0, %v0 + wfcexbs %v18, %v3, %v20 + +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xeb] +#CHECK: wfchsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xeb] +#CHECK: wfchsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xeb] +#CHECK: wfchsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xeb] +#CHECK: wfchsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xeb] + + wfchsb %v0, %v0, %v0 + wfchsb %f0, %f0, %f0 + wfchsb %v0, %v0, %v31 + wfchsb %v0, %v31, %v0 + wfchsb %v31, %v0, %v0 + wfchsb %v18, %v3, %v20 + +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xeb] +#CHECK: wfchsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xeb] +#CHECK: wfchsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xeb] +#CHECK: wfchsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xeb] +#CHECK: wfchsbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xeb] + + wfchsbs %v0, %v0, %v0 + wfchsbs %f0, %f0, %f0 + wfchsbs %v0, %v0, %v31 + wfchsbs %v0, %v31, %v0 + wfchsbs %v31, %v0, %v0 + wfchsbs %v18, %v3, %v20 + +#CHECK: wfchxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xeb] +#CHECK: wfchxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xeb] +#CHECK: wfchxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xeb] +#CHECK: wfchxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xeb] +#CHECK: wfchxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xeb] + + wfchxb %v0, %v0, %v0 + wfchxb %v0, %v0, %v31 + wfchxb %v0, %v31, %v0 + wfchxb %v31, %v0, %v0 + wfchxb %v18, %v3, %v20 + +#CHECK: wfchxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xeb] +#CHECK: wfchxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xeb] +#CHECK: wfchxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xeb] +#CHECK: wfchxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xeb] +#CHECK: wfchxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xeb] + + wfchxbs %v0, %v0, %v0 + wfchxbs %v0, %v0, %v31 + wfchxbs %v0, %v31, %v0 + wfchxbs %v31, %v0, %v0 + wfchxbs %v18, %v3, %v20 + +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xea] +#CHECK: wfchesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xea] +#CHECK: wfchesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xea] +#CHECK: wfchesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xea] +#CHECK: wfchesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xea] + + wfchesb %v0, %v0, %v0 + 
wfchesb %f0, %f0, %f0 + wfchesb %v0, %v0, %v31 + wfchesb %v0, %v31, %v0 + wfchesb %v31, %v0, %v0 + wfchesb %v18, %v3, %v20 + +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xea] +#CHECK: wfchesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x22,0xea] +#CHECK: wfchesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xea] +#CHECK: wfchesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xea] +#CHECK: wfchesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x2a,0xea] + + wfchesbs %v0, %v0, %v0 + wfchesbs %f0, %f0, %f0 + wfchesbs %v0, %v0, %v31 + wfchesbs %v0, %v31, %v0 + wfchesbs %v31, %v0, %v0 + wfchesbs %v18, %v3, %v20 + +#CHECK: wfchexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xea] +#CHECK: wfchexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xea] +#CHECK: wfchexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xea] +#CHECK: wfchexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xea] +#CHECK: wfchexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xea] + + wfchexb %v0, %v0, %v0 + wfchexb %v0, %v0, %v31 + wfchexb %v0, %v31, %v0 + wfchexb %v31, %v0, %v0 + wfchexb %v18, %v3, %v20 + +#CHECK: wfchexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xea] +#CHECK: wfchexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x42,0xea] +#CHECK: wfchexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xea] +#CHECK: wfchexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xea] +#CHECK: wfchexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x4a,0xea] + + wfchexbs %v0, %v0, %v0 + wfchexbs %v0, %v0, %v31 + wfchexbs %v0, %v31, %v0 + wfchexbs %v31, %v0, %v0 + wfchexbs %v18, %v3, %v20 + +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe5] +#CHECK: wfdsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe5] +#CHECK: wfdsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe5] +#CHECK: wfdsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe5] +#CHECK: wfdsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe5] + + wfdsb %v0, %v0, %v0 + wfdsb %f0, %f0, %f0 + wfdsb %v0, %v0, %v31 + wfdsb %v0, %v31, %v0 + wfdsb %v31, %v0, %v0 + wfdsb %v18, %v3, %v20 + +#CHECK: wfdxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe5] +#CHECK: wfdxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe5] +#CHECK: wfdxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe5] +#CHECK: wfdxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe5] +#CHECK: wfdxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe5] + + wfdxb %v0, %v0, %v0 + wfdxb %v0, %v0, %v31 + wfdxb %v0, %v31, %v0 + wfdxb %v31, %v0, %v0 + wfdxb %v18, %v3, %v20 + +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xc7] +#CHECK: wfisb %f0, %f0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x20,0xc7] +#CHECK: wfisb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc7] +#CHECK: wfisb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc7] +#CHECK: wfisb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x24,0xc7] + + wfisb %v0, %v0, 0, 0 + wfisb %f0, %f0, 0, 0 + wfisb %v0, %v0, 0, 15 + wfisb %v0, %v0, 4, 0 + wfisb %v0, 
%v0, 7, 0 + wfisb %v0, %v31, 0, 0 + wfisb %v31, %v0, 0, 0 + wfisb %v14, %v17, 4, 10 + +#CHECK: wfixb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc7] +#CHECK: wfixb %v0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc7] +#CHECK: wfixb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc7] +#CHECK: wfixb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc7] +#CHECK: wfixb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc7] + + wfixb %v0, %v0, 0, 0 + wfixb %v0, %v0, 0, 15 + wfixb %v0, %v0, 4, 0 + wfixb %v0, %v0, 7, 0 + wfixb %v0, %v31, 0, 0 + wfixb %v31, %v0, 0, 0 + wfixb %v14, %v17, 4, 10 + +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xca] +#CHECK: wfksb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xca] +#CHECK: wfksb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xca] +#CHECK: wfksb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xca] +#CHECK: wfksb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xca] + + wfksb %v0, %v0 + wfksb %f0, %f0 + wfksb %v0, %v15 + wfksb %v0, %v31 + wfksb %v15, %v0 + wfksb %v31, %v0 + wfksb %v14, %v17 + +#CHECK: wfkxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xca] +#CHECK: wfkxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x40,0xca] +#CHECK: wfkxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xca] +#CHECK: wfkxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x44,0xca] + + wfkxb %v0, %v0 + wfkxb %v0, %v15 + wfkxb %v0, %v31 + wfkxb %v15, %v0 + wfkxb %v31, %v0 + wfkxb %v14, %v17 + +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xe8] +#CHECK: wfkedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xe8] +#CHECK: wfkedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xe8] +#CHECK: wfkedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xe8] +#CHECK: wfkedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xe8] + + wfkedb %v0, %v0, %v0 + wfkedb %f0, %f0, %f0 + wfkedb %v0, %v0, %v31 + wfkedb %v0, %v31, %v0 + wfkedb %v31, %v0, %v0 + wfkedb %v18, %v3, %v20 + +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xe8] +#CHECK: wfkedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xe8] +#CHECK: wfkedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xe8] +#CHECK: wfkedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xe8] +#CHECK: wfkedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xe8] + + wfkedbs %v0, %v0, %v0 + wfkedbs %f0, %f0, %f0 + wfkedbs %v0, %v0, %v31 + wfkedbs %v0, %v31, %v0 + wfkedbs %v31, %v0, %v0 + wfkedbs %v18, %v3, %v20 + +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xe8] +#CHECK: wfkesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xe8] +#CHECK: wfkesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xe8] +#CHECK: wfkesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xe8] +#CHECK: wfkesb %v18, %f3, %v20 # encoding: 
[0xe7,0x23,0x40,0x0c,0x2a,0xe8] + + wfkesb %v0, %v0, %v0 + wfkesb %f0, %f0, %f0 + wfkesb %v0, %v0, %v31 + wfkesb %v0, %v31, %v0 + wfkesb %v31, %v0, %v0 + wfkesb %v18, %v3, %v20 + +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xe8] +#CHECK: wfkesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xe8] +#CHECK: wfkesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xe8] +#CHECK: wfkesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xe8] +#CHECK: wfkesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xe8] + + wfkesbs %v0, %v0, %v0 + wfkesbs %f0, %f0, %f0 + wfkesbs %v0, %v0, %v31 + wfkesbs %v0, %v31, %v0 + wfkesbs %v31, %v0, %v0 + wfkesbs %v18, %v3, %v20 + +#CHECK: wfkexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xe8] +#CHECK: wfkexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xe8] +#CHECK: wfkexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xe8] +#CHECK: wfkexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xe8] +#CHECK: wfkexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xe8] + + wfkexb %v0, %v0, %v0 + wfkexb %v0, %v0, %v31 + wfkexb %v0, %v31, %v0 + wfkexb %v31, %v0, %v0 + wfkexb %v18, %v3, %v20 + +#CHECK: wfkexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xe8] +#CHECK: wfkexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xe8] +#CHECK: wfkexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xe8] +#CHECK: wfkexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xe8] +#CHECK: wfkexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xe8] + + wfkexbs %v0, %v0, %v0 + wfkexbs %v0, %v0, %v31 + wfkexbs %v0, %v31, %v0 + wfkexbs %v31, %v0, %v0 + wfkexbs %v18, %v3, %v20 + +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xeb] +#CHECK: wfkhdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xeb] +#CHECK: wfkhdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xeb] +#CHECK: wfkhdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xeb] +#CHECK: wfkhdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xeb] + + wfkhdb %v0, %v0, %v0 + wfkhdb %f0, %f0, %f0 + wfkhdb %v0, %v0, %v31 + wfkhdb %v0, %v31, %v0 + wfkhdb %v31, %v0, %v0 + wfkhdb %v18, %v3, %v20 + +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xeb] +#CHECK: wfkhdbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xeb] +#CHECK: wfkhdbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xeb] +#CHECK: wfkhdbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xeb] +#CHECK: wfkhdbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xeb] + + wfkhdbs %v0, %v0, %v0 + wfkhdbs %f0, %f0, %f0 + wfkhdbs %v0, %v0, %v31 + wfkhdbs %v0, %v31, %v0 + wfkhdbs %v31, %v0, %v0 + wfkhdbs %v18, %v3, %v20 + +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xeb] +#CHECK: wfkhsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xeb] +#CHECK: wfkhsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xeb] +#CHECK: wfkhsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xeb] +#CHECK: wfkhsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xeb] + + wfkhsb %v0, %v0, %v0 + wfkhsb %f0, %f0, %f0 + wfkhsb %v0, %v0, %v31 + wfkhsb %v0, %v31, %v0 + wfkhsb 
%v31, %v0, %v0 + wfkhsb %v18, %v3, %v20 + +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xeb] +#CHECK: wfkhsbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xeb] +#CHECK: wfkhsbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xeb] +#CHECK: wfkhsbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xeb] +#CHECK: wfkhsbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xeb] + + wfkhsbs %v0, %v0, %v0 + wfkhsbs %f0, %f0, %f0 + wfkhsbs %v0, %v0, %v31 + wfkhsbs %v0, %v31, %v0 + wfkhsbs %v31, %v0, %v0 + wfkhsbs %v18, %v3, %v20 + +#CHECK: wfkhxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xeb] +#CHECK: wfkhxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xeb] +#CHECK: wfkhxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xeb] +#CHECK: wfkhxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xeb] +#CHECK: wfkhxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xeb] + + wfkhxb %v0, %v0, %v0 + wfkhxb %v0, %v0, %v31 + wfkhxb %v0, %v31, %v0 + wfkhxb %v31, %v0, %v0 + wfkhxb %v18, %v3, %v20 + +#CHECK: wfkhxbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xeb] +#CHECK: wfkhxbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xeb] +#CHECK: wfkhxbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xeb] +#CHECK: wfkhxbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xeb] +#CHECK: wfkhxbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xeb] + + wfkhxbs %v0, %v0, %v0 + wfkhxbs %v0, %v0, %v31 + wfkhxbs %v0, %v31, %v0 + wfkhxbs %v31, %v0, %v0 + wfkhxbs %v18, %v3, %v20 + +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xea] +#CHECK: wfkhedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x32,0xea] +#CHECK: wfkhedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x34,0xea] +#CHECK: wfkhedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x38,0xea] +#CHECK: wfkhedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x3a,0xea] + + wfkhedb %v0, %v0, %v0 + wfkhedb %f0, %f0, %f0 + wfkhedb %v0, %v0, %v31 + wfkhedb %v0, %v31, %v0 + wfkhedb %v31, %v0, %v0 + wfkhedb %v18, %v3, %v20 + +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x30,0xea] +#CHECK: wfkhedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x32,0xea] +#CHECK: wfkhedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x34,0xea] +#CHECK: wfkhedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x38,0xea] +#CHECK: wfkhedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x3a,0xea] + + wfkhedbs %v0, %v0, %v0 + wfkhedbs %f0, %f0, %f0 + wfkhedbs %v0, %v0, %v31 + wfkhedbs %v0, %v31, %v0 + wfkhedbs %v31, %v0, %v0 + wfkhedbs %v18, %v3, %v20 + +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x0c,0x20,0xea] +#CHECK: wfkhesb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x22,0xea] +#CHECK: wfkhesb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x0c,0x24,0xea] +#CHECK: wfkhesb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x0c,0x28,0xea] +#CHECK: wfkhesb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x2a,0xea] + + wfkhesb %v0, %v0, %v0 + wfkhesb %f0, %f0, %f0 + wfkhesb %v0, %v0, %v31 + wfkhesb %v0, %v31, %v0 + wfkhesb %v31, %v0, %v0 + wfkhesb %v18, %v3, %v20 + +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: 
[0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x1c,0x20,0xea] +#CHECK: wfkhesbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x22,0xea] +#CHECK: wfkhesbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x1c,0x24,0xea] +#CHECK: wfkhesbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x1c,0x28,0xea] +#CHECK: wfkhesbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x2a,0xea] + + wfkhesbs %v0, %v0, %v0 + wfkhesbs %f0, %f0, %f0 + wfkhesbs %v0, %v0, %v31 + wfkhesbs %v0, %v31, %v0 + wfkhesbs %v31, %v0, %v0 + wfkhesbs %v18, %v3, %v20 + +#CHECK: wfkhexb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xea] +#CHECK: wfkhexb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x0c,0x42,0xea] +#CHECK: wfkhexb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x0c,0x44,0xea] +#CHECK: wfkhexb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x0c,0x48,0xea] +#CHECK: wfkhexb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x0c,0x4a,0xea] + + wfkhexb %v0, %v0, %v0 + wfkhexb %v0, %v0, %v31 + wfkhexb %v0, %v31, %v0 + wfkhexb %v31, %v0, %v0 + wfkhexb %v18, %v3, %v20 + +#CHECK: wfkhexbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x1c,0x40,0xea] +#CHECK: wfkhexbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x1c,0x42,0xea] +#CHECK: wfkhexbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x1c,0x44,0xea] +#CHECK: wfkhexbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x1c,0x48,0xea] +#CHECK: wfkhexbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x1c,0x4a,0xea] + + wfkhexbs %v0, %v0, %v0 + wfkhexbs %v0, %v0, %v31 + wfkhexbs %v0, %v31, %v0 + wfkhexbs %v31, %v0, %v0 + wfkhexbs %v18, %v3, %v20 + +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %f0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x20,0xcc] +#CHECK: wfpsosb %f0, %f15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %f0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x24,0xcc] +#CHECK: wfpsosb %f15, %f0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x20,0xcc] +#CHECK: wfpsosb %v31, %f0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x28,0xcc] +#CHECK: wfpsosb %f14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x24,0xcc] + + wfpsosb %v0, %v0, 3 + wfpsosb %f0, %f0, 3 + wfpsosb %v0, %v0, 15 + wfpsosb %v0, %v15, 3 + wfpsosb %v0, %v31, 3 + wfpsosb %v15, %v0, 3 + wfpsosb %v31, %v0, 3 + wfpsosb %v14, %v17, 7 + +#CHECK: wfpsoxb %v0, %v0, 3 # encoding: [0xe7,0x00,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v15, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v0, %v31, 3 # encoding: [0xe7,0x0f,0x00,0x38,0x44,0xcc] +#CHECK: wfpsoxb %v15, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x40,0xcc] +#CHECK: wfpsoxb %v31, %v0, 3 # encoding: [0xe7,0xf0,0x00,0x38,0x48,0xcc] +#CHECK: wfpsoxb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x44,0xcc] + + wfpsoxb %v0, %v0, 3 + wfpsoxb %v0, %v0, 15 + wfpsoxb %v0, %v15, 3 + wfpsoxb %v0, %v31, 3 + wfpsoxb %v15, %v0, 3 + wfpsoxb %v31, %v0, 3 + wfpsoxb %v14, %v17, 7 + +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xcc] +#CHECK: wflcsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xcc] +#CHECK: wflcsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xcc] +#CHECK: wflcsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xcc] 
+ + wflcsb %v0, %v0 + wflcsb %f0, %f0 + wflcsb %v0, %v15 + wflcsb %v0, %v31 + wflcsb %v15, %v0 + wflcsb %v31, %v0 + wflcsb %v14, %v17 + +#CHECK: wflcxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xcc] +#CHECK: wflcxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xcc] +#CHECK: wflcxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xcc] +#CHECK: wflcxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x44,0xcc] + + wflcxb %v0, %v0 + wflcxb %v0, %v15 + wflcxb %v0, %v31 + wflcxb %v15, %v0 + wflcxb %v31, %v0 + wflcxb %v14, %v17 + +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x24,0xcc] +#CHECK: wflnsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x20,0xcc] +#CHECK: wflnsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x28,0xcc] +#CHECK: wflnsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x24,0xcc] + + wflnsb %v0, %v0 + wflnsb %f0, %f0 + wflnsb %v0, %v15 + wflnsb %v0, %v31 + wflnsb %v15, %v0 + wflnsb %v31, %v0 + wflnsb %v14, %v17 + +#CHECK: wflnxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x44,0xcc] +#CHECK: wflnxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x40,0xcc] +#CHECK: wflnxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x18,0x48,0xcc] +#CHECK: wflnxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x44,0xcc] + + wflnxb %v0, %v0 + wflnxb %v0, %v15 + wflnxb %v0, %v31 + wflnxb %v15, %v0 + wflnxb %v31, %v0 + wflnxb %v14, %v17 + +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x24,0xcc] +#CHECK: wflpsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x20,0xcc] +#CHECK: wflpsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x28,0xcc] +#CHECK: wflpsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x24,0xcc] + + wflpsb %v0, %v0 + wflpsb %f0, %f0 + wflpsb %v0, %v15 + wflpsb %v0, %v31 + wflpsb %v15, %v0 + wflpsb %v31, %v0 + wflpsb %v14, %v17 + +#CHECK: wflpxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x44,0xcc] +#CHECK: wflpxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x40,0xcc] +#CHECK: wflpxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x28,0x48,0xcc] +#CHECK: wflpxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x44,0xcc] + + wflpxb %v0, %v0 + wflpxb %v0, %v15 + wflpxb %v0, %v31 + wflpxb %v15, %v0 + wflpxb %v31, %v0 + wflpxb %v14, %v17 + +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xc4] +#CHECK: wflls %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc4] +#CHECK: wflls %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xc4] +#CHECK: wflls %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc4] +#CHECK: wflls %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xc4] + + wflls %v0, %v0 + wflls %f0, %f0 + wflls %v0, %v15 + wflls %v0, %v31 
+ wflls %v15, %v0 + wflls %v31, %v0 + wflls %v14, %v17 + +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc4] +#CHECK: wflld %v15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xc4] +#CHECK: wflld %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc4] +#CHECK: wflld %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xc4] + + wflld %v0, %v0 + wflld %v0, %f0 + wflld %v0, %v15 + wflld %v0, %v31 + wflld %v15, %v0 + wflld %v31, %v0 + wflld %v14, %v17 + +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] +#CHECK: wflrd %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc5] +#CHECK: wflrd %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc5] +#CHECK: wflrd %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc5] + + wflrd %v0, %v0, 0, 0 + wflrd %f0, %f0, 0, 0 + wflrd %v0, %v0, 0, 15 + wflrd %v0, %v0, 4, 0 + wflrd %v0, %v0, 12, 0 + wflrd %v0, %v31, 0, 0 + wflrd %v31, %v0, 0, 0 + wflrd %v14, %v17, 4, 10 + +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x40,0xc5] +#CHECK: wflrx %f0, %v0, 7, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x40,0xc5] +#CHECK: wflrx %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xc5] +#CHECK: wflrx %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xc5] +#CHECK: wflrx %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x44,0xc5] + + wflrx %v0, %v0, 0, 0 + wflrx %f0, %v0, 0, 0 + wflrx %v0, %v0, 0, 15 + wflrx %v0, %v0, 4, 0 + wflrx %v0, %v0, 7, 0 + wflrx %v0, %v31, 0, 0 + wflrx %v31, %v0, 0, 0 + wflrx %v14, %v17, 4, 10 + +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xef] +#CHECK: wfmaxdb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xef] +#CHECK: wfmaxdb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xef] +#CHECK: wfmaxdb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xef] +#CHECK: wfmaxdb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x3a,0xef] + + wfmaxdb %v0, %v0, %v0, 0 + wfmaxdb %f0, %f0, %f0, 0 + wfmaxdb %v0, %v0, %v0, 4 + wfmaxdb %v0, %v0, %v31, 0 + wfmaxdb %v0, %v31, %v0, 0 + wfmaxdb %v31, %v0, %v0, 0 + wfmaxdb %v18, %v3, %v20, 11 + +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xef] +#CHECK: wfmaxsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xef] +#CHECK: wfmaxsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xef] +#CHECK: wfmaxsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xef] +#CHECK: wfmaxsb %v18, %f3, %v20, 11 # encoding: 
[0xe7,0x23,0x40,0xb8,0x2a,0xef] + + wfmaxsb %v0, %v0, %v0, 0 + wfmaxsb %f0, %f0, %f0, 0 + wfmaxsb %v0, %v0, %v0, 4 + wfmaxsb %v0, %v0, %v31, 0 + wfmaxsb %v0, %v31, %v0, 0 + wfmaxsb %v31, %v0, %v0, 0 + wfmaxsb %v18, %v3, %v20, 11 + +#CHECK: wfmaxxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xef] +#CHECK: wfmaxxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xef] +#CHECK: wfmaxxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xef] +#CHECK: wfmaxxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xef] +#CHECK: wfmaxxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xef] + + wfmaxxb %v0, %v0, %v0, 0 + wfmaxxb %v0, %v0, %v0, 4 + wfmaxxb %v0, %v0, %v31, 0 + wfmaxxb %v0, %v31, %v0, 0 + wfmaxxb %v31, %v0, %v0, 0 + wfmaxxb %v18, %v3, %v20, 11 + +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x30,0xee] +#CHECK: wfmindb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xee] +#CHECK: wfmindb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xee] +#CHECK: wfmindb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xee] +#CHECK: wfmindb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x3a,0xee] + + wfmindb %v0, %v0, %v0, 0 + wfmindb %f0, %f0, %f0, 0 + wfmindb %v0, %v0, %v0, 4 + wfmindb %v0, %v0, %v31, 0 + wfmindb %v0, %v31, %v0, 0 + wfmindb %v31, %v0, %v0, 0 + wfmindb %v18, %v3, %v20, 11 + +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %f0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x20,0xee] +#CHECK: wfminsb %f0, %f0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xee] +#CHECK: wfminsb %f0, %v31, %f0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xee] +#CHECK: wfminsb %v31, %f0, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xee] +#CHECK: wfminsb %v18, %f3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x2a,0xee] + + wfminsb %v0, %v0, %v0, 0 + wfminsb %f0, %f0, %f0, 0 + wfminsb %v0, %v0, %v0, 4 + wfminsb %v0, %v0, %v31, 0 + wfminsb %v0, %v31, %v0, 0 + wfminsb %v31, %v0, %v0, 0 + wfminsb %v18, %v3, %v20, 11 + +#CHECK: wfminxb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v0, 4 # encoding: [0xe7,0x00,0x00,0x48,0x40,0xee] +#CHECK: wfminxb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xee] +#CHECK: wfminxb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xee] +#CHECK: wfminxb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xee] +#CHECK: wfminxb %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0xb8,0x4a,0xee] + + wfminxb %v0, %v0, %v0, 0 + wfminxb %v0, %v0, %v0, 4 + wfminxb %v0, %v0, %v31, 0 + wfminxb %v0, %v31, %v0, 0 + wfminxb %v31, %v0, %v0, 0 + wfminxb %v18, %v3, %v20, 11 + +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8f] +#CHECK: wfmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8f] +#CHECK: wfmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8f] +#CHECK: wfmasb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x8f] +#CHECK: wfmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8f] +#CHECK: wfmasb %f13, %v17, %v21, %v25 # encoding: 
[0xe7,0xd1,0x52,0x08,0x97,0x8f] + + wfmasb %v0, %v0, %v0, %v0 + wfmasb %f0, %f0, %f0, %f0 + wfmasb %v0, %v0, %v0, %v31 + wfmasb %v0, %v0, %v31, %v0 + wfmasb %v0, %v31, %v0, %v0 + wfmasb %v31, %v0, %v0, %v0 + wfmasb %v13, %v17, %v21, %v25 + +#CHECK: wfmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8f] +#CHECK: wfmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8f] +#CHECK: wfmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8f] +#CHECK: wfmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8f] +#CHECK: wfmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8f] +#CHECK: wfmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8f] + + wfmaxb %v0, %v0, %v0, %v0 + wfmaxb %v0, %v0, %v0, %v31 + wfmaxb %v0, %v0, %v31, %v0 + wfmaxb %v0, %v31, %v0, %v0 + wfmaxb %v31, %v0, %v0, %v0 + wfmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe7] +#CHECK: wfmsb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe7] +#CHECK: wfmsb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe7] +#CHECK: wfmsb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe7] +#CHECK: wfmsb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe7] + + wfmsb %v0, %v0, %v0 + wfmsb %f0, %f0, %f0 + wfmsb %v0, %v0, %v31 + wfmsb %v0, %v31, %v0 + wfmsb %v31, %v0, %v0 + wfmsb %v18, %v3, %v20 + +#CHECK: wfmxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe7] +#CHECK: wfmxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe7] +#CHECK: wfmxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe7] +#CHECK: wfmxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe7] +#CHECK: wfmxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe7] + + wfmxb %v0, %v0, %v0 + wfmxb %v0, %v0, %v31 + wfmxb %v0, %v31, %v0 + wfmxb %v31, %v0, %v0 + wfmxb %v18, %v3, %v20 + +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x8e] +#CHECK: wfmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x8e] +#CHECK: wfmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x8e] +#CHECK: wfmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x8e] +#CHECK: wfmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x8e] +#CHECK: wfmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x8e] + + wfmssb %v0, %v0, %v0, %v0 + wfmssb %f0, %f0, %f0, %f0 + wfmssb %v0, %v0, %v0, %v31 + wfmssb %v0, %v0, %v31, %v0 + wfmssb %v0, %v31, %v0, %v0 + wfmssb %v31, %v0, %v0, %v0 + wfmssb %v13, %v17, %v21, %v25 + +#CHECK: wfmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x8e] +#CHECK: wfmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x8e] +#CHECK: wfmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x8e] +#CHECK: wfmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x8e] +#CHECK: wfmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x8e] +#CHECK: wfmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x8e] + + wfmsxb %v0, %v0, %v0, %v0 + wfmsxb %v0, %v0, %v0, %v31 + wfmsxb %v0, %v0, %v31, %v0 + wfmsxb %v0, %v31, %v0, %v0 + wfmsxb %v31, %v0, %v0, %v0 + wfmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %f0 # encoding: 
[0xe7,0x00,0x03,0x08,0x00,0x9f] +#CHECK: wfnmadb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9f] +#CHECK: wfnmadb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9f] +#CHECK: wfnmadb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9f] +#CHECK: wfnmadb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9f] +#CHECK: wfnmadb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9f] + + wfnmadb %v0, %v0, %v0, %v0 + wfnmadb %f0, %f0, %f0, %f0 + wfnmadb %v0, %v0, %v0, %v31 + wfnmadb %v0, %v0, %v31, %v0 + wfnmadb %v0, %v31, %v0, %v0 + wfnmadb %v31, %v0, %v0, %v0 + wfnmadb %v13, %v17, %v21, %v25 + +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9f] +#CHECK: wfnmasb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9f] +#CHECK: wfnmasb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9f] +#CHECK: wfnmasb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9f] +#CHECK: wfnmasb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9f] +#CHECK: wfnmasb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9f] + + wfnmasb %v0, %v0, %v0, %v0 + wfnmasb %f0, %f0, %f0, %f0 + wfnmasb %v0, %v0, %v0, %v31 + wfnmasb %v0, %v0, %v31, %v0 + wfnmasb %v0, %v31, %v0, %v0 + wfnmasb %v31, %v0, %v0, %v0 + wfnmasb %v13, %v17, %v21, %v25 + +#CHECK: wfnmaxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9f] +#CHECK: wfnmaxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9f] +#CHECK: wfnmaxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9f] +#CHECK: wfnmaxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9f] +#CHECK: wfnmaxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9f] + + wfnmaxb %v0, %v0, %v0, %v0 + wfnmaxb %v0, %v0, %v0, %v31 + wfnmaxb %v0, %v0, %v31, %v0 + wfnmaxb %v0, %v31, %v0, %v0 + wfnmaxb %v31, %v0, %v0, %v0 + wfnmaxb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: wfnmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x9e] +#CHECK: wfnmsdb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x9e] +#CHECK: wfnmsdb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x9e] +#CHECK: wfnmsdb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x9e] +#CHECK: wfnmsdb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x9e] +#CHECK: wfnmsdb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x9e] + + wfnmsdb %v0, %v0, %v0, %v0 + wfnmsdb %f0, %f0, %f0, %f0 + wfnmsdb %v0, %v0, %v0, %v31 + wfnmsdb %v0, %v0, %v31, %v0 + wfnmsdb %v0, %v31, %v0, %v0 + wfnmsdb %v31, %v0, %v0, %v0 + wfnmsdb %v13, %v17, %v21, %v25 + +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x02,0x08,0x00,0x9e] +#CHECK: wfnmssb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x02,0x08,0xf1,0x9e] +#CHECK: wfnmssb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf2,0x08,0x02,0x9e] +#CHECK: wfnmssb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x02,0x08,0x04,0x9e] +#CHECK: wfnmssb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x02,0x08,0x08,0x9e] +#CHECK: wfnmssb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x08,0x97,0x9e] + + wfnmssb %v0, %v0, %v0, %v0 + wfnmssb %f0, %f0, %f0, %f0 + wfnmssb %v0, %v0, %v0, %v31 + 
wfnmssb %v0, %v0, %v31, %v0 + wfnmssb %v0, %v31, %v0, %v0 + wfnmssb %v31, %v0, %v0, %v0 + wfnmssb %v13, %v17, %v21, %v25 + +#CHECK: wfnmsxb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x08,0x00,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x08,0xf1,0x9e] +#CHECK: wfnmsxb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x08,0x02,0x9e] +#CHECK: wfnmsxb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x08,0x04,0x9e] +#CHECK: wfnmsxb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x08,0x08,0x9e] +#CHECK: wfnmsxb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x08,0x97,0x9e] + + wfnmsxb %v0, %v0, %v0, %v0 + wfnmsxb %v0, %v0, %v0, %v31 + wfnmsxb %v0, %v0, %v31, %v0 + wfnmsxb %v0, %v31, %v0, %v0 + wfnmsxb %v31, %v0, %v0, %v0 + wfnmsxb %v13, %v17, %v21, %v25 + +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xe2] +#CHECK: wfssb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x22,0xe2] +#CHECK: wfssb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xe2] +#CHECK: wfssb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xe2] +#CHECK: wfssb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x2a,0xe2] + + wfssb %v0, %v0, %v0 + wfssb %f0, %f0, %f0 + wfssb %v0, %v0, %v31 + wfssb %v0, %v31, %v0 + wfssb %v31, %v0, %v0 + wfssb %v18, %v3, %v20 + +#CHECK: wfsxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xe2] +#CHECK: wfsxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x42,0xe2] +#CHECK: wfsxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xe2] +#CHECK: wfsxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xe2] +#CHECK: wfsxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x4a,0xe2] + + wfsxb %v0, %v0, %v0 + wfsxb %v0, %v0, %v31 + wfsxb %v0, %v31, %v0 + wfsxb %v31, %v0, %v0 + wfsxb %v18, %v3, %v20 + +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xce] +#CHECK: wfsqsb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xce] +#CHECK: wfsqsb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xce] +#CHECK: wfsqsb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xce] + + wfsqsb %v0, %v0 + wfsqsb %f0, %f0 + wfsqsb %v0, %v15 + wfsqsb %v0, %v31 + wfsqsb %v15, %v0 + wfsqsb %v31, %v0 + wfsqsb %v14, %v17 + +#CHECK: wfsqxb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0xce] +#CHECK: wfsqxb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0xce] +#CHECK: wfsqxb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0xce] +#CHECK: wfsqxb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x44,0xce] + + wfsqxb %v0, %v0 + wfsqxb %v0, %v15 + wfsqxb %v0, %v31 + wfsqxb %v15, %v0 + wfsqxb %v31, %v0 + wfsqxb %v14, %v17 + +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %f0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x20,0x4a] +#CHECK: wftcisb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0x4a] +#CHECK: wftcisb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0x4a] +#CHECK: wftcisb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0x4a] 
+#CHECK: wftcisb %f4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x24,0x4a]
+
+ wftcisb %v0, %v0, 0
+ wftcisb %f0, %f0, 0
+ wftcisb %v0, %v0, 4095
+ wftcisb %v0, %v15, 0
+ wftcisb %v0, %v31, 0
+ wftcisb %v15, %v0, 0
+ wftcisb %v31, %v0, 0
+ wftcisb %v4, %v21, 0x678
+
+#CHECK: wftcixb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x40,0x4a]
+#CHECK: wftcixb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x40,0x4a]
+#CHECK: wftcixb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x40,0x4a]
+#CHECK: wftcixb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x44,0x4a]
+#CHECK: wftcixb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x40,0x4a]
+#CHECK: wftcixb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x48,0x4a]
+#CHECK: wftcixb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x44,0x4a]
+
+ wftcixb %v0, %v0, 0
+ wftcixb %v0, %v0, 4095
+ wftcixb %v0, %v15, 0
+ wftcixb %v0, %v31, 0
+ wftcixb %v15, %v0, 0
+ wftcixb %v31, %v0, 0
+ wftcixb %v4, %v21, 0x678
+
diff --git a/test/MC/SystemZ/invalid-instructions-spellcheck.s b/test/MC/SystemZ/invalid-instructions-spellcheck.s
new file mode 100644
index 000000000000..e77b99d9a387
--- /dev/null
+++ b/test/MC/SystemZ/invalid-instructions-spellcheck.s
@@ -0,0 +1,66 @@
+# RUN: not llvm-mc -triple=systemz -mcpu=z13 -show-encoding < %s 2>&1 | FileCheck %s
+# RUN: not llvm-mc -triple=systemz -mcpu=zEC12 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ZEC12
+
+# This tests the mnemonic spell checker.
+
+# First check what happens when an instruction is omitted:
+
+ %r1, %r2, %r3
+
+# CHECK: error: unexpected token at start of statement
+# CHECK-NEXT: %r1, %r2, %r3
+# CHECK-NEXT: ^
+
+# We don't want to see a suggestion here; the edit distance is too large to
+# give sensible suggestions:
+
+ aaaaaaaaaaaaaaa %r1, %r2, %r3
+
+# CHECK: error: invalid instruction
+# CHECK-NEXT: aaaaaaaaaaaaaaa %r1, %r2, %r3
+# CHECK-NEXT: ^
+
+# Check that we get one suggestion: 'cpdt' is 1 edit away, i.e. a deletion.
+
+ cpdtX %r1, 0(4, %r15), 0
+
+#CHECK: error: invalid instruction, did you mean: cpdt
+#CHECK-NEXT: cpdtX %r1, 0(4, %r15), 0
+#CHECK-NEXT: ^
+
+# Check edit distance 1 and 2:
+
+ ltTr %r1, %r2
+
+# CHECK: error: invalid instruction, did you mean: lr, lt, ltdr, ltdtr, lter, ltgr, ltr, ltxr, ltxtr, tr, trtr?
+# CHECK-NEXT: ltTr %r1, %r2
+# CHECK-NEXT: ^
+
+# Check edit distance 1 and 2, just insertions:
+
+ begin 0, 65292
+
+# CHECK: error: invalid instruction, did you mean: tbegin, tbeginc?
+# CHECK-NEXT: begin 0, 65292
+# CHECK-NEXT: ^
+
+# Check an instruction that is 2 edits away, and also has a lot of candidates:
+
+ adt %r1, 244(%r15)
+
+# CHECK: error: invalid instruction, did you mean: a, ad, adb, adr, adtr, adtra, d, lat, mad, qadtr?
+# CHECK-NEXT: adt %r1, 244(%r15)
+# CHECK-NEXT: ^
+
+# Check that we don't suggest instructions that are not supported. For example,
+# in pre-z13 mode we don't want to see suggestions for vector instructions.
+
+ vlvggp %v1, %r2, %r3
+
+# CHECK-ZEC12: error: invalid instruction
+# CHECK-ZEC12: vlvggp
+# CHECK-ZEC12: ^
+
+# CHECK: error: invalid instruction, did you mean: vlvg, vlvgg, vlvgp?
+# CHECK-NEXT: vlvggp %v1, %r2, %r3
+# CHECK-NEXT: ^
diff --git a/test/MC/X86/pr22028.s b/test/MC/X86/pr22028.s
index 6ce7da07488b..d82b50f051a1 100644
--- a/test/MC/X86/pr22028.s
+++ b/test/MC/X86/pr22028.s
@@ -1,6 +1,6 @@
 // RUN: llvm-mc -triple i386-unknown-unknown-code16 -show-encoding %s | FileCheck --check-prefix=CHECK16 %s
-// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s
-// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck --check-prefix=CHECK %s
+// RUN: llvm-mc -triple i386-unknown-unknown -show-encoding %s | FileCheck %s
+// RUN: llvm-mc -triple i686-unknown-unknown -show-encoding %s | FileCheck %s
 
 .intel_syntax
 
diff --git a/test/Object/no-section-table.test b/test/Object/no-section-table.test
index bd60e681b71f..9ecde4f8c369 100644
--- a/test/Object/no-section-table.test
+++ b/test/Object/no-section-table.test
@@ -3,7 +3,7 @@ RUN: | FileCheck %s
 CHECK: DynamicSection [ (24 entries)
 CHECK: Tag Type Name/Value
-CHECK: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6)
+CHECK: 0x0000000000000001 NEEDED Shared library: [libc.so.6]
 CHECK: 0x000000000000000C INIT 0x4B8
 CHECK: 0x000000000000000D FINI 0x618
 CHECK: 0x0000000000000019 INIT_ARRAY 0x2006C0
diff --git a/test/Object/readobj-shared-object.test b/test/Object/readobj-shared-object.test
index 173581e60c39..59f5ff127cf3 100644
--- a/test/Object/readobj-shared-object.test
+++ b/test/Object/readobj-shared-object.test
@@ -302,9 +302,9 @@ ELF: ]
 ELF32: DynamicSection [ (9 entries)
 ELF32: Tag Type Name/Value
-ELF32: 0x00000001 NEEDED SharedLibrary (libc.so.6)
-ELF32: 0x00000001 NEEDED SharedLibrary (libm.so.6)
-ELF32: 0x0000000E SONAME LibrarySoname (libfoo.so)
+ELF32: 0x00000001 NEEDED Shared library: [libc.so.6]
+ELF32: 0x00000001 NEEDED Shared library: [libm.so.6]
+ELF32: 0x0000000E SONAME Library soname: [libfoo.so]
 ELF32: 0x00000004 HASH {{[0-9a-f]+}}
 ELF32: 0x00000005 STRTAB {{[0-9a-f]+}}
 ELF32: 0x00000006 SYMTAB {{[0-9a-f]+}}
@@ -315,9 +315,9 @@ ELF32: ]
 ELF64: DynamicSection [ (9 entries)
 ELF64: Tag Type Name/Value
-ELF64: 0x0000000000000001 NEEDED SharedLibrary (libc.so.6)
-ELF64: 0x0000000000000001 NEEDED SharedLibrary (libm.so.6)
-ELF64: 0x000000000000000E SONAME LibrarySoname (libfoo.so)
+ELF64: 0x0000000000000001 NEEDED Shared library: [libc.so.6]
+ELF64: 0x0000000000000001 NEEDED Shared library: [libm.so.6]
+ELF64: 0x000000000000000E SONAME Library soname: [libfoo.so]
 ELF64: 0x0000000000000004 HASH {{[0-9a-f]+}}
 ELF64: 0x0000000000000005 STRTAB {{[0-9a-f]+}}
 ELF64: 0x0000000000000006 SYMTAB {{[0-9a-f]+}}
diff --git a/test/ObjectYAML/CodeView/guid.yaml b/test/ObjectYAML/CodeView/guid.yaml
new file mode 100644
index 000000000000..8d8d0142c5e3
--- /dev/null
+++ b/test/ObjectYAML/CodeView/guid.yaml
@@ -0,0 +1,59 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !COFF
+header:
+  Machine: IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [ ]
+sections:
+  - Name: '.debug$T'
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    Alignment: 1
+    Types:
+      - Kind: LF_TYPESERVER2
+        TypeServer2:
+          Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}'
+          Age: 24
+          Name: 'C:\src\llvm-project\build\vc140.pdb'
+symbols:
+  - Name: '.debug$T'
+    Value: 0
+    SectionNumber: 1
+    SimpleType: IMAGE_SYM_TYPE_NULL
+    ComplexType: IMAGE_SYM_DTYPE_NULL
+    StorageClass: IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length: 64
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum: 0
+      Number: 0
+...
+
+# CHECK: --- !COFF
+# CHECK: header:
+# CHECK: Machine: IMAGE_FILE_MACHINE_AMD64
+# CHECK: Characteristics: [ ]
+# CHECK: sections:
+# CHECK: - Name: '.debug$T'
+# CHECK: Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+# CHECK: Alignment: 1
+# CHECK: Types:
+# CHECK: - Kind: LF_TYPESERVER2
+# CHECK: TypeServer2:
+# CHECK: Guid: '{01DF191B-22BF-6B42-96CE-5258B8329FE5}'
+# CHECK: Age: 24
+# CHECK: Name: 'C:\src\llvm-project\build\vc140.pdb'
+# CHECK: symbols:
+# CHECK: - Name: '.debug$T'
+# CHECK: Value: 0
+# CHECK: SectionNumber: 1
+# CHECK: SimpleType: IMAGE_SYM_TYPE_NULL
+# CHECK: ComplexType: IMAGE_SYM_DTYPE_NULL
+# CHECK: StorageClass: IMAGE_SYM_CLASS_STATIC
+# CHECK: SectionDefinition:
+# CHECK: Length: 64
+# CHECK: NumberOfRelocations: 0
+# CHECK: NumberOfLinenumbers: 0
+# CHECK: CheckSum: 0
+# CHECK: Number: 0
+# CHECK: ...
diff --git a/test/Other/cgscc-libcall-update.ll b/test/Other/cgscc-libcall-update.ll
new file mode 100644
index 000000000000..e0833ca09266
--- /dev/null
+++ b/test/Other/cgscc-libcall-update.ll
@@ -0,0 +1,61 @@
+; Make sure that the CGSCC pass manager can handle instcombine simplifying one
+; libcall into an unrelated libcall, and that it updates the call graph.
+;
+; Also check that it can handle inlining *removing* a libcall entirely.
+;
+; RUN: opt -passes='cgscc(inline,function(instcombine))' -S < %s | FileCheck %s
+
+define i8* @wibble(i8* %arg1, i8* %arg2) {
+; CHECK-LABEL: define i8* @wibble(
+bb:
+ %tmp = alloca [1024 x i8], align 16
+ %tmp2 = getelementptr inbounds [1024 x i8], [1024 x i8]* %tmp, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* %arg1, i64 1024, i32 0, i1 false)
+; CHECK: call void @llvm.memcpy
+ %tmp3 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 false, i1 true)
+ %tmp4 = call i8* @__strncpy_chk(i8* %arg2, i8* %tmp2, i64 1023, i64 %tmp3)
+; CHECK-NOT: call
+; CHECK: call i8* @strncpy(i8* %arg2, i8* %tmp2, i64 1023)
+; CHECK-NOT: call
+
+ ret i8* %tmp4
+; CHECK: ret
+}
+
+define i8* @strncpy(i8* %arg1, i8* %arg2, i64 %size) noinline {
+bb:
+ %result = call i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size)
+ ret i8* %result
+}
+
+declare i8* @my_special_strncpy(i8* %arg1, i8* %arg2, i64 %size)
+
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1)
+
+declare i8* @__strncpy_chk(i8*, i8*, i64, i64)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+; Check that, even when we completely remove a libcall, we don't get the call
+; graph wrong once we handle libcalls specially in the call graph to address
+; the above case.
+define i32 @hoge(i32* %arg1) {
+; CHECK-LABEL: define i32 @hoge(
+bb:
+ %tmp41 = load i32*, i32** null
+ %tmp6 = load i32, i32* %arg1
+ %tmp7 = call i32 @ntohl(i32 %tmp6)
+; CHECK-NOT: call i32 @ntohl
+ ret i32 %tmp7
+; CHECK: ret i32
+}
+
+; Even though this function is not used, it should be retained, as it may be
+; used when doing further libcall transformations.
+define internal i32 @ntohl(i32 %x) { +; CHECK-LABEL: define internal i32 @ntohl( +entry: + %and2 = lshr i32 %x, 8 + %shr = and i32 %and2, 65280 + ret i32 %shr +} diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll index bf8e596d118b..35f596e77988 100644 --- a/test/Other/new-pass-manager.ll +++ b/test/Other/new-pass-manager.ll @@ -23,6 +23,7 @@ ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-CGSCC-PASS-NEXT: Running pass: NoOpCGSCCPass @@ -407,6 +408,7 @@ ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(CGSCCAnalysisManager|AnalysisManager<.*LazyCallGraph::SCC.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*(FunctionAnalysisManager|AnalysisManager<.*Function.*>).*}},{{.*}}Module> ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-REPEAT-CGSCC-PASS-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running an SCC pass across the RefSCC: [(foo)] ; CHECK-REPEAT-CGSCC-PASS-NEXT: Starting CGSCC pass manager run ; CHECK-REPEAT-CGSCC-PASS-NEXT: Running pass: RepeatedPass diff --git a/test/ThinLTO/X86/debuginfo-cu-import.ll b/test/ThinLTO/X86/debuginfo-cu-import.ll index 42a751191860..e0b066c736e4 100644 --- a/test/ThinLTO/X86/debuginfo-cu-import.ll +++ b/test/ThinLTO/X86/debuginfo-cu-import.ll @@ -51,11 +51,11 @@ entry: !9 = !DIGlobalVariableExpression(var: !10) !10 = !DIGlobalVariable(name: "version", scope: !4, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true) !11 = !{!12, !16} -!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, line: 8) +!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !4, entity: !13, file: !1, line: 8) !13 = distinct !DISubprogram(name: "a", linkageName: "_ZN1A1aEv", scope: !4, file: !1, line: 7, type: !14, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !14 = !DISubroutineType(types: !15) !15 = !{null} -!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, line: 8) +!16 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !17, entity: !19, file: !1, line: 8) !17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 9, column: 8) !18 = distinct !DISubprogram(name: "c", linkageName: "_ZN1A1cEv", scope: !4, file: !1, line: 9, type: !14, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !5) !19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 10, column: 8) diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 4b9e7c3956f5..1dfc08761965 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -23,9 +23,63 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { 
-; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp3( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X32-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X32-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp3( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call @@ -50,27 +104,225 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp5( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] 
+; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X32-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X32-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp5( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = zext i8 [[TMP14]] to i32 +; X64-NEXT: [[TMP17:%.*]] = sub i32 [[TMP15]], [[TMP16]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP17]], [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp6( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: 
[[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp6( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] = zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp7( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 
[[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2 +; X32-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i32 +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]] +; X32-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X32-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X32-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp7( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP6]], [[TMP7]] +; X64-NEXT: br i1 [[TMP8]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP6]], [[LOADBB:%.*]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP7]], [[LOADBB]] ], [ [[TMP20:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP9:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2 +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 2 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP16]]) +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i64 +; X64-NEXT: [[TMP20]] 
= zext i16 [[TMP18]] to i64 +; X64-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]] +; X64-NEXT: br i1 [[TMP21]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; X64-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = zext i8 [[TMP24]] to i32 +; X64-NEXT: [[TMP27:%.*]] = zext i8 [[TMP25]] to i32 +; X64-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP27]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP28]], [[LOADBB2]] ], [ [[TMP10]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call @@ -78,8 +330,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: ret i32 [[CALL]] +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] ; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* @@ -99,72 +378,691 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 
@llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP18]] +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = zext i8 [[TMP20]] to i32 +; X32-NEXT: [[TMP23:%.*]] = zext i8 [[TMP21]] to i32 +; X32-NEXT: [[TMP24:%.*]] = sub i32 [[TMP22]], [[TMP23]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP24]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP9]] +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]] +; X64-NEXT: [[TMP13:%.*]] = zext i8 [[TMP11]] to i32 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) ret i32 %call } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: 
[[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load 
i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) ret i32 %call } define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP26:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP27:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i16, i16* [[TMP18]], i16 4 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 4 +; X32-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP22]]) +; X32-NEXT: [[TMP25:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP23]]) +; X32-NEXT: [[TMP26]] = zext i16 [[TMP24]] to i32 +; X32-NEXT: [[TMP27]] = zext i16 [[TMP25]] to i32 +; X32-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]] +; X32-NEXT: br i1 [[TMP28]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP29:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i8, i8* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = zext i8 [[TMP31]] to i32 +; 
X32-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; X32-NEXT: [[TMP35:%.*]] = sub i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP35]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4 +; X64-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i16 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) ret i32 %call } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: 
[[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) 
ret i32 %call } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP28:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP27]] +; X32-NEXT: [[TMP30:%.*]] = load i8, i8* [[TMP28]] +; X32-NEXT: [[TMP31:%.*]] = zext i8 [[TMP29]] to i32 +; X32-NEXT: [[TMP32:%.*]] = zext i8 [[TMP30]] to i32 +; X32-NEXT: [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP32]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP33]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; 
X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2:%.*]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP21:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP21]] +; X64-NEXT: [[TMP24:%.*]] = zext i8 [[TMP22]] to i32 +; X64-NEXT: [[TMP25:%.*]] = zext i8 [[TMP23]] to i32 +; X64-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP25]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP26]], [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) ret i32 %call } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP35:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP36:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 
@llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i16, i16* [[TMP27]], i16 6 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i16, i16* [[TMP28]], i16 6 +; X32-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP30]] +; X32-NEXT: [[TMP33:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP31]]) +; X32-NEXT: [[TMP34:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP32]]) +; X32-NEXT: [[TMP35]] = zext i16 [[TMP33]] to i32 +; X32-NEXT: [[TMP36]] = zext i16 [[TMP34]] to i32 +; X32-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP35]], [[TMP36]] +; X32-NEXT: br i1 [[TMP37]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* 
+; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: [[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) ret i32 %call } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ], [ [[TMP28:%.*]], [[LOADBB2:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ], [ [[TMP29:%.*]], [[LOADBB2]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2 +; X64-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = zext i32 [[TMP15]] to i64 +; X64-NEXT: [[TMP18]] = zext i32 [[TMP16]] to i64 +; X64-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]] +; X64-NEXT: br i1 [[TMP19]], label [[LOADBB2]], label [[RES_BLOCK]] +; X64: loadbb2: +; X64-NEXT: [[TMP20:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP21:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X64-NEXT: [[TMP23:%.*]] = getelementptr i16, i16* [[TMP21]], i16 6 +; X64-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X64-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP23]] +; X64-NEXT: [[TMP26:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP24]]) +; X64-NEXT: [[TMP27:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP25]]) +; X64-NEXT: [[TMP28]] = zext i16 [[TMP26]] to i64 +; X64-NEXT: 
[[TMP29]] = zext i16 [[TMP27]] to i64 +; X64-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP28]], [[TMP29]] +; X64-NEXT: br i1 [[TMP30]], label [[LOADBB3:%.*]], label [[RES_BLOCK]] +; X64: loadbb3: +; X64-NEXT: [[TMP31:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP32:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP33:%.*]] = load i8, i8* [[TMP31]] +; X64-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP32]] +; X64-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; X64-NEXT: [[TMP36:%.*]] = zext i8 [[TMP34]] to i32 +; X64-NEXT: [[TMP37:%.*]] = sub i32 [[TMP35]], [[TMP36]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP37]], [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) ret i32 %call } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X32-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ], [ [[TMP24:%.*]], [[LOADBB2:%.*]] ], [ [[TMP33:%.*]], [[LOADBB3:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ], [ [[TMP25:%.*]], [[LOADBB2]] ], [ [[TMP34:%.*]], [[LOADBB3]] ] +; X32-NEXT: [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1 +; X32-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1 +; X32-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]] +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] +; X32-NEXT: [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]]) +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]] +; X32-NEXT: br i1 [[TMP17]], label [[LOADBB2]], label [[RES_BLOCK]] +; X32: loadbb2: +; X32-NEXT: [[TMP18:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[TMP18]], i32 2 +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 2 +; X32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24]] = call i32 @llvm.bswap.i32(i32 [[TMP22]]) +; X32-NEXT: [[TMP25]] = call i32 @llvm.bswap.i32(i32 [[TMP23]]) +; X32-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], [[TMP25]] +; X32-NEXT: br i1 [[TMP26]], label [[LOADBB3]], label [[RES_BLOCK]] +; X32: loadbb3: +; X32-NEXT: [[TMP27:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP28:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP29:%.*]] = getelementptr i32, 
i32* [[TMP27]], i32 3 +; X32-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP28]], i32 3 +; X32-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP29]] +; X32-NEXT: [[TMP32:%.*]] = load i32, i32* [[TMP30]] +; X32-NEXT: [[TMP33]] = call i32 @llvm.bswap.i32(i32 [[TMP31]]) +; X32-NEXT: [[TMP34]] = call i32 @llvm.bswap.i32(i32 [[TMP32]]) +; X32-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], [[TMP34]] +; X32-NEXT: br i1 [[TMP35]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]] +; X64-NEXT: br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP9:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP10:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1 +; X64-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1 +; X64-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP11]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] +; X64-NEXT: [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]]) +; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]] +; X64-NEXT: br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) ret i32 %call @@ -190,8 +1088,25 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi 
i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -221,8 +1136,25 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4 +; ALL-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4 +; ALL-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; ALL-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; ALL-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; ALL-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -234,8 +1166,27 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -247,8 +1198,34 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: loadbb: +; ALL-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = load 
i32, i32* [[TMP0]] +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; ALL-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; ALL: res_block: +; ALL-NEXT: br label [[ENDBLOCK:%.*]] +; ALL: loadbb1: +; ALL-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; ALL-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; ALL-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2 +; ALL-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2 +; ALL-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; ALL-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; ALL-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; ALL: loadbb2: +; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 6 +; ALL-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 6 +; ALL-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; ALL-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; ALL-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; ALL-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; ALL: endblock: +; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -260,8 +1237,27 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 ; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; X32-NEXT: ret i32 [[CONV]] ; @@ -283,11 +1279,60 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq9( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* 
[[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X32-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X32-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X32-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X32-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq9( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8 +; X64-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8 +; X64-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]] +; X64-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) %cmp = icmp eq i32 %call, 0 @@ -296,11 +1341,64 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq10( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: 
[[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq10( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) %cmp = icmp eq i32 %call, 0 @@ -309,11 +1407,78 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq11( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* 
[[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 4 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 4 +; X32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq11( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4 +; X64-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 10 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 10 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) %cmp = icmp eq i32 %call, 0 @@ -322,11 +1487,64 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq12( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* 
[[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq12( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) %cmp = icmp eq i32 %call, 0 @@ -335,11 +1553,78 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq13( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label 
[[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X32-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X32-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X32-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X32-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X32-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq13( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[X]], i8 12 +; X64-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[Y]], i8 12 +; X64-NEXT: [[TMP14:%.*]] = load i8, i8* [[TMP12]] +; X64-NEXT: [[TMP15:%.*]] = load i8, i8* [[TMP13]] +; X64-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP14]], [[TMP15]] +; X64-NEXT: br i1 [[TMP16]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) %cmp = icmp eq i32 %call, 0 @@ -348,11 +1633,82 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly 
%y) { -; ALL-LABEL: @cmp_eq14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq14( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i16* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i16* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i16, i16* [[TMP19]], i16 6 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i16, i16* [[TMP20]], i16 6 +; X32-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i16 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq14( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; 
X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) %cmp = icmp eq i32 %call, 0 @@ -361,11 +1717,52 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq15( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2 +; X64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X64: loadbb2: +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i16* +; X64-NEXT: [[TMP14:%.*]] = getelementptr i16, i16* [[TMP12]], i16 6 +; X64-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[TMP13]], i16 6 +; X64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP14]] +; X64-NEXT: [[TMP17:%.*]] = load i16, i16* [[TMP15]] +; X64-NEXT: [[TMP18:%.*]] = icmp ne i16 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X64: loadbb3: +; X64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[X]], i8 14 +; X64-NEXT: [[TMP20:%.*]] = getelementptr i8, i8* [[Y]], i8 14 +; X64-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP19]] +; X64-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP20]] +; X64-NEXT: [[TMP23:%.*]] = icmp ne i8 [[TMP21]], [[TMP22]] +; X64-NEXT: br i1 [[TMP23]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) %cmp = icmp eq i32 %call, 0 @@ -374,11 +1771,73 @@ 
define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq16( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]] +; X32-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X32: res_block: +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1 +; X32-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1 +; X32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]] +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; X32-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]] +; X32-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[LOADBB2:%.*]] +; X32: loadbb2: +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i32 2 +; X32-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP13]], i32 2 +; X32-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP14]] +; X32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP15]] +; X32-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[RES_BLOCK]], label [[LOADBB3:%.*]] +; X32: loadbb3: +; X32-NEXT: [[TMP19:%.*]] = bitcast i8* [[X]] to i32* +; X32-NEXT: [[TMP20:%.*]] = bitcast i8* [[Y]] to i32* +; X32-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[TMP19]], i32 3 +; X32-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[TMP20]], i32 3 +; X32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP21]] +; X32-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP22]] +; X32-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP23]], [[TMP24]] +; X32-NEXT: br i1 [[TMP25]], label [[RES_BLOCK]], label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ 1, [[RES_BLOCK]] ] +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq16( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]] +; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]] +; X64: res_block: +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64* +; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64* +; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1 +; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1 +; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]] +; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]] +; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]] +; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], 
label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ] +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) %cmp = icmp eq i32 %call, 0 diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll index b6b775797826..088b177c2e11 100644 --- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll +++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll @@ -194,7 +194,6 @@ rare.2: br label %fallthrough } - declare void @slowpath(i32, i32*) ; Make sure we don't end up in an infinite loop after we fail to sink. @@ -218,3 +217,37 @@ load.i145: pl_loop.i.i122: br label %pl_loop.i.i122 } + +; Make sure we can sink address computation even +; if there is a cycle in phi nodes. +define void @test9(i1 %cond, i64* %base) { +; CHECK-LABEL: @test9 +entry: + %addr = getelementptr inbounds i64, i64* %base, i64 5 + %casted = bitcast i64* %addr to i32* + br label %header + +header: + %iv = phi i32 [0, %entry], [%iv.inc, %backedge] + %casted.loop = phi i32* [%casted, %entry], [%casted.merged, %backedge] + br i1 %cond, label %if.then, label %backedge + +if.then: + call void @foo(i32 %iv) + %addr.1 = getelementptr inbounds i64, i64* %base, i64 5 + %casted.1 = bitcast i64* %addr.1 to i32* + br label %backedge + +backedge: +; CHECK-LABEL: backedge: +; CHECK: getelementptr i8, {{.+}} 40 + %casted.merged = phi i32* [%casted.loop, %header], [%casted.1, %if.then] + %v = load i32, i32* %casted.merged, align 4 + call void @foo(i32 %v) + %iv.inc = add i32 %iv, 1 + %cmp = icmp slt i32 %iv.inc, 1000 + br i1 %cmp, label %header, label %exit + +exit: + ret void +} diff --git a/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll new file mode 100644 index 000000000000..57dbdd883190 --- /dev/null +++ b/test/Transforms/EarlyCSE/globalsaa-memoryssa.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -globals-aa -early-cse-memssa | FileCheck %s + +define i16 @f1() readonly { + ret i16 0 +} + +declare void @f2() + +; Check that EarlyCSE correctly handles function calls that don't have +; a MemoryAccess. In this case the calls to @f1 have no +; MemoryAccesses since globals-aa determines that @f1 doesn't +; read/write memory at all. + +define void @f3() { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[CALL1:%.*]] = call i16 @f1() +; CHECK-NEXT: call void @f2() +; CHECK-NEXT: ret void +; + %call1 = call i16 @f1() + call void @f2() + %call2 = call i16 @f1() + ret void +} diff --git a/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll new file mode 100644 index 000000000000..513379d0bd01 --- /dev/null +++ b/test/Transforms/GVN/PRE/2017-06-28-pre-load-dbgloc.ll @@ -0,0 +1,79 @@ +; This test checks if debug loc is propagated to load/store created by GVN/Instcombine. +; RUN: opt < %s -gvn -S | FileCheck %s --check-prefixes=ALL,GVN +; RUN: opt < %s -gvn -instcombine -S | FileCheck %s --check-prefixes=ALL,INSTCOMBINE + +; struct node { +; int *v; +; struct desc *descs; +; }; + +; struct desc { +; struct node *node; +; }; + +; extern int bar(void *v, void* n); + +; int test(struct desc *desc) +; { +; void *v, *n; +; v = !desc ? 
((void *)0) : desc->node->v; // Line 15 +; n = &desc->node->descs[0]; // Line 16 +; return bar(v, n); +; } + +; Line 16, Column 13: +; n = &desc->node->descs[0]; +; ^ + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +%struct.desc = type { %struct.node* } +%struct.node = type { i32*, %struct.desc* } + +define i32 @test(%struct.desc* readonly %desc) local_unnamed_addr #0 !dbg !4 { +entry: + %tobool = icmp eq %struct.desc* %desc, null + br i1 %tobool, label %cond.end, label %cond.false, !dbg !9 +; ALL: br i1 %tobool, label %entry.cond.end_crit_edge, label %cond.false, !dbg [[LOC_15_6:![0-9]+]] +; ALL: entry.cond.end_crit_edge: +; GVN: %.pre = load %struct.node*, %struct.node** null, align 8, !dbg [[LOC_16_13:![0-9]+]] +; INSTCOMBINE:store %struct.node* undef, %struct.node** null, align 536870912, !dbg [[LOC_16_13:![0-9]+]] + +cond.false: + %0 = bitcast %struct.desc* %desc to i8***, !dbg !11 + %1 = load i8**, i8*** %0, align 8, !dbg !11 + %2 = load i8*, i8** %1, align 8 + br label %cond.end, !dbg !9 + +cond.end: + %3 = phi i8* [ %2, %cond.false ], [ null, %entry ], !dbg !9 + %node2 = getelementptr inbounds %struct.desc, %struct.desc* %desc, i64 0, i32 0 + %4 = load %struct.node*, %struct.node** %node2, align 8, !dbg !10 + %descs = getelementptr inbounds %struct.node, %struct.node* %4, i64 0, i32 1 + %5 = bitcast %struct.desc** %descs to i8** + %6 = load i8*, i8** %5, align 8 + %call = tail call i32 @bar(i8* %3, i8* %6) + ret i32 %call +} + +declare i32 @bar(i8*, i8*) local_unnamed_addr #1 +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.c", directory: ".") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !5, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7} +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !{} +!9 = !DILocation(line: 15, column: 6, scope: !4) +!10 = !DILocation(line: 16, column: 13, scope: !4) +!11 = !DILocation(line: 15, column: 34, scope: !4) + +;ALL: [[SCOPE:![0-9]+]] = distinct !DISubprogram(name: "test",{{.*}} +;ALL: [[LOC_15_6]] = !DILocation(line: 15, column: 6, scope: [[SCOPE]]) +;ALL: [[LOC_16_13]] = !DILocation(line: 16, column: 13, scope: [[SCOPE]]) diff --git a/test/Transforms/GVN/PRE/phi-translate.ll b/test/Transforms/GVN/PRE/phi-translate.ll index 1f6c7c8d33ea..55f5fd6465b6 100644 --- a/test/Transforms/GVN/PRE/phi-translate.ll +++ b/test/Transforms/GVN/PRE/phi-translate.ll @@ -6,12 +6,12 @@ target datalayout = "e-p:64:64:64" ; CHECK: entry.end_crit_edge: ; CHECK: %[[INDEX:[a-z0-9.]+]] = sext i32 %x to i64{{$}} ; CHECK: %[[ADDRESS:[a-z0-9.]+]] = getelementptr [100 x i32], [100 x i32]* @G, i64 0, i64 %[[INDEX]]{{$}} -; CHECK: %n.pre = load i32, i32* %[[ADDRESS]]{{$}} +; CHECK: %n.pre = load i32, i32* %[[ADDRESS]], !dbg [[N_LOC:![0-9]+]] ; CHECK: br label %end ; CHECK: then: ; CHECK: store i32 %z ; CHECK: end: -; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC:![0-9]+]] +; CHECK: %n = phi i32 [ %n.pre, %entry.end_crit_edge ], [ %z, %then ], !dbg [[N_LOC]] ; CHECK: ret i32 %n ; CHECK: [[N_LOC]] = !DILocation(line: 47, column: 1, scope: !{{.*}}) diff --git a/test/Transforms/GlobalOpt/pr33686.ll 
b/test/Transforms/GlobalOpt/pr33686.ll new file mode 100644 index 000000000000..d6bb98735f4e --- /dev/null +++ b/test/Transforms/GlobalOpt/pr33686.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -globalopt %s | FileCheck %s + +@glob = external global i16, align 1 + +define void @beth() { +; CHECK-LABEL: @beth( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void + +notreachable: + %patatino = select i1 undef, i16* @glob, i16* %patatino + br label %notreachable +} diff --git a/test/Transforms/IRCE/eq_ne.ll b/test/Transforms/IRCE/eq_ne.ll new file mode 100644 index 000000000000..1b1ffe6b94ba --- /dev/null +++ b/test/Transforms/IRCE/eq_ne.ll @@ -0,0 +1,257 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
<header><exiting>,%in.bounds<latch><exiting>
+; CHECK-NOT: irce: in function test_02: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK: irce: in function test_03: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK-NOT: irce: in function test_04: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK: irce: in function test_05: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK-NOT: irce: in function test_06: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK: irce: in function test_07: constrained Loop at depth 1 containing: %loop<header><exiting>,%in.bounds<latch><exiting>
+; CHECK-NOT: irce: in function test_08: constrained Loop at depth 1 containing: %loop<header><exiting>
    ,%in.bounds + +; Show that IRCE can turn 'ne' condition to 'slt' in increasing IV. +define void @test_01(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_01 +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_02(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_02( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, -100 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'sge' in increasing IV. +define void @test_03(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_03( +; CHECK: main.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next, %in.bounds ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 [[PSEUDO_PHI]], 100 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that if n is not known to be greater than the starting value, IRCE +; doesn't apply. +define void @test_04(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_04( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, -100 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'ne' condition to 'sgt' in decreasing IV. 
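; [Editorial sketch, not part of the imported test file: the function name
; @ne_to_sgt_sketch and this code are illustrative only. The rewrite that the
; decreasing-IV tests exercise is sound because a unit-stride IV cannot step
; over its exit value: with start 100, step -1, and exit value 0, the 'ne'
; latch selects exactly the same iterations as the 'sgt' form shown below.]
define void @ne_to_sgt_sketch() {
entry:
  br label %loop
loop:
  %idx = phi i32 [ 100, %entry ], [ %idx.next, %loop ]
  %idx.next = add i32 %idx, -1
  ; the 'ne' form of this latch would be: %next = icmp ne i32 %idx.next, 0
  %next = icmp sgt i32 %idx.next, 0
  br i1 %next, label %loop, label %exit
exit:
  ret void
}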
+define void @test_05(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_05( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 0 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'ne' condition to 'sgt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_06(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_06( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp ne i32 %idx.next, 120 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE can turn 'eq' condition to 'slt' in decreasing IV. +define void @test_07(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_07( +; CHECK: preloop.exit.selector: +; CHECK-NEXT: [[PSEUDO_PHI:%[^ ]+]] = phi i32 [ %idx.next.preloop, %in.bounds.preloop ] +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 [[PSEUDO_PHI]], 0 +; CHECK-NEXT: br i1 [[COND]] + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 0 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +; Show that IRCE cannot turn 'eq' condition to 'slt' in decreasing IV if the end +; value is not proved to be less than the start value. +define void @test_08(i32* %arr, i32* %a_len_ptr) #0 { + +; CHECK: test_08( + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 100, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, -1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds, !prof !1 + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp eq i32 %idx.next, 120 + br i1 %next, label %exit, label %loop + +out.of.bounds: + ret void + +exit: + ret void +} + +!0 = !{i32 0, i32 50} +!1 = !{!"branch_weights", i32 64, i32 4} diff --git a/test/Transforms/IRCE/pre_post_loops.ll b/test/Transforms/IRCE/pre_post_loops.ll new file mode 100644 index 000000000000..2cd2e29104fe --- /dev/null +++ b/test/Transforms/IRCE/pre_post_loops.ll @@ -0,0 +1,117 @@ +; RUN: opt -verify-loop-info -irce-print-changed-loops -irce -S < %s 2>&1 | FileCheck %s + +; CHECK: irce: in function test_01: constrained Loop at depth 1 containing: %loop
<header><exiting>,%in.bounds<latch><exiting>
+; CHECK: irce: in function test_02: constrained Loop at depth 1 containing: %loop<header><exiting>
    ,%in.bounds + +; Iterate from 0 to SINT_MAX, check that the post-loop is generated. +define void @test_01(i32* %arr, i32* %a_len_ptr) { + +; CHECK: test_01( +; CHECK: entry: +; CHECK-NEXT: %exit.mainloop.at = load i32, i32* %a_len_ptr +; CHECK: loop: +; CHECK-NEXT: %idx = phi i32 [ %idx.next, %in.bounds ], [ 0, %loop.preheader ] +; CHECK-NEXT: %idx.next = add i32 %idx, 1 +; CHECK-NEXT: %abc = icmp slt i32 %idx, %exit.mainloop.at +; CHECK-NEXT: br i1 true, label %in.bounds, +; CHECK: in.bounds: +; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx +; CHECK-NEXT: store i32 0, i32* %addr +; CHECK-NEXT: %next = icmp slt i32 %idx.next, 2147483647 +; CHECK-NEXT: [[COND:%[^ ]+]] = icmp slt i32 %idx.next, %exit.mainloop.at +; CHECK-NEXT: br i1 [[COND]], label %loop, label %main.exit.selector +; CHECK: main.pseudo.exit: +; CHECK-NEXT: %idx.copy = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: %indvar.end = phi i32 [ 0, %entry ], [ %idx.next.lcssa, %main.exit.selector ] +; CHECK-NEXT: br label %postloop +; CHECK: postloop: +; CHECK-NEXT: br label %loop.postloop +; CHECK: loop.postloop: +; CHECK-NEXT: %idx.postloop = phi i32 [ %idx.copy, %postloop ], [ %idx.next.postloop, %in.bounds.postloop ] +; CHECK-NEXT: %idx.next.postloop = add i32 %idx.postloop, 1 +; CHECK-NEXT: %abc.postloop = icmp slt i32 %idx.postloop, %exit.mainloop.at +; CHECK-NEXT: br i1 %abc.postloop, label %in.bounds.postloop, label %out.of.bounds.loopexit +; CHECK: in.bounds.postloop: +; CHECK-NEXT: %addr.postloop = getelementptr i32, i32* %arr, i32 %idx.postloop +; CHECK-NEXT: store i32 0, i32* %addr.postloop +; CHECK-NEXT: %next.postloop = icmp slt i32 %idx.next.postloop, 2147483647 +; CHECK-NEXT: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit + +entry: + %len = load i32, i32* %a_len_ptr, !range !0 + br label %loop + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.next, %in.bounds ] + %idx.next = add i32 %idx, 1 + %abc = icmp slt i32 %idx, %len + br i1 %abc, label %in.bounds, label %out.of.bounds + +in.bounds: + %addr = getelementptr i32, i32* %arr, i32 %idx + store i32 0, i32* %addr + %next = icmp slt i32 %idx.next, 2147483647 + br i1 %next, label %loop, label %exit + +out.of.bounds: + ret void + +exit: + ret void +} + +; Iterate from SINT_MAX to 0, check that the pre-loop is generated. 
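; Editorial note, not part of the original test file: for this decreasing IV
; the range check '%abc = icmp slt i32 %idx, %len' fails on the earliest
; iterations (the largest %idx values) and holds on the last ones, so the
; unsafe iterations are peeled into a pre-loop; in test_01 above the failing
; iterations come last, which is why a post-loop is generated there instead.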
+define void @test_02(i32* %arr, i32* %a_len_ptr) {
+
+; CHECK: test_02(
+; CHECK: entry:
+; CHECK-NEXT: %len = load i32, i32* %a_len_ptr, !range !0
+; CHECK-NEXT: br i1 true, label %loop.preloop.preheader
+; CHECK: mainloop:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: %idx = phi i32 [ %idx.preloop.copy, %mainloop ], [ %idx.next, %in.bounds ]
+; CHECK-NEXT: %idx.next = add i32 %idx, -1
+; CHECK-NEXT: %abc = icmp slt i32 %idx, %len
+; CHECK-NEXT: br i1 true, label %in.bounds
+; CHECK: in.bounds:
+; CHECK-NEXT: %addr = getelementptr i32, i32* %arr, i32 %idx
+; CHECK-NEXT: store i32 0, i32* %addr
+; CHECK-NEXT: %next = icmp sgt i32 %idx.next, -1
+; CHECK-NEXT: br i1 %next, label %loop, label %exit.loopexit
+; CHECK: loop.preloop:
+; CHECK-NEXT: %idx.preloop = phi i32 [ %idx.next.preloop, %in.bounds.preloop ], [ 2147483647, %loop.preloop.preheader ]
+; CHECK-NEXT: %idx.next.preloop = add i32 %idx.preloop, -1
+; CHECK-NEXT: %abc.preloop = icmp slt i32 %idx.preloop, %len
+; CHECK-NEXT: br i1 %abc.preloop, label %in.bounds.preloop, label %out.of.bounds.loopexit
+; CHECK: in.bounds.preloop:
+; CHECK-NEXT: %addr.preloop = getelementptr i32, i32* %arr, i32 %idx.preloop
+; CHECK-NEXT: store i32 0, i32* %addr.preloop
+; CHECK-NEXT: %next.preloop = icmp sgt i32 %idx.next.preloop, -1
+; CHECK-NEXT: [[COND:%[^ ]+]] = icmp sgt i32 %idx.next.preloop, -1
+; CHECK-NEXT: br i1 [[COND]], label %loop.preloop, label %preloop.exit.selector
+
+entry:
+ %len = load i32, i32* %a_len_ptr, !range !0
+ br label %loop
+
+loop:
+ %idx = phi i32 [ 2147483647, %entry ], [ %idx.next, %in.bounds ]
+ %idx.next = add i32 %idx, -1
+ %abc = icmp slt i32 %idx, %len
+ br i1 %abc, label %in.bounds, label %out.of.bounds
+
+in.bounds:
+ %addr = getelementptr i32, i32* %arr, i32 %idx
+ store i32 0, i32* %addr
+ %next = icmp sgt i32 %idx.next, -1
+ br i1 %next, label %loop, label %exit
+
+out.of.bounds:
+ ret void
+
+exit:
+ ret void
+}
+
+!0 = !{i32 0, i32 50}
diff --git a/test/Transforms/Inline/AArch64/ext.ll b/test/Transforms/Inline/AArch64/ext.ll
new file mode 100644
index 000000000000..04095c04ee86
--- /dev/null
+++ b/test/Transforms/Inline/AArch64/ext.ll
@@ -0,0 +1,249 @@
+; REQUIRES: asserts
+; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define i32 @outer1(i32* %ptr, i32 %i) {
+ %C = call i32 @inner1(i32* %ptr, i32 %i)
+ ret i32 %C
+}
+
+; sext can be folded into gep.
+; CHECK: Analyzing call of inner1
+; CHECK: NumInstructionsSimplified: 3
+; CHECK: NumInstructions: 4
+define i32 @inner1(i32* %ptr, i32 %i) {
+ %E = sext i32 %i to i64
+ %G = getelementptr inbounds i32, i32* %ptr, i64 %E
+ %L = load i32, i32* %G
+ ret i32 %L
+}
+
+define i32 @outer2(i32* %ptr, i32 %i) {
+ %C = call i32 @inner2(i32* %ptr, i32 %i)
+ ret i32 %C
+}
+
+; zext from i32 to i64 is free.
+; CHECK: Analyzing call of inner2
+; CHECK: NumInstructionsSimplified: 3
+; CHECK: NumInstructions: 4
+define i32 @inner2(i32* %ptr, i32 %i) {
+ %E = zext i32 %i to i64
+ %G = getelementptr inbounds i32, i32* %ptr, i64 %E
+ %L = load i32, i32* %G
+ ret i32 %L
+}
+
+define i32 @outer3(i32* %ptr, i16 %i) {
+ %C = call i32 @inner3(i32* %ptr, i16 %i)
+ ret i32 %C
+}
+
+; zext can be folded into gep.
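; Editorial note, not part of the original test file: the counters checked in
; these tests appear to encode the assumption that an extend feeding only
; address arithmetic is free on AArch64, so the inline cost model counts the
; ext among the simplified instructions instead of charging for it.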
+; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner3(i32* %ptr, i16 %i) { + %E = zext i16 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer4(i8* %ptr) { + %C = call i16 @inner4(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer5(i8* %ptr) { + %C = call i16 @inner5(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer6(i8* %ptr) { + %C = call i32 @inner6(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer7(i8* %ptr) { + %C = call i32 @inner7(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer8(i16* %ptr) { + %C = call i32 @inner8(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner8(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer9(i16* %ptr) { + %C = call i32 @inner9(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner9(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer10(i8* %ptr) { + %C = call i64 @inner10(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer11(i8* %ptr) { + %C = call i64 @inner11(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer12(i16* %ptr) { + %C = call i64 @inner12(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer13(i16* %ptr) { + %C = call i64 @inner13(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer14(i32* %ptr) { + %C = call i64 @inner14(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
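; Editorial note, not part of the original test file: 'ExtLoad' appears to
; mean that the load/ext pair matches a single extending load on the target,
; so the cost model treats the ext as simplified away; that is the pattern
; all of the remaining inner/outer pairs in these files exercise.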
+; CHECK: Analyzing call of inner14 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner14(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer15(i32* %ptr) { + %C = call i64 @inner15(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner15 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner15(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} + +define i64 @outer16(i32 %V1, i64 %V2) { + %C = call i64 @inner16(i32 %V1, i64 %V2) + ret i64 %C +} + +; sext can be folded into shl. +; CHECK: Analyzing call of inner16 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 4 +define i64 @inner16(i32 %V1, i64 %V2) { + %E = sext i32 %V1 to i64 + %S = shl i64 %E, 3 + %A = add i64 %V2, %S + ret i64 %A +} diff --git a/test/Transforms/Inline/PowerPC/ext.ll b/test/Transforms/Inline/PowerPC/ext.ll new file mode 100644 index 000000000000..f7a409467b2c --- /dev/null +++ b/test/Transforms/Inline/PowerPC/ext.ll @@ -0,0 +1,140 @@ +; REQUIRES: asserts +; RUN: opt -inline -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64le-ibm-linux-gnu" + +define i16 @outer1(i8* %ptr) { + %C = call i16 @inner1(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner1(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i32 @outer2(i8* %ptr) { + %C = call i32 @inner2(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer3(i16* %ptr) { + %C = call i32 @inner3(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner3(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer4(i16* %ptr) { + %C = call i32 @inner4(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer5(i8* %ptr) { + %C = call i64 @inner5(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer6(i16* %ptr) { + %C = call i64 @inner6(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer7(i16* %ptr) { + %C = call i64 @inner7(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer8(i32* %ptr) { + %C = call i64 @inner8(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer9(i32* %ptr) { + %C = call i64 @inner9(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/Inline/PowerPC/lit.local.cfg b/test/Transforms/Inline/PowerPC/lit.local.cfg new file mode 100644 index 000000000000..5d33887ff0a4 --- /dev/null +++ b/test/Transforms/Inline/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/Inline/X86/ext.ll b/test/Transforms/Inline/X86/ext.ll new file mode 100644 index 000000000000..bffda3852799 --- /dev/null +++ b/test/Transforms/Inline/X86/ext.ll @@ -0,0 +1,201 @@ +; REQUIRES: asserts +; RUN: opt -inline -mtriple=x86_64-unknown-unknown -S -debug-only=inline-cost < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +define i32 @outer1(i32* %ptr, i32 %i) { + %C = call i32 @inner1(i32* %ptr, i32 %i) + ret i32 %C +} + +; zext from i32 to i64 is free. +; CHECK: Analyzing call of inner1 +; CHECK: NumInstructionsSimplified: 3 +; CHECK: NumInstructions: 4 +define i32 @inner1(i32* %ptr, i32 %i) { + %E = zext i32 %i to i64 + %G = getelementptr inbounds i32, i32* %ptr, i64 %E + %L = load i32, i32* %G + ret i32 %L +} + +define i16 @outer2(i8* %ptr) { + %C = call i16 @inner2(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner2 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner2(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i16 + ret i16 %E +} + +define i16 @outer3(i8* %ptr) { + %C = call i16 @inner3(i8* %ptr) + ret i16 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i16 @inner3(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i16 + ret i16 %E +} + +define i32 @outer4(i8* %ptr) { + %C = call i32 @inner4(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner4 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner4(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i32 + ret i32 %E +} + +define i32 @outer5(i8* %ptr) { + %C = call i32 @inner5(i8* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner5 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner5(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i32 + ret i32 %E +} + +define i32 @outer6(i16* %ptr) { + %C = call i32 @inner6(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner6 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner6(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i32 + ret i32 %E +} + +define i32 @outer7(i16* %ptr) { + %C = call i32 @inner7(i16* %ptr) + ret i32 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner7 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i32 @inner7(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i32 + ret i32 %E +} + +define i64 @outer8(i8* %ptr) { + %C = call i64 @inner8(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner8 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner8(i8* %ptr) { + %L = load i8, i8* %ptr + %E = zext i8 %L to i64 + ret i64 %E +} + +define i64 @outer9(i8* %ptr) { + %C = call i64 @inner9(i8* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner9 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner9(i8* %ptr) { + %L = load i8, i8* %ptr + %E = sext i8 %L to i64 + ret i64 %E +} + +define i64 @outer10(i16* %ptr) { + %C = call i64 @inner10(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner10 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner10(i16* %ptr) { + %L = load i16, i16* %ptr + %E = zext i16 %L to i64 + ret i64 %E +} + +define i64 @outer11(i16* %ptr) { + %C = call i64 @inner11(i16* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner11 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner11(i16* %ptr) { + %L = load i16, i16* %ptr + %E = sext i16 %L to i64 + ret i64 %E +} + +define i64 @outer12(i32* %ptr) { + %C = call i64 @inner12(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. +; CHECK: Analyzing call of inner12 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner12(i32* %ptr) { + %L = load i32, i32* %ptr + %E = zext i32 %L to i64 + ret i64 %E +} + +define i64 @outer13(i32* %ptr) { + %C = call i64 @inner13(i32* %ptr) + ret i64 %C +} + +; It is an ExtLoad. 
+; CHECK: Analyzing call of inner13 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 3 +define i64 @inner13(i32* %ptr) { + %L = load i32, i32* %ptr + %E = sext i32 %L to i64 + ret i64 %E +} diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll index 3c4e08b5b515..905357817509 100644 --- a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll +++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll @@ -1,7 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; CHECK: llvm.umul.with.overflow define i32 @sterix(i32, i8, i64) { +; CHECK-LABEL: @sterix( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP0:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[TMP1:%.*]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV1]], 1945964878 +; CHECK-NEXT: [[SH_PROM:%.*]] = trunc i64 [[TMP2:%.*]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]] +; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[SHR]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]] +; CHECK-NEXT: [[CONV6:%.*]] = and i64 [[MUL3]], 4294967295 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[CONV6]], [[MUL3]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] +; CHECK: lor.rhs: +; CHECK-NEXT: [[AND:%.*]] = and i64 [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[AND]] to i32 +; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp eq i32 [[CONV4]], 0 +; CHECK-NEXT: [[PHITMP:%.*]] = zext i1 [[TOBOOL7]] to i32 +; CHECK-NEXT: br label [[LOR_END]] +; CHECK: lor.end: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHITMP]], [[LOR_RHS]] ] +; CHECK-NEXT: ret i32 [[TMP3]] +; entry: %conv = zext i32 %0 to i64 %conv1 = sext i8 %1 to i32 diff --git a/test/Transforms/InstCombine/and-not-or.ll b/test/Transforms/InstCombine/and-not-or.ll deleted file mode 100644 index a42140be2805..000000000000 --- a/test/Transforms/InstCombine/and-not-or.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: opt < %s -instcombine -S | grep "and i32 %x, %y" | count 4 -; RUN: opt < %s -instcombine -S | not grep "or" - -define i32 @func1(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func2(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %o, %y - ret i32 %a -} - -define i32 @func3(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %n, %x - %a = and i32 %y, %o - ret i32 %a -} - -define i32 @func4(i32 %x, i32 %y) nounwind { -entry: - %n = xor i32 %y, -1 - %o = or i32 %x, %n - %a = and i32 %y, %o - ret i32 %a -} diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index 7bb9b95b3179..c12662d4db0e 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -628,3 +628,195 @@ define i32 @test43(i32 %a, i32 %c, i32 %d) { %and = and i32 %or, %xor ret i32 %and } + +; (~y | x) & y -> x & y +define i32 @test44(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test44( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %n, %x + %a = and i32 %o, %y + ret i32 %a +} + +; (x | ~y) & y -> x & y +define i32 @test45(i32 %x, i32 %y) nounwind { +; CHECK-LABEL: @test45( +; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i32 [[A]] +; + %n = xor i32 %y, -1 + %o = or i32 %x, 
%n
+ %a = and i32 %o, %y
+ ret i32 %a
+}
+
+; y & (~y | x) -> y & x
+define i32 @test46(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @test46(
+; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i32 [[A]]
+;
+ %n = xor i32 %y, -1
+ %o = or i32 %n, %x
+ %a = and i32 %y, %o
+ ret i32 %a
+}
+
+; y & (x | ~y) -> y & x
+define i32 @test47(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @test47(
+; CHECK-NEXT: [[A:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i32 [[A]]
+;
+ %n = xor i32 %y, -1
+ %o = or i32 %x, %n
+ %a = and i32 %y, %o
+ ret i32 %a
+}
+
+; In the next 4 tests, vary the types and predicates for extra coverage.
+; (X & (Y | ~X)) -> (X & Y), where 'not' is an inverted cmp
+
+define i1 @and_orn_cmp_1(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @and_orn_cmp_1(
+; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+ %x = icmp sgt i32 %a, %b
+ %x_inv = icmp sle i32 %a, %b
+ %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering
+ %or = or i1 %y, %x_inv
+ %and = and i1 %x, %or
+ ret i1 %and
+}
+
+; Commute the 'and':
+; ((Y | ~X) & X) -> (X & Y), where 'not' is an inverted cmp
+
+define <2 x i1> @and_orn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: @and_orn_cmp_2(
+; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], <i32 42, i32 42>
+; CHECK-NEXT: [[AND:%.*]] = and <2 x i1> [[Y]], [[X]]
+; CHECK-NEXT: ret <2 x i1> [[AND]]
+;
+ %x = icmp sge <2 x i32> %a, %b
+ %x_inv = icmp slt <2 x i32> %a, %b
+ %y = icmp ugt <2 x i32> %c, <i32 42, i32 42> ; thwart complexity-based ordering
+ %or = or <2 x i1> %y, %x_inv
+ %and = and <2 x i1> %or, %x
+ ret <2 x i1> %and
+}
+
+; Commute the 'or':
+; (X & (~X | Y)) -> (X & Y), where 'not' is an inverted cmp
+
+define i1 @and_orn_cmp_3(i72 %a, i72 %b, i72 %c) {
+; CHECK-LABEL: @and_orn_cmp_3(
+; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+ %x = icmp ugt i72 %a, %b
+ %x_inv = icmp ule i72 %a, %b
+ %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering
+ %or = or i1 %x_inv, %y
+ %and = and i1 %x, %or
+ ret i1 %and
+}
+
+; Commute the 'and':
+; ((~X | Y) & X) -> (X & Y), where 'not' is an inverted cmp
+
+define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) {
+; CHECK-LABEL: @or_andn_cmp_4(
+; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], <i32 42, i32 42, i32 42>
+; CHECK-NEXT: [[AND:%.*]] = and <3 x i1> [[Y]], [[X]]
+; CHECK-NEXT: ret <3 x i1> [[AND]]
+;
+ %x = icmp eq <3 x i32> %a, %b
+ %x_inv = icmp ne <3 x i32> %a, %b
+ %y = icmp ugt <3 x i32> %c, <i32 42, i32 42, i32 42> ; thwart complexity-based ordering
+ %or = or <3 x i1> %x_inv, %y
+ %and = and <3 x i1> %or, %x
+ ret <3 x i1> %and
+}
+
+; In the next 4 tests, vary the types and predicates for extra coverage.
+; (~X & (Y | X)) -> (~X & Y), where 'not' is an inverted cmp
+
+define i1 @andn_or_cmp_1(i37 %a, i37 %b, i37 %c) {
+; CHECK-LABEL: @andn_or_cmp_1(
+; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[X_INV]], [[Y]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+ %x = icmp sgt i37 %a, %b
+ %x_inv = icmp sle i37 %a, %b
+ %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering
+ %or = or i1 %y, %x
+ %and = and i1 %x_inv, %or
+ ret i1 %and
+}
+
+; Commute the 'and':
+; ((Y | X) & ~X) -> (~X & Y), where 'not' is an inverted cmp
+
+define i1 @andn_or_cmp_2(i16 %a, i16 %b, i16 %c) {
+; CHECK-LABEL: @andn_or_cmp_2(
+; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+ %x = icmp sge i16 %a, %b
+ %x_inv = icmp slt i16 %a, %b
+ %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering
+ %or = or i1 %y, %x
+ %and = and i1 %or, %x_inv
+ ret i1 %and
+}
+
+; Commute the 'or':
+; (~X & (X | Y)) -> (~X & Y), where 'not' is an inverted cmp
+
+define <4 x i1> @andn_or_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: @andn_or_cmp_3(
+; CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i1> [[X_INV]], [[Y]]
+; CHECK-NEXT: ret <4 x i1> [[AND]]
+;
+ %x = icmp ugt <4 x i32> %a, %b
+ %x_inv = icmp ule <4 x i32> %a, %b
+ %y = icmp ugt <4 x i32> %c, <i32 42, i32 42, i32 42, i32 42> ; thwart complexity-based ordering
+ %or = or <4 x i1> %x, %y
+ %and = and <4 x i1> %x_inv, %or
+ ret <4 x i1> %and
+}
+
+; Commute the 'and':
+; ((X | Y) & ~X) -> (~X & Y), where 'not' is an inverted cmp
+
+define i1 @andn_or_cmp_4(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @andn_or_cmp_4(
+; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[Y]], [[X_INV]]
+; CHECK-NEXT: ret i1 [[AND]]
+;
+ %x = icmp eq i32 %a, %b
+ %x_inv = icmp ne i32 %a, %b
+ %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering
+ %or = or i1 %x, %y
+ %and = and i1 %or, %x_inv
+ ret i1 %and
+}
diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll
index 001ac58891e4..15772d158f62 100644
--- a/test/Transforms/InstCombine/and2.ll
+++ b/test/Transforms/InstCombine/and2.ll
@@ -98,8 +98,7 @@ define i64 @test9(i64 %x) {
 ; combine -x & 1 into x & 1
 define <2 x i64> @test9vec(<2 x i64> %x) {
 ; CHECK-LABEL: @test9vec(
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i64> zeroinitializer, [[X:%.*]]
-; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> [[SUB]], <i64 1, i64 1>
+; CHECK-NEXT: [[AND:%.*]] = and <2 x i64> %x, <i64 1, i64 1>
 ; CHECK-NEXT: ret <2 x i64> [[AND]]
 ;
 %sub = sub nsw <2 x i64> <i64 0, i64 0>, %x
@@ -119,6 +118,88 @@ define i64 @test10(i64 %x) {
 ret i64 %add
 }
+
+; (1 << x) & 1 --> zext(x == 0)
+
+define i8 @and1_shl1_is_cmp_eq_0(i8 %x) {
+; CHECK-LABEL: @and1_shl1_is_cmp_eq_0(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0
+; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8
+; CHECK-NEXT: ret i8 [[AND]]
+;
+ %sh = shl i8 1, %x
+ %and = and i8 %sh, 1
+ ret i8 %and
+}
+
+; Don't do it if the shift has another use.
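; Editorial note, not part of the original test file: the multiuse tests below
; appear to check a one-use restriction; replacing the 'and' with icmp+zext
; only shrinks the code when the shift itself becomes dead, so when the shift
; has another user the fold would add instructions and is skipped.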
+
+define i8 @and1_shl1_is_cmp_eq_0_multiuse(i8 %x) {
+; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_multiuse(
+; CHECK-NEXT: [[SH:%.*]] = shl i8 1, %x
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %sh = shl i8 1, %x
+ %and = and i8 %sh, 1
+ %add = add i8 %sh, %and
+ ret i8 %add
+}
+
+; (1 << x) & 1 --> zext(x == 0)
+
+define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) {
+; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer
+; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
+; CHECK-NEXT: ret <2 x i8> [[AND]]
+;
+ %sh = shl <2 x i8> <i8 1, i8 1>, %x
+ %and = and <2 x i8> %sh, <i8 1, i8 1>
+ ret <2 x i8> %and
+}
+
+; (1 >> x) & 1 --> zext(x == 0)
+
+define i8 @and1_lshr1_is_cmp_eq_0(i8 %x) {
+; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %x, 0
+; CHECK-NEXT: [[AND:%.*]] = zext i1 [[TMP1]] to i8
+; CHECK-NEXT: ret i8 [[AND]]
+;
+ %sh = lshr i8 1, %x
+ %and = and i8 %sh, 1
+ ret i8 %and
+}
+
+; Don't do it if the shift has another use.
+
+define i8 @and1_lshr1_is_cmp_eq_0_multiuse(i8 %x) {
+; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_multiuse(
+; CHECK-NEXT: [[SH:%.*]] = lshr i8 1, %x
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[SH]], 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SH]], [[AND]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %sh = lshr i8 1, %x
+ %and = and i8 %sh, 1
+ %add = add i8 %sh, %and
+ ret i8 %add
+}
+
+; (1 >> x) & 1 --> zext(x == 0)
+
+define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) {
+; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> %x, zeroinitializer
+; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
+; CHECK-NEXT: ret <2 x i8> [[AND]]
+;
+ %sh = lshr <2 x i8> <i8 1, i8 1>, %x
+ %and = and <2 x i8> %sh, <i8 1, i8 1>
+ ret <2 x i8> %and
+}
+
 ; The add in this test is unnecessary because the LSBs of the LHS are 0 and the 'and' only consumes bits from those LSBs. It doesn't matter what happens to the upper bits.
 define i32 @test11(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test11(
diff --git a/test/Transforms/InstCombine/element-atomic-memintrins.ll b/test/Transforms/InstCombine/element-atomic-memintrins.ll
new file mode 100644
index 000000000000..2e3bfd7b721d
--- /dev/null
+++ b/test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -0,0 +1,98 @@
+;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] intrinsics have
+;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to
+;; verify that InstCombine handles these intrinsics properly once they have been
+;; added to that class hierarchy.
+ +; RUN: opt -instcombine -S < %s | FileCheck %s + +;; ---- memset ----- + +; Ensure 0-length memset isn't removed +define void @test_memset_zero_length(i8* %dest) { + ; CHECK-LABEL: test_memset_zero_length + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1) + ret void +} + +; Ensure that small-sized memsets don't convert to stores +define void @test_memset_to_store(i8* %dest) { + ; CHECK-LABEL: test_memset_to_store + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1) + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1) + ret void +} + +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly + + +;; ========================================= +;; ----- memmove ------ + +; memmove from a global constant source does not become memcpy +@gconst = constant [8 x i8] c"0123456\00" +define void @test_memmove_to_memcpy(i8* %dest) { + ; CHECK-LABEL: test_memmove_to_memcpy + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1) + ret void +} + +define void @test_memmove_zero_length(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_zero_length + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8) + call void 
@llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16) + ret void +} + +; memmove with src==dest is removed +define void @test_memmove_removed(i8* %srcdest, i32 %sz) { + ; CHECK-LABEL: test_memmove_removed + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16) + ret void +} + +; memmove with a small constant length is converted to a load/store pair +define void @test_memmove_loadstore(i8* %dest, i8* %src) { + ; CHECK-LABEL: test_memmove_loadstore + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ; CHECK-NEXT: ret void + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1) + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1) + ret void +} + +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll index faae2016e207..aa95cc5a1316 100644 --- a/test/Transforms/InstCombine/icmp-logical.ll +++ b/test/Transforms/InstCombine/icmp-logical.ll @@ -1,159 +1,138 @@ ; RUN: opt -instcombine -S -o - %s | FileCheck %s define i1 @masked_and_notallzeroes(i32 %A) { -; CHECK-LABEL: @masked_and_notallzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallzeroes( +; CHECK-NEXT: 
[[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 0 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notallones(i32 %A) { -; CHECK-LABEL: @masked_and_notallones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp ne i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notallones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp ne i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, 39 - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allones(i32 %A) { -; CHECK-LABEL: @masked_or_allones -; CHECK: [[MASK:%.*]] = and i32 %A, 7 -; CHECK: icmp eq i32 [[MASK]], 7 -; CHECK-NOT: and i32 %A, 39 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allones( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 7 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 7 +; CHECK-NEXT: ret i1 [[TST1]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, 7 - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, 39 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_and_notA(i32 %A) { -; CHECK-LABEL: @masked_and_notA -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp ne i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_and_notA( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp ne i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp ne i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp ne i32 %mask2, %A - %res = and i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_A(i32 %A) { -; CHECK-LABEL: @masked_or_A -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], %A -; CHECK-NOT: and i32 %A, 7 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_A( +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], %A +; CHECK-NEXT: ret i1 [[TST2]] +; %mask1 = and i32 %A, 7 %tst1 = icmp eq i32 %mask1, %A - %mask2 = and i32 %A, 39 %tst2 = icmp eq i32 %mask2, %A - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @masked_or_allzeroes_notoptimised(i32 %A) { -; CHECK-LABEL: @masked_or_allzeroes_notoptimised -; CHECK: [[MASK:%.*]] = and i32 %A, 15 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: [[MASK:%.*]] = and i32 %A, 39 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK: ret i1 - +; CHECK-LABEL: @masked_or_allzeroes_notoptimised( +; CHECK-NEXT: [[MASK1:%.*]] = and i32 %A, 15 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASK1]], 0 +; CHECK-NEXT: [[MASK2:%.*]] = and i32 %A, 39 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASK2]], 0 +; CHECK-NEXT: [[RES:%.*]] = or i1 [[TST1]], [[TST2]] +; CHECK-NEXT: ret i1 [[RES]] +; %mask1 = and i32 %A, 15 %tst1 = icmp eq i32 %mask1, 0 - %mask2 = and i32 %A, 39 
%tst2 = icmp eq i32 %mask2, 0 - %res = or i1 %tst1, %tst2 ret i1 %res } define i1 @nomask_lhs(i32 %in) { -; CHECK-LABEL: @nomask_lhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_lhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST2:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST2]] +; %tst1 = icmp eq i32 %in, 0 - %masked = and i32 %in, 1 %tst2 = icmp eq i32 %masked, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } - define i1 @nomask_rhs(i32 %in) { -; CHECK-LABEL: @nomask_rhs -; CHECK: [[MASK:%.*]] = and i32 %in, 1 -; CHECK: icmp eq i32 [[MASK]], 0 -; CHECK-NOT: icmp -; CHECK: ret i1 +; CHECK-LABEL: @nomask_rhs( +; CHECK-NEXT: [[MASKED:%.*]] = and i32 %in, 1 +; CHECK-NEXT: [[TST1:%.*]] = icmp eq i32 [[MASKED]], 0 +; CHECK-NEXT: ret i1 [[TST1]] +; %masked = and i32 %in, 1 %tst1 = icmp eq i32 %masked, 0 - %tst2 = icmp eq i32 %in, 0 - %val = or i1 %tst1, %tst2 ret i1 %val } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_false(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_false -; CHECK: ret i1 false +; CHECK-LABEL: @fold_mask_cmps_to_false( +; CHECK-NEXT: ret i1 false +; %1 = and i32 %x, 2147483647 %2 = icmp eq i32 %1, 0 %3 = icmp eq i32 %x, 2147483647 @@ -161,12 +140,46 @@ define i1 @fold_mask_cmps_to_false(i32 %x) { ret i1 %4 } +; TODO: This test simplifies to a constant, so the functionality and test could be in InstSimplify. + define i1 @fold_mask_cmps_to_true(i32 %x) { -; CHECK-LABEL: @fold_mask_cmps_to_true -; CHECK: ret i1 true +; CHECK-LABEL: @fold_mask_cmps_to_true( +; CHECK-NEXT: ret i1 true +; %1 = and i32 %x, 2147483647 %2 = icmp ne i32 %1, 0 %3 = icmp ne i32 %x, 2147483647 %4 = or i1 %3, %2 ret i1 %4 } + +; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401 + +define i1 @cmpeq_bitwise(i8 %a, i8 %b, i8 %c, i8 %d) { +; CHECK-LABEL: @cmpeq_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 %c, %d +; CHECK-NEXT: [[CMP:%.*]] = and i1 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %xor1 = xor i8 %a, %b + %xor2 = xor i8 %c, %d + %or = or i8 %xor1, %xor2 + %cmp = icmp eq i8 %or, 0 + ret i1 %cmp +} + +define <2 x i1> @cmpne_bitwise(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { +; CHECK-LABEL: @cmpne_bitwise( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> %a, %b +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> %c, %d +; CHECK-NEXT: [[CMP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %xor1 = xor <2 x i64> %a, %b + %xor2 = xor <2 x i64> %c, %d + %or = or <2 x i64> %xor1, %xor2 + %cmp = icmp ne <2 x i64> %or, zeroinitializer + ret <2 x i1> %cmp +} + diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index 947971c6c83b..be64f51b6c4c 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instcombine < %s | FileCheck %s -define i32 @test1(i32 %x, i32 %y) nounwind { +; X | ~(X | Y) --> X | ~Y + +define i32 @test1(i32 %x, i32 %y) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -13,7 +15,10 @@ define i32 @test1(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test2(i32 %x, i32 %y) nounwind { +; Commute (rename) the inner 'or' operands: +; Y | 
~(X | Y) --> ~X | Y + +define i32 @test2(i32 %x, i32 %y) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -25,7 +30,9 @@ define i32 @test2(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test3(i32 %x, i32 %y) nounwind { +; X | ~(X ^ Y) --> X | ~Y + +define i32 @test3(i32 %x, i32 %y) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x @@ -37,7 +44,10 @@ define i32 @test3(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test4(i32 %x, i32 %y) nounwind { +; Commute (rename) the 'xor' operands: +; Y | ~(X ^ Y) --> ~X | Y + +define i32 @test4(i32 %x, i32 %y) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -49,7 +59,7 @@ define i32 @test4(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test5(i32 %x, i32 %y) nounwind { +define i32 @test5(i32 %x, i32 %y) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: ret i32 -1 ; @@ -59,7 +69,7 @@ define i32 @test5(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test6(i32 %x, i32 %y) nounwind { +define i32 @test6(i32 %x, i32 %y) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: ret i32 -1 ; @@ -69,7 +79,7 @@ define i32 @test6(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test7(i32 %x, i32 %y) nounwind { +define i32 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[Z:%.*]] = or i32 %x, %y ; CHECK-NEXT: ret i32 [[Z]] @@ -79,7 +89,7 @@ define i32 @test7(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test8(i32 %x, i32 %y) nounwind { +define i32 @test8(i32 %x, i32 %y) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[X_NOT:%.*]] = xor i32 %x, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[X_NOT]], %y @@ -91,7 +101,7 @@ define i32 @test8(i32 %x, i32 %y) nounwind { ret i32 %z } -define i32 @test9(i32 %x, i32 %y) nounwind { +define i32 @test9(i32 %x, i32 %y) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: [[Y_NOT:%.*]] = xor i32 %y, -1 ; CHECK-NEXT: [[Z:%.*]] = or i32 [[Y_NOT]], %x diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index 764fe4503b5e..fb56449ba4d4 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -397,14 +397,74 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) { ret <2 x i132> %or } -define i32 @test39(i32 %a, i32 %b) { -; CHECK-LABEL: @test39( -; CHECK-NEXT: [[OR:%.*]] = or i32 %b, %a +; (~A & B) | A --> A | B + +define i32 @test39a(i32 %a, float %b) { +; CHECK-LABEL: @test39a( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] ; CHECK-NEXT: ret i32 [[OR]] ; - %xor = xor i32 %a, -1 - %and = and i32 %xor, %b - %or = or i32 %and, %a + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %and, %a1 + ret i32 %or +} + +; Commute 'and' operands: +; (B & ~A) | A --> A | B + +define i32 @test39b(i32 %a, float %b) { +; CHECK-LABEL: @test39b( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %and, %a1 + ret i32 %or +} + +; 
Commute 'or' operands: +; A | (~A & B) --> A | B + +define i32 @test39c(i32 %a, float %b) { +; CHECK-LABEL: @test39c( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %nota, %b1 + %or = or i32 %a1, %and + ret i32 %or +} + +; Commute 'and' operands: +; A | (B & ~A) --> A | B + +define i32 @test39d(i32 %a, float %b) { +; CHECK-LABEL: @test39d( +; CHECK-NEXT: [[A1:%.*]] = mul i32 %a, 42 +; CHECK-NEXT: [[B1:%.*]] = bitcast float %b to i32 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[A1]], [[B1]] +; CHECK-NEXT: ret i32 [[OR]] +; + %a1 = mul i32 %a, 42 ; thwart complexity-based ordering + %b1 = bitcast float %b to i32 ; thwart complexity-based ordering + %nota = xor i32 %a1, -1 + %and = and i32 %b1, %nota + %or = or i32 %a1, %and ret i32 %or } @@ -456,60 +516,6 @@ define i32 @test40d(i32 %a, i32 %b) { ret i32 %or } -define i32 @test41(i32 %a, i32 %b) { -; CHECK-LABEL: @test41( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %and = and i32 %a, %b - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %or = or i32 %and, %xor - ret i32 %or -} - -; (~A ^ B) | (A & B) -> (~A ^ B) - -define i32 @test42(i32 %a, i32 %b) { -; CHECK-LABEL: @test42( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_and(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_and( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %nega, %b - %and = and i32 %b, %a - %or = or i32 %xor, %and - ret i32 %or -} - -define i32 @test42_commuted_xor(i32 %a, i32 %b) { -; CHECK-LABEL: @test42_commuted_xor( -; CHECK-NEXT: [[TMP1:%.*]] = xor i32 %a, -1 -; CHECK-NEXT: [[OR:%.*]] = xor i32 [[TMP1]], %b -; CHECK-NEXT: ret i32 [[OR]] -; - %nega = xor i32 %a, -1 - %xor = xor i32 %b, %nega - %and = and i32 %a, %b - %or = or i32 %xor, %and - ret i32 %or -} - define i32 @test45(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @test45( ; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, %z @@ -648,41 +654,146 @@ final: ret <2 x i32> %value } -define i8 @test51(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test51( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; In the next 4 tests, vary the types and predicates for extra coverage. 
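; Illustrative sketch (added commentary; @or_andn_sketch is an invented name
; and not part of the original test file): the fold exercised by the next
; group, reduced to plain i1 values with the 'not' written as an explicit xor.
define i1 @or_andn_sketch(i1 %x, i1 %y) {
  %notx = xor i1 %x, true   ; ~X
  %and = and i1 %y, %notx   ; Y & ~X
  %or = or i1 %x, %and      ; X | (Y & ~X); InstCombine folds this to X | Y
  ret i1 %or
}
; In the tests below the inversion is instead expressed as a pair of icmps
; with complementary predicates (e.g. sgt/sle over the same operands), so
; the fold must recognize %x_inv as the logical not of %x.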
+; (X | (Y & ~X)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_1(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @or_andn_cmp_1( +; CHECK-NEXT: [[X:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %a, -1 - %y = and i8 %w, %z - %x = or i8 %y, %a - ret i8 %x + %x = icmp sgt i32 %a, %b + %x_inv = icmp sle i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x_inv + %or = or i1 %x, %and + ret i1 %or } -define i8 @test52(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test52( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'or': +; ((Y & ~X) | X) -> (X | Y), where 'not' is an inverted cmp + +define <2 x i1> @or_andn_cmp_2(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_2( +; CHECK-NEXT: [[X:%.*]] = icmp sge <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <2 x i32> [[C:%.*]], <i32 42, i32 42> +; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <2 x i1> [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp sge <2 x i32> %a, %b + %x_inv = icmp slt <2 x i32> %a, %b + %y = icmp ugt <2 x i32> %c, <i32 42, i32 42> ; thwart complexity-based ordering + %and = and <2 x i1> %y, %x_inv + %or = or <2 x i1> %and, %x + ret <2 x i1> %or } -define i8 @test53(i8 %a, i8 %b, i8 %c) { -; CHECK-LABEL: @test53( -; CHECK-NEXT: [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]] -; CHECK-NEXT: [[X:%.*]] = or i8 [[W]], [[A:%.*]] -; CHECK-NEXT: ret i8 [[X]] +; Commute the 'and': +; (X | (~X & Y)) -> (X | Y), where 'not' is an inverted cmp + +define i1 @or_andn_cmp_3(i72 %a, i72 %b, i72 %c) { +; CHECK-LABEL: @or_andn_cmp_3( +; CHECK-NEXT: [[X:%.*]] = icmp ugt i72 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i72 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] ; - %w = mul i8 %b, %c - %z = xor i8 %w, -1 - %y = and i8 %z, %a - %x = or i8 %w, %y - ret i8 %x + %x = icmp ugt i72 %a, %b + %x_inv = icmp ule i72 %a, %b + %y = icmp ugt i72 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x_inv, %y + %or = or i1 %x, %and + ret i1 %or +} + +; Commute the 'or': +; ((~X & Y) | X) -> (X | Y), where 'not' is an inverted cmp + +define <3 x i1> @or_andn_cmp_4(<3 x i32> %a, <3 x i32> %b, <3 x i32> %c) { +; CHECK-LABEL: @or_andn_cmp_4( +; CHECK-NEXT: [[X:%.*]] = icmp eq <3 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <3 x i32> [[C:%.*]], <i32 42, i32 42, i32 42> +; CHECK-NEXT: [[OR:%.*]] = or <3 x i1> [[Y]], [[X]] +; CHECK-NEXT: ret <3 x i1> [[OR]] +; + %x = icmp eq <3 x i32> %a, %b + %x_inv = icmp ne <3 x i32> %a, %b + %y = icmp ugt <3 x i32> %c, <i32 42, i32 42, i32 42> ; thwart complexity-based ordering + %and = and <3 x i1> %x_inv, %y + %or = or <3 x i1> %and, %x + ret <3 x i1> %or +} + +; In the next 4 tests, vary the types and predicates for extra coverage. 
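; Analogous illustrative sketch (added commentary; @orn_and_sketch is an
; invented name, not part of the original test file) for the next group,
; where the inverted value itself survives the fold: (~X | (Y & X)) --> (~X | Y).
define i1 @orn_and_sketch(i1 %x, i1 %y) {
  %notx = xor i1 %x, true   ; ~X
  %and = and i1 %y, %x      ; Y & X
  %or = or i1 %notx, %and   ; ~X | (Y & X); InstCombine folds this to ~X | Y
  ret i1 %or
}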
+; (~X | (Y & X)) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_1(i37 %a, i37 %b, i37 %c) { +; CHECK-LABEL: @orn_and_cmp_1( +; CHECK-NEXT: [[X_INV:%.*]] = icmp sle i37 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i37 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[X_INV]], [[Y]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sgt i37 %a, %b + %x_inv = icmp sle i37 %a, %b + %y = icmp ugt i37 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %x_inv, %and + ret i1 %or +} + +; Commute the 'or': +; ((Y & X) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_2(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: @orn_and_cmp_2( +; CHECK-NEXT: [[X_INV:%.*]] = icmp slt i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i16 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp sge i16 %a, %b + %x_inv = icmp slt i16 %a, %b + %y = icmp ugt i16 %c, 42 ; thwart complexity-based ordering + %and = and i1 %y, %x + %or = or i1 %and, %x_inv + ret i1 %or +} + +; Commute the 'and': +; (~X | (X & Y)) -> (~X | Y), where 'not' is an inverted cmp + +define <4 x i1> @orn_and_cmp_3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; CHECK-LABEL: @orn_and_cmp_3( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ule <4 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt <4 x i32> [[C:%.*]], <i32 42, i32 42, i32 42, i32 42> +; CHECK-NEXT: [[OR:%.*]] = or <4 x i1> [[X_INV]], [[Y]] +; CHECK-NEXT: ret <4 x i1> [[OR]] +; + %x = icmp ugt <4 x i32> %a, %b + %x_inv = icmp ule <4 x i32> %a, %b + %y = icmp ugt <4 x i32> %c, <i32 42, i32 42, i32 42, i32 42> ; thwart complexity-based ordering + %and = and <4 x i1> %x, %y + %or = or <4 x i1> %x_inv, %and + ret <4 x i1> %or +} + +; Commute the 'or': +; ((X & Y) | ~X) -> (~X | Y), where 'not' is an inverted cmp + +define i1 @orn_and_cmp_4(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @orn_and_cmp_4( +; CHECK-NEXT: [[X_INV:%.*]] = icmp ne i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Y:%.*]] = icmp ugt i32 [[C:%.*]], 42 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[Y]], [[X_INV]] +; CHECK-NEXT: ret i1 [[OR]] +; + %x = icmp eq i32 %a, %b + %x_inv = icmp ne i32 %a, %b + %y = icmp ugt i32 %c, 42 ; thwart complexity-based ordering + %and = and i1 %x, %y + %or = or i1 %and, %x_inv + ret i1 %or } diff --git a/test/Transforms/InstCombine/pr33765.ll b/test/Transforms/InstCombine/pr33765.ll new file mode 100644 index 000000000000..99ed0d13b5cf --- /dev/null +++ b/test/Transforms/InstCombine/pr33765.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S %s -instcombine | FileCheck %s + +@glob = external global i16 + +define void @patatino(i8 %beth) { +; CHECK-LABEL: @patatino( +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[BETH:%.*]] to i32 +; CHECK-NEXT: br i1 undef, label [[IF_THEN9:%.*]], label [[IF_THEN9]] +; CHECK: if.then9: +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[TINKY:%.*]] = load i16, i16* @glob, align 2 +; CHECK-NEXT: [[CONV131:%.*]] = zext i16 [[TINKY]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[MUL]], [[CONV131]] +; CHECK-NEXT: [[CONV14:%.*]] = trunc i32 [[AND]] to i16 +; CHECK-NEXT: store i16 [[CONV14]], i16* @glob, align 2 +; CHECK-NEXT: ret void +; + %conv = zext i8 %beth to i32 + %mul = mul nuw nsw i32 %conv, %conv + %conv3 = and i32 %mul, 255 + %tobool8 = icmp ne i32 %mul, %conv3 + br i1 %tobool8, label %if.then9, label %if.then9 + +if.then9: + %tinky = load i16, i16* @glob + %conv13 = sext i16 %tinky to i32 + %and = and i32 %mul, 
%conv13 + %conv14 = trunc i32 %and to i16 + store i16 %conv14, i16* @glob + ret void +} diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll index 6a3cf7edd7dc..5e84ec54971a 100644 --- a/test/Transforms/JumpThreading/select.ll +++ b/test/Transforms/JumpThreading/select.ll @@ -280,10 +280,85 @@ cond.false.15.i: ; preds = %cond.false.10.i ret i32 %j.add3 ; CHECK-LABEL: @unfold3 -; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i +; CHECK: br i1 %cmp.i, label %.exit.thread2, label %cond.false.i ; CHECK: br i1 %cmp4.i, label %.exit.thread, label %cond.false.6.i ; CHECK: br i1 %cmp8.i, label %.exit.thread2, label %cond.false.10.i ; CHECK: br i1 %cmp13.i, label %.exit.thread, label %.exit ; CHECK: br i1 %phitmp, label %.exit.thread, label %.exit.thread2 ; CHECK: br label %.exit.thread2 } + +define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 1, %entry ], [ 0, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 0, %cond.false.10.i ] + %lnot.i18 = icmp eq i32 %cond23.i, 1 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %add3 + ret i32 %j.add3 + +; CHECK-LABEL: @unfold4 +; CHECK: br i1 %cmp.i, label %.exit.thread, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit.thread3, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label %.exit.thread, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit.thread3, label %.exit +; CHECK: br i1 %lnot.i18, label %.exit.thread, label %.exit.thread3 +; CHECK: br label %.exit.thread3 +} + +define i32 @unfold5(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) nounwind { +entry: + %add3 = add nsw i32 %j, 2 + %cmp.i = icmp slt i32 %u, %v + br i1 %cmp.i, label %.exit, label %cond.false.i + +cond.false.i: ; preds = %entry + %cmp4.i = icmp sgt i32 %u, %v + br i1 %cmp4.i, label %.exit, label %cond.false.6.i + +cond.false.6.i: ; preds = %cond.false.i + %cmp8.i = icmp slt i32 %w, %x + br i1 %cmp8.i, label %.exit, label %cond.false.10.i + +cond.false.10.i: ; preds = %cond.false.6.i + %cmp13.i = icmp sgt i32 %w, %x + br i1 %cmp13.i, label %.exit, label %cond.false.15.i + +cond.false.15.i: ; preds = %cond.false.10.i + %cmp19.i = icmp sge i32 %y, %z + %conv = zext i1 %cmp19.i to i32 + br label %.exit + +.exit: ; preds = %entry, %cond.false.i, %cond.false.6.i, %cond.false.10.i, %cond.false.15.i + %cond23.i = phi i32 [ 2, %entry ], [ 3, %cond.false.i ], [ 1, %cond.false.6.i ], [ %conv, %cond.false.15.i ], [ 7, %cond.false.10.i ] + %lnot.i18 = icmp sgt i32 %cond23.i, 5 + %j.add3 = select i1 %lnot.i18, i32 %j, i32 %cond23.i + ret i32 %j.add3 + +; CHECK-LABEL: @unfold5 +; CHECK: br i1 %cmp.i, label %.exit, label %cond.false.i +; CHECK: br i1 %cmp4.i, label %.exit, label %cond.false.6.i +; CHECK: br i1 %cmp8.i, label 
%.exit, label %cond.false.10.i +; CHECK: br i1 %cmp13.i, label %.exit, label %cond.false.15.i +; CHECK: br label %.exit +} diff --git a/test/Transforms/LoopInterchange/current-limitations-lcssa.ll b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll new file mode 100644 index 000000000000..df6c6cfdbcb5 --- /dev/null +++ b/test/Transforms/LoopInterchange/current-limitations-lcssa.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -basicaa -loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@C = common global [100 x [100 x i32]] zeroinitializer + +;; FIXME: +;; Test for interchange when we have an lcssa phi. This should ideally be interchanged but it is currently not supported. +;; for(gi=1;gi=0;j--) +;; A[j][i] = A[j][i]+k; + +define void @interchange_02(i32 %k) { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 + %0 = load i32, i32* %arrayidx5 + %add = add nsw i32 %0, %k + store i32 %add, i32* %arrayidx5 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp2 = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond, label %for.end11, label %for.cond1.preheader + +for.end11: + ret void +} + +; CHECK-LABEL: @interchange_02 +; CHECK: entry: +; CHECK: br label %for.body3.preheader +; CHECK: for.cond1.preheader.preheader: +; CHECK: br label %for.cond1.preheader +; CHECK: for.cond1.preheader: +; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] +; CHECK: br label %for.body3.split1 +; CHECK: for.body3.preheader: +; CHECK: br label %for.body3 +; CHECK: for.body3: +; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] +; CHECK: br label %for.cond1.preheader.preheader +; CHECK: for.body3.split1: ; preds = %for.cond1.preheader +; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 +; CHECK: %0 = load i32, i32* %arrayidx5 +; CHECK: %add = add nsw i32 %0, %k +; CHECK: store i32 %add, i32* %arrayidx5 +; CHECK: br label %for.inc10 +; CHECK: for.body3.split: +; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 +; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 +; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 +; CHECK: for.inc10: +; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 +; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 +; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader +; CHECK: for.end11: +; CHECK: ret void diff --git a/test/Transforms/LoopInterchange/interchange-simple-count-up.ll b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll new file mode 100644 index 000000000000..4febe0269810 --- /dev/null +++ b/test/Transforms/LoopInterchange/interchange-simple-count-up.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -basicaa 
-loop-interchange -S | FileCheck %s +;; We test the complete .ll for adjustment in outer loop header/latch and inner loop header/latch. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = common global [100 x [100 x i32]] zeroinitializer +@B = common global [100 x i32] zeroinitializer + +;; for(int i=0;i=0;j--) -;; A[j][i] = A[j][i]+k; - -define void @interchange_02(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 100, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nsw i64 %indvars.iv, -1 - %cmp2 = icmp sgt i64 %indvars.iv, 0 - br i1 %cmp2, label %for.body3, label %for.inc10 - -for.inc10: - %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 - %exitcond = icmp eq i64 %indvars.iv.next20, 100 - br i1 %exitcond, label %for.end11, label %for.cond1.preheader - -for.end11: - ret void -} - -; CHECK-LABEL: @interchange_02 -; CHECK: entry: -; CHECK: br label %for.body3.preheader -; CHECK: for.cond1.preheader.preheader: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: -; CHECK: %indvars.iv19 = phi i64 [ %indvars.iv.next20, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.split1 -; CHECK: for.body3.preheader: -; CHECK: br label %for.body3 -; CHECK: for.body3: -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3.split ], [ 100, %for.body3.preheader ] -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.body3.split1: ; preds = %for.cond1.preheader -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv19 -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: br label %for.inc10 -; CHECK: for.body3.split: -; CHECK: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK: %cmp2 = icmp sgt i64 %indvars.iv, 0 -; CHECK: br i1 %cmp2, label %for.body3, label %for.end11 -; CHECK: for.inc10: -; CHECK: %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next20, 100 -; CHECK: br i1 %exitcond, label %for.body3.split, label %for.cond1.preheader -; CHECK: for.end11: -; CHECK: ret void - -;;--------------------------------------Test case 03------------------------------------- -;; Loops should not be interchanged in this case as it is not profitable. 
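;; (Added illustration, not part of the original test.) The profitability
;; question is cache behaviour of the row-major array:
;;   for(i) for(j) A[i][j] += k;   // inner loop stride: 1 element
;;   for(j) for(i) A[i][j] += k;   // interchanged inner stride: 100 elements
;; The nest below already keeps the unit-stride subscript in the inner loop,
;; so interchanging it could only hurt locality and the pass declines.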
-;; for(int i=0;i<100;i++) -;; for(int j=0;j<100;j++) -;; A[i][j] = A[i][j]+k; - -define void @interchange_03(i32 %k) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc10 ] - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add = add nsw i32 %0, %k - store i32 %add, i32* %arrayidx5 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: - %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 - %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 - br i1 %exitcond23, label %for.end12, label %for.cond1.preheader - -for.end12: - ret void -} - -; CHECK-LABEL: @interchange_03 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader.preheader -; CHECK: for.cond1.preheader.preheader: ; preds = %entry -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 -; CHECK: %indvars.iv21 = phi i64 [ %indvars.iv.next22, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] -; CHECK: br label %for.body3.preheader -; CHECK: for.body3.preheader: ; preds = %for.cond1.preheader -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3.preheader, %for.body3 -; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv21, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add = add nsw i32 %0, %k -; CHECK: store i32 %add, i32* %arrayidx5 -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 100 -; CHECK: br i1 %exitcond, label %for.inc10, label %for.body3 -; CHECK: for.inc10: ; preds = %for.body3 -; CHECK: %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 -; CHECK: %exitcond23 = icmp eq i64 %indvars.iv.next22, 100 -; CHECK: br i1 %exitcond23, label %for.end12, label %for.cond1.preheader -; CHECK: for.end12: ; preds = %for.inc10 -; CHECK: ret void - - -;;--------------------------------------Test case 04------------------------------------- -;; Loops should not be interchanged in this case as it is not legal due to dependency. 
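;; (Added illustration, not part of the original test.) In the nest below,
;; iteration (j,i) writes A[j][i+1] and reads A[j+1][i], so element A[a][b]
;; is read at iteration (a-1,b) and written at iteration (a,b-1): a
;; dependence with distance vector (1,-1). Interchange would reorder that
;; to (-1,1), whose leading component is negative, so the dependence would
;; be violated and the transformation is illegal.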
-;; for(int j=0;j<99;j++) -;; for(int i=0;i<99;i++) -;; A[j][i+1] = A[j+1][i]+k; - -define void @interchange_04(i32 %k){ -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] - %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 - br label %for.body3 - -for.body3: - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] - %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv - %0 = load i32, i32* %arrayidx5 - %add6 = add nsw i32 %0, %k - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next - store i32 %add6, i32* %arrayidx11 - %exitcond = icmp eq i64 %indvars.iv.next, 99 - br i1 %exitcond, label %for.inc12, label %for.body3 - -for.inc12: - %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 - br i1 %exitcond25, label %for.end14, label %for.cond1.preheader - -for.end14: - ret void -} - -; CHECK-LABEL: @interchange_04 -; CHECK: entry: -; CHECK: br label %for.cond1.preheader -; CHECK: for.cond1.preheader: ; preds = %for.inc12, %entry -; CHECK: %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for.inc12 ] -; CHECK: %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1 -; CHECK: br label %for.body3 -; CHECK: for.body3: ; preds = %for.body3, %for.cond1.preheader -; CHECK: %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] -; CHECK: %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next24, i64 %indvars.iv -; CHECK: %0 = load i32, i32* %arrayidx5 -; CHECK: %add6 = add nsw i32 %0, %k -; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 -; CHECK: %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv23, i64 %indvars.iv.next -; CHECK: store i32 %add6, i32* %arrayidx11 -; CHECK: %exitcond = icmp eq i64 %indvars.iv.next, 99 -; CHECK: br i1 %exitcond, label %for.inc12, label %for.body3 -; CHECK: for.inc12: ; preds = %for.body3 -; CHECK: %exitcond25 = icmp eq i64 %indvars.iv.next24, 99 -; CHECK: br i1 %exitcond25, label %for.end14, label %for.cond1.preheader -; CHECK: for.end14: ; preds = %for.inc12 -; CHECK: ret void - - - -;;--------------------------------------Test case 05------------------------------------- -;; Loops not tightly nested are not interchanged -;; for(int j=0;j [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -79,7 +79,7 @@ for.exit: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ] +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body 
] ; @@ -144,7 +144,7 @@ scalar.body: ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 ; CHECK: scalar.ph: -; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ] +; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ] ; CHECK: scalar.body: ; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ] ; @@ -288,7 +288,7 @@ for.cond.cleanup3: ; UNROLL-NO-IC-LABEL: @PR30183( ; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 +; UNROLL-NO-IC: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3 ; UNROLL-NO-IC-NEXT: br label %vector.body ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll index 8eec6e262c1a..a7cc4530ceb3 100644 --- a/test/Transforms/LoopVectorize/float-induction.ll +++ b/test/Transforms/LoopVectorize/float-induction.ll @@ -15,7 +15,7 @@ ; VEC4_INTERL1-LABEL: @fp_iv_loop1( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer @@ -37,7 +37,7 @@ ; VEC4_INTERL2-LABEL: @fp_iv_loop1( ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL2: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer @@ -63,7 +63,7 @@ ; VEC1_INTERL2-LABEL: @fp_iv_loop1( ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: br label %vector.body +; VEC1_INTERL2: br label %vector.body ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 @@ -115,7 +115,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC4_INTERL1-LABEL: @fp_iv_loop2( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], ; VEC4_INTERL1-NEXT: br label %vector.body @@ -172,7 +172,7 @@ for.end: ; preds = 
%for.end.loopexit, % ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4 ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 +; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer @@ -250,7 +250,7 @@ for.end: ; VEC4_INTERL1-LABEL: @fp_iv_loop4( ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: br label %vector.body +; VEC4_INTERL1: br label %vector.body ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -289,7 +289,7 @@ for.end: ; preds = %for.end.loopexit, % ; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar( ; VEC2_INTERL1_PRED_STORE: vector.body: -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>* diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll index 7f381ae6ad7b..0d6e4b1e61b4 100644 --- a/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -13,24 +13,21 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]] -; CHECK: min.iters.checked: -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0 -; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* 
[[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -55,10 +52,10 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll index 33e8ed067160..b37537efcc51 100644 --- a/test/Transforms/LoopVectorize/induction-step.ll +++ b/test/Transforms/LoopVectorize/induction-step.ll @@ -15,7 +15,7 @@ ; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4 ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer @@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-LABEL: @induction_with_loop_inv( ; CHECK: vector.ph: -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 +; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0 ; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 7e9e6b1cdc8e..d77806da59be 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -501,13 +501,13 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; condition and branch directly to the scalar loop. 
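; (Added note summarizing the check updates in the vectorizer tests that
; follow.) The recurring edit is one structural change: the old
; min.iters.checked block, which sat between min.iters.check and vector.ph
; and computed n.mod.vf / n.vec, is gone. The guard now branches directly
; from min.iters.check to scalar.ph or to the next block (vector.memcheck
; when present, otherwise vector.ph), the n.mod.vf / n.vec computation
; lives in vector.ph, and the scalar.ph phis drop their
; %min.iters.checked incoming values.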
; CHECK-LABEL: max_i32_backedgetaken -; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 true, label %scalar.ph, label %vector.ph ; CHECK: middle.block: ; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] -; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ] +; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ] define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 1e8b982363d8..89c0ac109167 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; ; CHECK-LABEL: @interleaved_with_cond_store_0( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -58,7 +58,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_1( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf @@ -117,7 +117,7 @@ for.end: ; ; CHECK-LABEL: @interleaved_with_cond_store_2( ; -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll index d84dc42bdf54..530c2f66552a 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -338,7 +338,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @even_load_dynamic_tc( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -579,7 +579,7 @@ for.body: ; preds = %for.body, %entry ; } ; CHECK-LABEL: @PR27626_0( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -627,7 +627,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_1( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -680,7 +680,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_2( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf @@ -728,7 +728,7 @@ for.end: ; } ; CHECK-LABEL: @PR27626_3( -; CHECK: min.iters.checked: +; CHECK: vector.ph: ; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3 ; CHECK: %[[IsZero:[a-zA-Z0-9]+]] 
= icmp eq i64 %n.mod.vf, 0 ; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf diff --git a/test/Transforms/LoopVectorize/iv_outside_user.ll b/test/Transforms/LoopVectorize/iv_outside_user.ll index 8a44af96e7f4..265188886996 100644 --- a/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -135,7 +135,7 @@ for.end: } ; CHECK-LABEL: @PR30742 -; CHECK: min.iters.checked +; CHECK: vector.ph ; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2 ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block diff --git a/test/Transforms/LoopVectorize/miniters.ll b/test/Transforms/LoopVectorize/miniters.ll index 1cb67f9150ac..f5f4eb5eaa01 100644 --- a/test/Transforms/LoopVectorize/miniters.ll +++ b/test/Transforms/LoopVectorize/miniters.ll @@ -10,10 +10,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. ; CHECK-LABEL: foo( ; CHECK: %min.iters.check = icmp ult i64 %N, 4 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph ; UNROLL-LABEL: foo( ; UNROLL: %min.iters.check = icmp ult i64 %N, 8 -; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph define void @foo(i64 %N) { entry: diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll new file mode 100644 index 000000000000..40af8f3adf02 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -0,0 +1,240 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the vectorizer identifies the %p.09 phi, +; as an induction variable, despite the potential overflow +; due to the truncation from 32bit to 8bit. +; SCEV will detect the pattern "sext(trunc(%p.09)) + %step" +; and generate the required runtime checks under which +; we can assume no overflow. We check here that we generate +; exactly two runtime checks: +; 1) an overflow check: +; {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) an equality check verifying that the step of the induction +; is equal to sext(trunc(step)): +; Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32) +; +; See also pr30654. 
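; (Added numeric illustration; the concrete values are invented, not from
; the test.) With %step == 300: trunc i32 300 to i8 is 44 and
; sext(i8 44) == 44 != 300, so the equality predicate fails and execution
; falls back to the scalar loop. With %step == 3 and n == 100 the equality
; holds, but the i8 induction would reach 3 * 99 == 297, which wraps in
; 8 bits; that is the case the umul.with.overflow-based check rejects.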
+; +; int a[N]; +; void doit1(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit1 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @doit1(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.09, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Same as above, but for checking the SCEV "zext(trunc(%p.09)) + %step". 
+; Here we expect the following two predicates to be added for runtime checking: +; 1) {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: +; 2) Equal predicate: %step == (zext i8 (trunc i32 %step to i8) to i32) +; +; int a[N]; +; void doit2(int n, int step) { +; int i; +; unsigned char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; + +; CHECK-LABEL: @doit2 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow +; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]] +; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit2(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %conv = and i32 %p.09, 255 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Here we check that the same phi scev analysis would fail +; to create the runtime checks because the step is not invariant. +; As a result vectorization will fail. 
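; (Added note; the SCEV rendering below is assumed, not printed by the
; test.) Because %step itself advances by 2 on every iteration, the IV
; %p.012 is the quadratic recurrence {0,+,{%step,+,2}}, and the sext/trunc
; overflow predicates can only be built for affine add recurrences with a
; loop-invariant step, hence no vector.scevcheck and no vector.body below.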
+; +; int a[N]; +; void doit3(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; step += 2; +; } +; } +; + +; CHECK-LABEL: @doit3 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: + +; Function Attrs: norecurse nounwind uwtable +define void @doit3(i32 %n, i32 %step) local_unnamed_addr { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.012 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ] + %sext = shl i32 %p.012, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step.addr.010 + %add3 = add nsw i32 %step.addr.010, 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Lastly, we also check the case where we can tell at compile time that +; the step of the induction is equal to sext(trunc(step)), in which case +; we don't have to check this equality at runtime (we only need the +; runtime overflow check). Therefore only the following overflow predicate +; will be added for runtime checking: +; {0,+,%cstep}<%for.body> Added Flags: +; +; int a[N]; +; void doit4(int n, char cstep) { +; int i; +; char p = 0; +; int istep = cstep; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + istep; +; } +; } + +; CHECK-LABEL: @doit4 +; CHECK: vector.scevcheck +; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: %{{.*}} = or i1 {{.*}}, %mul.overflow +; CHECK-NOT: %ident.check = icmp ne i32 {{.*}}, %{{.*}} +; CHECK-NOT: %{{.*}} = or i1 %{{.*}}, %ident.check +; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}}) +; CHECK: vector.body: +; CHECK: <4 x i32> + +; Function Attrs: norecurse nounwind uwtable +define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr { +entry: + %conv = sext i8 %cstep to i32 + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %sext = shl i32 %p.011, 24 + %conv2 = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv2, i32* %arrayidx, align 4 + %add = add nsw i32 %conv2, %conv + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll index ac1145aab67b..b37d94c0c328 --- 
a/test/Transforms/LoopVectorize/runtime-check-readonly.ll +++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll @@ -4,7 +4,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: @add_ints( ;CHECK: br -;CHECK: br ;CHECK: getelementptr ;CHECK-DAG: getelementptr ;CHECK-DAG: icmp ugt diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 958b3c135c97..fb0548612715 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK-LABEL: define i32 @foo ;CHECK: for.body.preheader: -;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] +;CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]] ;CHECK: vector.memcheck: ;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]] ;CHECK: load <4 x float> diff --git a/test/tools/llvm-cov/showTabsHTML.cpp b/test/tools/llvm-cov/showTabsHTML.cpp index 953c06a3c60d..c092841aeb22 100644 --- a/test/tools/llvm-cov/showTabsHTML.cpp +++ b/test/tools/llvm-cov/showTabsHTML.cpp @@ -1,5 +1,5 @@ // RUN: llvm-profdata merge -o %t.profdata %S/Inputs/showTabsHTML.proftext -// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck -check-prefix=CHECK %s +// RUN: llvm-cov show %S/Inputs/showTabsHTML.covmapping -format html -instr-profile %t.profdata -filename-equivalence %s | FileCheck %s int main(int argc, char ** argv) { (void) "This tab starts at column 0"; // CHECK:   (void) "This tab starts at column 0"; @@ -13,4 +13,4 @@ int main(int argc, char ** argv) { // CHECK-TABSIZE:    (void) "This tab starts at column 0"; // CHECK-TABSIZE: (void) "  This tab starts at column 10"; -// CHECK-TABSIZE: (void) "This     tab starts at column 15"; \ No newline at end of file +// CHECK-TABSIZE: (void) "This     tab starts at column 15"; diff --git a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s new file mode 100644 index 000000000000..90733eda278f --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s @@ -0,0 +1,193 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: error: DIE has invalid DW_AT_stmt_list encoding:{{[[:space:]]}} +# CHECK-NEXT: 0x0000000c: DW_TAG_compile_unit [1] * +# CHECK-NEXT: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000000] = "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)") +# CHECK-NEXT: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +# CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "basic.c") +# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_strx4] ( indexed (00000000) string = ) +# CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000003f] = "/Users/sgravani/Development/tests") +# CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +# CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4] (0x00000016){{[[:space:]]}} +# CHECK-NEXT: Units[2] - start offset: 0x00000068 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. +# CHECK-NEXT: Error: The unit type encoding is not valid. 
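+
+# A quick sanity check on the offsets above (explanatory note only; these
+# lines carry no CHECK prefix, so FileCheck ignores them): a DWARF v5
+# compile-unit header occupies 12 bytes -- 4 (unit_length) + 2 (version) +
+# 1 (unit_type) + 1 (address_size) + 4 (debug_abbrev_offset) -- which is why
+# the first DIE of Units[0] sits at 0x0000000c. Units[0] declares a length of
+# 87, so Units[1] starts at 4 + 87 = 0x5b; its length of 9 puts Units[2] at
+# 0x5b + 4 + 9 = 0x68. Units[2] then declares a length of 26 while fewer
+# bytes than that remain in .debug_info, hence the "length ... too large"
+# error, and its unit type byte of 0 is not a valid DW_UT_* encoding.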
+ + + .section __TEXT,__text,regular,pure_instructions + .globl _main ## -- Begin function main + .p2align 4, 0x90 +_main: ## @main +Lfunc_begin0: + .file 1 "basic.c" + .loc 1 1 0 ## basic.c:1:0 + .cfi_startproc +## BB#0: ## %entry + pushq %rbp +Lcfi0: + .cfi_def_cfa_offset 16 +Lcfi1: + .cfi_offset %rbp, -16 + movq %rsp, %rbp +Lcfi2: + .cfi_def_cfa_register %rbp + xorl %eax, %eax + movl $0, -4(%rbp) +Ltmp0: + .loc 1 2 7 prologue_end ## basic.c:2:7 + movl $1, -8(%rbp) + .loc 1 3 3 ## basic.c:3:3 + popq %rbp + retq +Ltmp1: +Lfunc_end0: + .cfi_endproc + ## -- End function + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "main" ## string offset=97 + .asciz "int" ## string offset=102 + .asciz "a" ## string offset=106 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 14 ## DW_FORM_strp + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 16 ## DW_AT_stmt_list + .byte 40 ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding: + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 46 ## DW_TAG_subprogram + .byte 1 ## DW_CHILDREN_yes + .byte 17 ## DW_AT_low_pc + .byte 1 ## DW_FORM_addr + .byte 18 ## DW_AT_high_pc + .byte 6 ## DW_FORM_data4 + .byte 64 ## DW_AT_frame_base + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 39 ## DW_AT_prototyped + .byte 25 ## DW_FORM_flag_present + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 63 ## DW_AT_external + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 87 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 8 ## Address Size (in bytes) +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset0 + .byte 1 ## Abbrev [1] 0xc:0x4f DW_TAG_compile_unit + .long 0 ## DW_AT_producer + .short 12 ## DW_AT_language + .long 55 ## DW_AT_name +Lset1 = Lline_table_start0-Lsection_line ## DW_AT_stmt_list + .long Lset1 + .long 63 ## DW_AT_comp_dir + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset2 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset2 + .byte 2 ## Abbrev [2] 0x2b:0x28 DW_TAG_subprogram + .quad Lfunc_begin0 ## DW_AT_low_pc +Lset3 = Lfunc_end0-Lfunc_begin0 ## DW_AT_high_pc + .long Lset3 + .byte 1 ## DW_AT_frame_base + .byte 86 + .long 97 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + ## DW_AT_prototyped + .long 83 ## DW_AT_type + ## DW_AT_external + .byte 3 ## Abbrev [3] 0x44:0xe DW_TAG_variable + .byte 2 ## DW_AT_location + .byte 145 + .byte 120 + .long 106 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 2 ## DW_AT_decl_line + .long 83 ## DW_AT_type + .byte 0 ## End Of Children Mark + .byte 4 ## Abbrev [4] 0x53:0x7 DW_TAG_base_type + .long 102 ## DW_AT_name + .byte 5 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 0 ## End Of Children Mark +Lcu_begin1: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin0: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. + .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s new file mode 100644 index 000000000000..a3a54077bbf9 --- /dev/null +++ b/test/tools/llvm-dwarfdump/X86/verify_unit_header_chain.s @@ -0,0 +1,81 @@ +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | not llvm-dwarfdump -verify - \ +# RUN: | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: Units[1] - start offset: 0x0000000d +# CHECK-NEXT: Error: The unit type encoding is not valid. +# CHECK-NEXT: Error: The address size is unsupported. +# CHECK-NEXT: Units[2] - start offset: 0x00000026 +# CHECK-NEXT: Error: The 16 bit unit header version is not valid. +# CHECK-NEXT: Error: The offset into the .debug_abbrev section is not valid. +# CHECK-NEXT: Units[4] - start offset: 0x00000041 +# CHECK-NEXT: Error: The length for this unit is too large for the .debug_info provided. + + .section __TEXT,__text,regular,pure_instructions + .file 1 "basic.c" + .comm _i,4,2 ## @i + .comm _j,4,2 ## @j + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .asciz "clang version 5.0.0 (trunk 307232) (llvm/trunk 307042)" ## string offset=0 + .asciz "basic.c" ## string offset=55 + .asciz "/Users/sgravani/Development/tests" ## string offset=63 + .asciz "i" ## string offset=97 + .asciz "int" ## string offset=99 + .asciz "j" ## string offset=103 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: + .long 9 ## Length of Unit + .short 4 ## DWARF version number +Lset0 = Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. 
Section + .long Lset0 + .byte 4 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Ltu_begin0: + .long 21 ## Length of Unit + .short 5 ## DWARF version number + .byte 0 ## DWARF Unit Type -- Error: The unit type encoding is not valid. + .byte 3 ## Address Size (in bytes) -- Error: The address size is unsupported. + .long 0 + .quad 0 + .long 0 + .byte 0 +Lcu_begin1: + .long 10 ## Length of Unit + .short 6 ## DWARF version number -- Error: The 16 bit unit header version is not valid. + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) -- The offset into the .debug_abbrev section is not valid. + .long Lline_table_start0 + .byte 1 ## Abbrev [1] 0xc:0x45 DW_TAG_compile_unit + .byte 0 ## End Of Children Mark +Lcu_begin2: + .long 9 ## Length of Unit + .short 5 ## DWARF version number + .byte 1 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 ## Abbrev offset + .byte 0 +Ltu_begin1: + .long 26 ## Length of Unit -- Error: The length for this unit is too large for the .debug_info provided. + .short 5 ## DWARF version number + .byte 2 ## DWARF Unit Type + .byte 4 ## Address Size (in bytes) + .long 0 + .quad 0 + .long 0 + .byte 0 + +.subsections_via_symbols + .section __DWARF,__debug_line,regular,debug +Lsection_line: +Lline_table_start0: diff --git a/test/tools/llvm-mt/help.test b/test/tools/llvm-mt/help.test new file mode 100644 index 000000000000..29e3667ec2ca --- /dev/null +++ b/test/tools/llvm-mt/help.test @@ -0,0 +1,7 @@ +RUN: llvm-mt /h | FileCheck %s -check-prefix=HELP + +RUN: llvm-mt /inputresource:foo.res /manifest foo.manifest | FileCheck %s -check-prefix=NOT_SUPPORTED + +HELP: OVERVIEW: Manifest Tool + +NOT_SUPPORTED: llvm-mt: ignoring unsupported 'inputresource:' option diff --git a/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/reloc-addend.obj.macho-aarch64 new file mode 100644 index 0000000000000000000000000000000000000000..58ed3c7a48fc5e67a957095a1257488c1829ff81 GIT binary patch literal 424 zcmX^A>+L@t1_nk3AOI08Kr8^}fkYS>B!PGan1!7H>IDM>s2&)f9ViOIFr(t*OHwOJ zAZkHme0+#&L7%Q6rM?J$4e4>9sjI1j*A|!EY zM{N&nN`wG~3McQv3Fr+ZI9RzD>VsRFDm@X2O1P*8KvZ&Wj<{Gw5Z=ztl&XEw&YSPO zZ{F;id2ec&YNi`|OwcIF=G>p-=`<~+Ry)yhqGVncJqG7n=3%1In zx;@dNQMv}%Rz(f-#O<(zgmBVDI!FaJTWj*%{6hZBO8nds$rjF|f@OyWMaL-p74A4R z<4?XyBrbB8=6|`%|3lbiP@Vr?{!Y?~`C&EhtUr(I&GlREAI0vrJ{-P@l*55rIC`}V z&MbV^a_;|jh%)elg{=eJ-(&a<#B;^Nfs0I=SI57>iFws&saZ4DG(UM@t~WiaL36`N zkHBZHe)l|M44;DQE}c@i?>drD(L@S^=KAurD$qp7_ z{^9s%8{*m9B5sD)52^4&Rk^zz#BD*9dlEKnc=rSwNcCrQ~L>2fQ*L9K|)C#@Jke4*T*QHR*y(`%2BbHY13@qd) zdyqp^vk-jIwMF;9@GGilZGLPdf|C}~j%%}ih01WjpHM}i3eH8&nT8w4slq+~Ajtmk zR=^RZ#7m~#f@o{2ekrkjU?wm)`4dA58IpC|&By!99w6cc5Y7i|;(655!rp`%0Y!CS znNzDRpNQ5C2LXkSz)Vn~CiDdrbqblRb;bSjK(maq#Q2xs3Nk-HrcANu5r=>DSWym3Fa@2$pB2mS4I6x%20Jn&wrHpu{As0zQJt9Lnl9a>^ufYLw zM4wc^=t)aEJc%91A^QK>EFo%eUjq&hiASsoM)Yx%LnP{B7fR7IoQ)7%Q6rM@4wzo;__3vhJ&8{nj)I!+nka%T5R7VQ1l`5(pf)o&~w-upKlZM2O zNDq`MMM{YxCAsQso zEpF7}MuIn46ePiUF5*R54}N=s1EoX3ZKg(g${|lrB=^|zTqZBXgd6lMRp3G}+O3fK z)_9jr=`1+is;2Obcft}9!$pYnkqubyZg`!_M<*7X5-j%U@I_Ca-iP}=diE!7EcPo( zFYw|TJiU+;NJ>x6*JOj(pWZA>t(VZ>UfTBE?!;-2evhj5;1Bo@=mlIce3~hDnr)=P zy9~4Q+NrzveZzD3t= zbPn!$3>t*rJ^k?$_wA<%d zggfkq)YoA|TnKL)vcV5kK|n7tNG$uvA+3AoD+2HU$nn0N*G_Fn1FBO(L7fRl4gU zFKNOtM4eugiHSX_(xTg$y<`@yA=-e4=+{qb$W^-Sq5%2@Ln_7Hhoap?O|+`8Imc}Za0_kf@E|Fh^ZG|zqt$l(>WIMC$7Cp zOvMl)m+ymH5tSNnEUMC%;jf6AeI4gPA$u(+QSJ_~Xa^qWE#^flEP{3sZDQyxh`l$A 
zmf=EF&91tU2lmuZbtB^~@yKk98C`a(y3YR6cW83}J0L8#Tx%bmn-(k5Q8*VnC8f(J
z=jXS6jveoUf_4Sl_o0@NmwEPO(4Mj6O^!BU3ehe{yHG_W8odb*5Q#y-1aV6?d2^dE
zg{a6;5vp+u*5V2@;

diff --git a/tools/llvm-mt/Opts.td b/tools/llvm-mt/Opts.td
new file mode 100644
--- /dev/null
+++ b/tools/llvm-mt/Opts.td
+include "llvm/Option/OptParser.td"
+
+def unsupported : OptionGroup<"unsupported">;
+
+def manifest : Separate<["/", "-"], "manifest">, HelpText<"Used to specify each manifest that needs to be processed">, MetaVarName<"manifest">;
+def identity : Joined<["/", "-"], "identity:">, HelpText<"Not supported">, MetaVarName<"identity">, Group<unsupported>;
+def rgs : Joined<["/", "-"], "rgs:">, HelpText<"Not supported">, MetaVarName<"script">, Group<unsupported>;
+def tlb : Joined<["/", "-"], "tlb:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def dll : Joined<["/", "-"], "dll:">, HelpText<"Not supported">, MetaVarName<"dll">, Group<unsupported>;
+def replacements : Joined<["/", "-"], "replacements:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def managed_assembly_name : Joined<["/", "-"], "managedassemblyname:">, HelpText<"Not supported">, MetaVarName<"assembly">, Group<unsupported>;
+def no_dependency : Flag<["/", "-"], "nodependency">, HelpText<"Not supported">, Group<unsupported>;
+def category : Flag<["/", "-"], "category">, HelpText<"Not supported">, Group<unsupported>;
+def no_logo : Flag<["/", "-"], "nologo">, HelpText<"No effect as this tool never writes copyright data. Included for parity">;
+def out : Joined<["/", "-"], "out:">, HelpText<"Name of the output manifest. If this is skipped and only one manifest is being operated upon by the tool, that manifest is modified in place">, MetaVarName<"manifest">;
+def input_resource : Joined<["/", "-"], "inputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def output_resource : Joined<["/", "-"], "outputresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def output_resource_flag : Flag<["/", "-"], "outputresource">, Alias<output_resource>, HelpText<"Not supported">, Group<unsupported>;
+def update_resource : Joined<["/", "-"], "updateresource:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def hash_update : Joined<["/", "-"], "hashupdate:">, HelpText<"Not supported">, MetaVarName<"file">, Group<unsupported>;
+def hash_update_flag : Flag<["/", "-"], "hashupdate">, Alias<hash_update>, HelpText<"Not supported">, Group<unsupported>;
+def validate_manifest : Flag<["/", "-"], "validate_manifest">, HelpText<"Not supported">, Group<unsupported>;
+def validate_file_hashes : Joined<["/", "-"], "validate_file_hashes:">, HelpText<"Not supported">, MetaVarName<"">, Group<unsupported>;
+def canonicalize : Flag<["/", "-"], "canonicalize:">, HelpText<"Not supported">, Group<unsupported>;
+def check_for_duplicates : Flag<["/", "-"], "check_for_duplicates:">, HelpText<"Not supported">, Group<unsupported>;
+def make_cdfs : Flag<["/", "-"], "makecdfs:">, HelpText<"Not supported">, Group<unsupported>;
+def verbose : Flag<["/", "-"], "verbose">, HelpText<"Not supported">, Group<unsupported>;
+def help : Flag<["/", "-"], "?">;
+def help_long : Flag<["/", "-"], "help">, Alias<help>;
+def h : Flag<["/", "-"], "h">, Alias<help>;
diff --git a/tools/llvm-mt/llvm-mt.cpp b/tools/llvm-mt/llvm-mt.cpp
new file mode 100644
index 000000000000..05c9238c7c64
--- /dev/null
+++ b/tools/llvm-mt/llvm-mt.cpp
@@ -0,0 +1,117 @@
+//===- llvm-mt.cpp - Merge .manifest files ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// Merge .manifest files. This is intended to be a platform-independent port
+// of Microsoft's mt.exe.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  OPT_##ID,
+#include "Opts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
+#include "Opts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info InfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+{                                                                              \
+      PREFIX,      NAME,      HELPTEXT,                                        \
+      METAVAR,     OPT_##ID,  opt::Option::KIND##Class,                        \
+      PARAM,       FLAGS,     OPT_##GROUP,                                     \
+      OPT_##ALIAS, ALIASARGS, VALUES},
+#include "Opts.inc"
+#undef OPTION
+};
+
+class CvtResOptTable : public opt::OptTable {
+public:
+  CvtResOptTable() : OptTable(InfoTable, true) {}
+};
+
+static ExitOnError ExitOnErr;
+} // namespace
+
+LLVM_ATTRIBUTE_NORETURN void reportError(Twine Msg) {
+  errs() << "llvm-mt error: " << Msg << "\n";
+  exit(1);
+}
+
+int main(int argc, const char **argv) {
+  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  PrettyStackTraceProgram X(argc, argv);
+
+  ExitOnErr.setBanner("llvm-mt: ");
+
+  SmallVector<const char *, 256> argv_buf;
+  SpecificBumpPtrAllocator<char> ArgAllocator;
+  ExitOnErr(errorCodeToError(sys::Process::GetArgumentVector(
+      argv_buf, makeArrayRef(argv, argc), ArgAllocator)));
+
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  CvtResOptTable T;
+  unsigned MAI, MAC;
+  ArrayRef<const char *> ArgsArr = makeArrayRef(argv + 1, argc - 1);
+  opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);
+
+  for (auto &Arg : InputArgs) {
+    if (Arg->getOption().matches(OPT_unsupported)) {
+      outs() << "llvm-mt: ignoring unsupported '" << Arg->getOption().getName()
+             << "' option\n";
+    }
+  }
+
+  if (InputArgs.hasArg(OPT_help)) {
+    T.PrintHelp(outs(), "mt", "Manifest Tool", false);
+    return 0;
+  }
+
+  std::vector<std::string> InputFiles = InputArgs.getAllArgValues(OPT_manifest);
+
+  if (InputFiles.size() == 0) {
+    reportError("no input file specified");
+  }
+
+  StringRef OutputFile;
+
+  if (InputArgs.hasArg(OPT_out)) {
+    OutputFile = InputArgs.getLastArgValue(OPT_out);
+  } else if (InputFiles.size() == 1) {
+    OutputFile = InputFiles[0];
+  } else {
+    reportError("no output file specified");
+  }
+
+  return 0;
+}
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 812f1af3ac68..d54b45515f05 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -870,7 +870,10 @@ static void printRelocationTargetName(const MachOObjectFile *O,
   bool isExtern = O->getPlainRelocationExternal(RE);
   uint64_t Val = O->getPlainRelocationSymbolNum(RE);
 
-  if (isExtern) {
+  if (O->getAnyRelocationType(RE) == MachO::ARM64_RELOC_ADDEND) {
+    fmt << format("0x%x", Val);
+    return;
+  } else if (isExtern) {
     symbol_iterator SI = O->symbol_begin();
     advance(SI, Val);
     Expected<StringRef> SOrErr = SI->getName();
diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp
index 0642d841fd9f..01c7481c3086 100644
--- a/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -654,7 +654,7 @@ static void dumpFullTypeStream(LinePrinter &Printer,
       NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords());
 
   MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types,
-                           Stream.getHashValues());
+                           Stream.getNumHashBuckets(), Stream.getHashValues());
 
   if (auto EC = codeview::visitTypeStream(Types, V)) {
     Printer.formatLine("An error occurred dumping type records: {0}",
@@ -670,7 +670,7 @@ static void dumpPartialTypeStream(LinePrinter &Printer,
       NumDigits(TypeIndex::FirstNonSimpleIndex + Stream.getNumTypeRecords());
 
   MinimalTypeDumpVisitor V(Printer, Width + 2, Bytes, Extras, Types,
-                           Stream.getHashValues());
+                           Stream.getNumHashBuckets(), Stream.getHashValues());
 
   if (opts::dump::DumpTypeDependents) {
     // If we need to dump all dependents, then iterate each index and find
diff --git a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
index ab7045ca4492..d93843649db0 100644
--- a/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
+++ b/tools/llvm-pdbutil/MinimalSymbolDumper.cpp
@@ -725,8 +725,9 @@ Error MinimalSymbolDumper::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) {
                Proc.Parent, Proc.End,
                formatSegmentOffset(Proc.Segment, Proc.CodeOffset),
                Proc.CodeSize);
-  P.formatLine("debug start = {0}, debug end = {1}, flags = {2}", Proc.DbgStart,
-               Proc.DbgEnd,
+  // FIXME: It seems FunctionType is sometimes an id and sometimes a type.
+  P.formatLine("type = `{0}`, debug start = {1}, debug end = {2}, flags = {3}",
+               typeIndex(Proc.FunctionType), Proc.DbgStart, Proc.DbgEnd,
                formatProcSymFlags(P.getIndentLevel() + 9, Proc.Flags));
   return Error::success();
 }
diff --git a/tools/llvm-pdbutil/MinimalTypeDumper.cpp b/tools/llvm-pdbutil/MinimalTypeDumper.cpp
index 9621320ea99a..0079b9e7eaa4 100644
--- a/tools/llvm-pdbutil/MinimalTypeDumper.cpp
+++ b/tools/llvm-pdbutil/MinimalTypeDumper.cpp
@@ -18,6 +18,7 @@
 #include "llvm/DebugInfo/CodeView/Formatters.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -214,10 +215,20 @@ Error MinimalTypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
                  getLeafTypeName(Record.Type), Record.length());
   } else {
     std::string H;
-    if (Index.toArrayIndex() >= HashValues.size())
+    if (Index.toArrayIndex() >= HashValues.size()) {
       H = "(not present)";
-    else
-      H = utostr(HashValues[Index.toArrayIndex()]);
+    } else {
+      uint32_t Hash = HashValues[Index.toArrayIndex()];
+      Expected<uint32_t> MaybeHash = hashTypeRecord(Record);
+      if (!MaybeHash)
+        return MaybeHash.takeError();
+      // A stored TPI hash is the record hash reduced modulo the bucket
+      // count, so reduce ours the same way before comparing.
+      uint32_t OurHash = *MaybeHash;
+      OurHash %= NumHashBuckets;
+      if (Hash == OurHash)
+        H = "0x" + utohexstr(Hash);
+      else
+        H = "0x" + utohexstr(Hash) + ", our hash = 0x" + utohexstr(OurHash);
+    }
     P.formatLine("{0} | {1} [size = {2}, hash = {3}]",
                  fmt_align(Index, AlignStyle::Right, Width),
                  getLeafTypeName(Record.Type), Record.length(), H);
@@ -395,8 +406,7 @@ Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
 
 Error MinimalTypeDumpVisitor::visitKnownRecord(CVType &CVR,
                                                TypeServer2Record &TS) {
-  P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age,
-               fmt_guid(TS.Guid));
+  P.formatLine("name = {0}, age = {1}, guid = {2}", TS.Name, TS.Age, TS.Guid);
   return Error::success();
 }
 
diff --git a/tools/llvm-pdbutil/MinimalTypeDumper.h b/tools/llvm-pdbutil/MinimalTypeDumper.h
index 42882b4b4060..4227688f0f71 100644
--- a/tools/llvm-pdbutil/MinimalTypeDumper.h
+++ b/tools/llvm-pdbutil/MinimalTypeDumper.h
@@ -25,9 +25,10 @@ class MinimalTypeDumpVisitor : public codeview::TypeVisitorCallbacks {
 public:
   MinimalTypeDumpVisitor(LinePrinter &P, uint32_t Width, bool RecordBytes,
                          bool Hashes, codeview::LazyRandomTypeCollection &Types,
+                         uint32_t NumHashBuckets,
                          FixedStreamArray<support::ulittle32_t> HashValues)
       : P(P), Width(Width), RecordBytes(RecordBytes), Hashes(Hashes),
-        Types(Types), HashValues(HashValues) {}
+        Types(Types), NumHashBuckets(NumHashBuckets), HashValues(HashValues) {}
 
   Error visitTypeBegin(codeview::CVType &Record,
                        codeview::TypeIndex Index) override;
@@ -53,6 +54,7 @@ class MinimalTypeDumpVisitor : public codeview::TypeVisitorCallbacks {
   bool RecordBytes = false;
   bool Hashes = false;
   codeview::LazyRandomTypeCollection &Types;
+  uint32_t NumHashBuckets;
   FixedStreamArray<support::ulittle32_t> HashValues;
 };
 } // namespace pdb
diff --git a/tools/llvm-pdbutil/PdbYaml.cpp b/tools/llvm-pdbutil/PdbYaml.cpp
index 315ae2e6711f..9c3beb566d2c 100644
--- a/tools/llvm-pdbutil/PdbYaml.cpp
+++ b/tools/llvm-pdbutil/PdbYaml.cpp
@@ -38,41 +38,6 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::pdb::PdbRaw_FeatureSig)
 namespace llvm {
 namespace yaml {
 
-template <> struct ScalarTraits<llvm::pdb::PDB_UniqueId> {
-  static void output(const llvm::pdb::PDB_UniqueId &S, void *,
-                     llvm::raw_ostream &OS) {
-    OS << S;
-  }
-
-  static StringRef input(StringRef Scalar, void *Ctx,
-                         llvm::pdb::PDB_UniqueId &S) {
-    if (Scalar.size() != 38)
-      return "GUID strings are 38 characters long";
-    if (Scalar[0] != '{' || Scalar[37] != '}')
-      return "GUID is not enclosed in {}";
-    if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' ||
-        Scalar[24] != '-')
-      return "GUID sections are not properly delineated with dashes";
-
-    uint8_t *OutBuffer = S.Guid;
-    for (auto Iter = Scalar.begin(); Iter != Scalar.end();) {
-      if (*Iter == '-' || *Iter == '{' || *Iter == '}') {
-        ++Iter;
-        continue;
-      }
-      uint8_t Value = (llvm::hexDigitValue(*Iter) << 4);
-      ++Iter;
-      Value |= llvm::hexDigitValue(*Iter);
-      ++Iter;
-      *OutBuffer++ = Value;
-    }
-
-    return "";
-  }
-
-  static bool mustQuote(StringRef Scalar) { return needsQuotes(Scalar); }
-};
-
 template <> struct ScalarEnumerationTraits<llvm::pdb::PDB_Machine> {
   static void enumeration(IO &io, llvm::pdb::PDB_Machine &Value) {
     io.enumCase(Value, "Invalid", PDB_Machine::Invalid);
diff --git a/tools/llvm-pdbutil/PdbYaml.h b/tools/llvm-pdbutil/PdbYaml.h
index 62ed608916fc..91e054490a5f 100644
--- a/tools/llvm-pdbutil/PdbYaml.h
+++ b/tools/llvm-pdbutil/PdbYaml.h
@@ -57,7 +57,7 @@ struct PdbInfoStream {
   PdbRaw_ImplVer Version = PdbImplVC70;
   uint32_t Signature = 0;
   uint32_t Age = 1;
-  PDB_UniqueId Guid;
+  codeview::GUID Guid;
   std::vector<PdbRaw_FeatureSig> Features;
   std::vector<NamedStreamMapping> NamedStreams;
 };
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 6aa08ff3cd87..f2bd194622ed 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -956,8 +956,8 @@ static void mergePdbs() {
     SmallVector<TypeIndex, 128> IdMap;
     if (File.hasPDBTpiStream()) {
       auto &Tpi = ExitOnErr(File.getPDBTpiStream());
-      ExitOnErr(codeview::mergeTypeRecords(MergedTpi, TypeMap, nullptr,
-                                           Tpi.typeArray()));
+      ExitOnErr(
+          codeview::mergeTypeRecords(MergedTpi, TypeMap, Tpi.typeArray()));
     }
     if (File.hasPDBIpiStream()) {
       auto &Ipi = ExitOnErr(File.getPDBIpiStream());
diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt
index bde486a5f0db..f5b1a5b256ad 100644
--- a/tools/llvm-readobj/CMakeLists.txt
+++ b/tools/llvm-readobj/CMakeLists.txt
@@ -20,3 +20,5 @@ add_llvm_tool(llvm-readobj
   WasmDumper.cpp
   Win64EHDumper.cpp
   )
+
+add_llvm_tool_symlink(llvm-readelf llvm-readobj)
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 9fb3267e2f9d..74c44116b127 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -1215,8 +1215,7 @@ void COFFDumper::mergeCodeViewTypes(TypeTableBuilder &CVIDs,
         error(object_error::parse_failed);
       }
       SmallVector<TypeIndex, 128> SourceToDest;
-      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, nullptr,
-                                          Types))
+      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types))
         return error(std::move(EC));
     }
 }
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index a1db96cba081..5698420bbcc2 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -1532,6 +1532,7 @@ static const char *getTypeString(unsigned Arch, uint64_t Type) {
   LLVM_READOBJ_TYPE_CASE(TLSDESC_PLT);
   LLVM_READOBJ_TYPE_CASE(TLSDESC_GOT);
   LLVM_READOBJ_TYPE_CASE(AUXILIARY);
+  LLVM_READOBJ_TYPE_CASE(FILTER);
   default: return "unknown";
   }
 }
@@ -1624,6 +1625,10 @@ StringRef ELFDumper<ELFT>::getDynamicString(uint64_t Value) const {
   return StringRef(DynamicStringTable.data() + Value);
 }
 
+static void printLibrary(raw_ostream &OS, const Twine &Tag, const Twine &Name) {
+  OS << Tag << ": [" << Name << "]";
+}
+
 template <class ELFT>
 void ELFDumper<ELFT>::printValue(uint64_t Type, uint64_t Value) {
   raw_ostream &OS = W.getOStream();
@@ -1687,13 +1692,16 @@ void ELFDumper<ELFT>::printValue(uint64_t Type, uint64_t Value) {
     OS << Value << " (bytes)";
     break;
   case DT_NEEDED:
-    OS << "SharedLibrary (" << getDynamicString(Value) << ")";
+    printLibrary(OS, "Shared library", getDynamicString(Value));
     break;
   case DT_SONAME:
-    OS << "LibrarySoname (" << getDynamicString(Value) << ")";
+    printLibrary(OS, "Library soname", getDynamicString(Value));
    break;
   case DT_AUXILIARY:
-    OS << "Auxiliary library: [" << getDynamicString(Value) << "]";
+    printLibrary(OS, "Auxiliary library", getDynamicString(Value));
+    break;
+  case DT_FILTER:
+    printLibrary(OS, "Filter library", getDynamicString(Value));
     break;
   case DT_RPATH:
   case DT_RUNPATH:
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 51991a3f067b..7bfb18fab12b 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/Signals.h"
@@ -50,6 +51,13 @@ namespace opts {
                                      cl::desc("<input object files>"),
                                      cl::ZeroOrMore);
 
+  // -wide, -W
+  cl::opt<bool> WideOutput("wide",
+    cl::desc("Ignored for compatibility with GNU readelf"));
+  cl::alias WideOutputShort("W",
+    cl::desc("Alias for --wide"),
+    cl::aliasopt(WideOutput));
+
   // -file-headers, -h
   cl::opt<bool> FileHeaders("file-headers",
     cl::desc("Display file headers "));
@@ -57,12 +65,16 @@ namespace opts {
     cl::desc("Alias for --file-headers"),
     cl::aliasopt(FileHeaders));
 
-  // -sections, -s
+  // -sections, -s, -S
+  // Note: In GNU readelf, -s means --symbols!
   cl::opt<bool> Sections("sections",
     cl::desc("Display all sections."));
   cl::alias SectionsShort("s",
     cl::desc("Alias for --sections"),
     cl::aliasopt(Sections));
+  cl::alias SectionsShortUpper("S",
+    cl::desc("Alias for --sections"),
+    cl::aliasopt(Sections));
 
   // -section-relocations, -sr
   cl::opt<bool> SectionRelocations("section-relocations",
@@ -533,13 +545,19 @@ static void dumpInput(StringRef File) {
 }
 
 int main(int argc, const char *argv[]) {
-  sys::PrintStackTraceOnErrorSignal(argv[0]);
+  StringRef ToolName = argv[0];
+  sys::PrintStackTraceOnErrorSignal(ToolName);
   PrettyStackTraceProgram X(argc, argv);
   llvm_shutdown_obj Y;
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
 
+  opts::WideOutput.setHiddenFlag(cl::Hidden);
+
+  if (sys::path::stem(ToolName).find("readelf") != StringRef::npos)
+    opts::Output = opts::GNU;
+
   cl::ParseCommandLineOptions(argc, argv, "LLVM Object Reader\n");
 
   // Default to stdin if no filename is specified.
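
The busybox-style dispatch in the hunk above is worth noting: because CMake now
creates a llvm-readelf symlink to llvm-readobj, the same binary inspects argv[0]
and flips the default output style to GNU before option parsing. A minimal
standalone sketch of that pattern follows (illustration only, not the patch's
code; OutputStyle and styleFromToolName are invented names for the example):

    #include <string>

    // Pick an output style from the name the tool was invoked under (argv[0]).
    enum class OutputStyle { LLVM, GNU };

    static OutputStyle styleFromToolName(const std::string &Argv0) {
      // Simplified analogue of sys::path::stem(ToolName).find("readelf"):
      // take the basename and look for the "readelf" substring, so both
      // "llvm-readelf" and versioned names like "llvm-readelf-5.0" match.
      std::string Stem = Argv0.substr(Argv0.find_last_of("/\\") + 1);
      return Stem.find("readelf") != std::string::npos ? OutputStyle::GNU
                                                       : OutputStyle::LLVM;
    }
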
diff --git a/tools/opt-viewer/opt-diff.py b/tools/opt-viewer/opt-diff.py index 9e921f8488d3..f39432c8bc9d 100755 --- a/tools/opt-viewer/opt-diff.py +++ b/tools/opt-viewer/opt-diff.py @@ -20,24 +20,17 @@ import argparse from collections import defaultdict from multiprocessing import cpu_count, Pool -import os, os.path -import fnmatch - -def find_files(dir_or_file): - if os.path.isfile(dir_or_file): - return [dir_or_file] - - all = [] - for dir, subdirs, files in os.walk(dir_or_file): - for file in files: - if fnmatch.fnmatch(file, "*.opt.yaml"): - all.append( os.path.join(dir, file)) - return all if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_dir_or_file_1') - parser.add_argument('yaml_dir_or_file_2') + parser.add_argument( + 'yaml_dir_or_file_1', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the old version for the comparison') + parser.add_argument( + 'yaml_dir_or_file_2', + help='An optimization record file or a directory searched for optimization ' + 'record files that are used as the new version for the comparison') parser.add_argument( '--jobs', '-j', @@ -53,8 +46,8 @@ def find_files(dir_or_file): parser.add_argument('--output', '-o', default='diff.opt.yaml') args = parser.parse_args() - files1 = find_files(args.yaml_dir_or_file_1) - files2 = find_files(args.yaml_dir_or_file_2) + files1 = optrecord.find_opt_files([args.yaml_dir_or_file_1]) + files2 = optrecord.find_opt_files([args.yaml_dir_or_file_2]) print_progress = not args.no_progress_indicator all_remarks1, _, _ = optrecord.gather_results(files1, args.jobs, print_progress) diff --git a/tools/opt-viewer/opt-stats.py b/tools/opt-viewer/opt-stats.py index a7e598fdfd02..205b08ba8a74 100755 --- a/tools/opt-viewer/opt-stats.py +++ b/tools/opt-viewer/opt-stats.py @@ -15,7 +15,11 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--jobs', '-j', @@ -31,8 +35,14 @@ args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, _ = optrecord.gather_results( - args.yaml_files, args.jobs, print_progress) + files, args.jobs, print_progress) if print_progress: print('\n') diff --git a/tools/opt-viewer/opt-viewer.py b/tools/opt-viewer/opt-viewer.py index e6dd6a0286fe..69bcaedb7669 100755 --- a/tools/opt-viewer/opt-viewer.py +++ b/tools/opt-viewer/opt-viewer.py @@ -219,7 +219,11 @@ def generate_report(all_remarks, if __name__ == '__main__': parser = argparse.ArgumentParser(description=desc) - parser.add_argument('yaml_files', nargs='+') + parser.add_argument( + 'yaml_dirs_or_files', + nargs='+', + help='List of optimization record files or directories searched ' + 'for optimization record files.') parser.add_argument( '--output-dir', '-o', @@ -248,8 +252,14 @@ def generate_report(all_remarks, args = parser.parse_args() print_progress = not args.no_progress_indicator + + files = optrecord.find_opt_files(args.yaml_dirs_or_files) + if not files: + parser.error("No *.opt.yaml files found") + sys.exit(1) + all_remarks, file_remarks, should_display_hotness = \ - 
optrecord.gather_results(args.yaml_files, args.jobs, print_progress) + optrecord.gather_results(files, args.jobs, print_progress) map_remarks(all_remarks) diff --git a/tools/opt-viewer/optpmap.py b/tools/opt-viewer/optpmap.py index 01e848e03976..16cb22e21491 100644 --- a/tools/opt-viewer/optpmap.py +++ b/tools/opt-viewer/optpmap.py @@ -20,6 +20,7 @@ def _wrapped_func(func_and_args): with _current.get_lock(): _current.value += 1 sys.stdout.write('\r\t{} of {}'.format(_current.value, _total.value)) + sys.stdout.flush() return func(argument) diff --git a/tools/opt-viewer/optrecord.py b/tools/opt-viewer/optrecord.py index 61ed9626cffa..4599e12d7e68 100644 --- a/tools/opt-viewer/optrecord.py +++ b/tools/opt-viewer/optrecord.py @@ -12,8 +12,10 @@ import cgi from collections import defaultdict +import fnmatch import functools from multiprocessing import Lock +import os, os.path import subprocess import optpmap @@ -47,7 +49,7 @@ def demangle(name): def html_file_name(filename): - return filename.replace('/', '_') + ".html" + return filename.replace('/', '_').replace('#', '_') + ".html" def make_link(File, Line): @@ -233,3 +235,19 @@ def merge_file_remarks(file_remarks_job, all_remarks, merged): all_remarks.update(all_remarks_job) return all_remarks, file_remarks, max_hotness != 0 + + +def find_opt_files(dirs_or_files): + all = [] + for dir_or_file in dirs_or_files: + if os.path.isfile(dir_or_file): + all.append(dir_or_file) + else: + for dir, subdirs, files in os.walk(dir_or_file): + # Exclude mounted directories and symlinks (os.walk default). + subdirs[:] = [d for d in subdirs + if not os.path.ismount(os.path.join(dir, d))] + for file in files: + if fnmatch.fnmatch(file, "*.opt.yaml"): + all.append(os.path.join(dir, file)) + return all diff --git a/unittests/Analysis/CGSCCPassManagerTest.cpp b/unittests/Analysis/CGSCCPassManagerTest.cpp index d46d9535fa4b..e24818265975 100644 --- a/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -9,6 +9,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" @@ -227,6 +228,7 @@ class CGSCCPassManagerTest : public ::testing::Test { "entry:\n" " ret void\n" "}\n")) { + MAM.registerPass([&] { return TargetLibraryAnalysis(); }); MAM.registerPass([&] { return LazyCallGraphAnalysis(); }); MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); diff --git a/unittests/Analysis/LazyCallGraphTest.cpp b/unittests/Analysis/LazyCallGraphTest.cpp index 65730486cd75..9e7e128bcfb3 100644 --- a/unittests/Analysis/LazyCallGraphTest.cpp +++ b/unittests/Analysis/LazyCallGraphTest.cpp @@ -216,10 +216,17 @@ static const char DiamondOfTrianglesRefGraph[] = " ret void\n" "}\n"; +static LazyCallGraph buildCG(Module &M) { + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + LazyCallGraph CG(M, TLI); + return CG; +} + TEST(LazyCallGraphTest, BasicGraphFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // The order of the entry nodes should be stable w.r.t. 
the source order of // the IR, and everything in our module is an entry node, so just directly @@ -407,7 +414,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); LazyCallGraph::Node &A = CG.get(lookupFunction(*M, "a")); LazyCallGraph::Node &B = CG.get(lookupFunction(*M, "b")); @@ -445,7 +452,7 @@ TEST(LazyCallGraphTest, BasicGraphMutation) { TEST(LazyCallGraphTest, InnerSCCFormation) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Now mutate the graph to connect every node into a single RefSCC to ensure // that our inner SCC formation handles the rest. @@ -542,7 +549,7 @@ TEST(LazyCallGraphTest, MultiArmSCC) { " call void @f1()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -593,7 +600,7 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -739,7 +746,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -831,7 +838,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) { // references rather than calls. std::unique_ptr M = parseAssembly(Context, DiamondOfTrianglesRefGraph); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -938,7 +945,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeCallCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1015,7 +1022,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1077,7 +1084,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) { // a3--a2 | // std::unique_ptr M = parseAssembly(Context, DiamondOfTriangles); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1221,7 +1228,7 @@ TEST(LazyCallGraphTest, InternalEdgeMutation) { " call void @a()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1315,7 +1322,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1390,7 +1397,7 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1467,7 +1474,7 @@ TEST(LazyCallGraphTest, InternalCallEdgeToRef) { " call void @c()\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. 
CG.buildRefSCCs(); @@ -1560,7 +1567,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1672,7 +1679,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1802,7 +1809,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) { " store void()* @a, void()** undef\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -1885,7 +1892,7 @@ TEST(LazyCallGraphTest, HandleBlockAddress) { " store i8* blockaddress(@f, %bb), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); CG.buildRefSCCs(); auto I = CG.postorder_ref_scc_begin(); @@ -1933,7 +1940,7 @@ TEST(LazyCallGraphTest, ReplaceNodeFunction) { " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Force the graph to be fully expanded. CG.buildRefSCCs(); @@ -2011,7 +2018,7 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpurriousRef) { "entry:\n" " ret void\n" "}\n"); - LazyCallGraph CG(*M); + LazyCallGraph CG = buildCG(*M); // Insert spurious ref edges. LazyCallGraph::Node &AN = CG.get(lookupFunction(*M, "a")); diff --git a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp index 92134513b75b..04b7bb0ba936 100644 --- a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp +++ b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp @@ -13,7 +13,6 @@ #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" #include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" #include "llvm/DebugInfo/PDB/Native/RawTypes.h" @@ -402,4 +401,4 @@ TEST_F(RandomAccessVisitorTest, CrossChunkName) { StringRef Name = Types.getTypeName(IndexOne); EXPECT_EQ("const FooClass", Name); -} \ No newline at end of file +} diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt index 989cb396f674..583b065f464c 100644 --- a/unittests/DebugInfo/PDB/CMakeLists.txt +++ b/unittests/DebugInfo/PDB/CMakeLists.txt @@ -10,11 +10,10 @@ set(DebugInfoPDBSources StringTableBuilderTest.cpp MSFBuilderTest.cpp PDBApiTest.cpp - TypeServerHandlerTest.cpp ) add_llvm_unittest(DebugInfoPDBTests ${DebugInfoPDBSources} ) -target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) \ No newline at end of file +target_link_libraries(DebugInfoPDBTests LLVMTestingSupport) diff --git a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp b/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp deleted file mode 100644 index d09b9130ee27..000000000000 --- a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp +++ /dev/null @@ -1,183 +0,0 @@ -//===- llvm/unittest/DebugInfo/PDB/TypeServerHandlerTest.cpp --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" -#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" -#include "llvm/DebugInfo/CodeView/TypeRecord.h" -#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h" -#include "llvm/DebugInfo/CodeView/TypeSerializer.h" -#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" -#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" -#include "llvm/DebugInfo/PDB/Native/RawTypes.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Error.h" -#include "llvm/Testing/Support/Error.h" - -#include "gtest/gtest.h" - -using namespace llvm; -using namespace llvm::codeview; -using namespace llvm::pdb; - -namespace { - -constexpr uint8_t Guid[] = {0x2a, 0x2c, 0x1c, 0x2a, 0xcb, 0x9e, 0x48, 0x18, - 0x82, 0x82, 0x7a, 0x87, 0xc3, 0xfe, 0x16, 0xe8}; -StringRef GuidStr(reinterpret_cast(Guid), - llvm::array_lengthof(Guid)); - -constexpr const char *Name = "Test Name"; -constexpr int Age = 1; - -class MockTypeServerHandler : public TypeServerHandler { -public: - explicit MockTypeServerHandler(bool HandleAlways) - : HandleAlways(HandleAlways) {} - - Expected handle(TypeServer2Record &TS, - TypeVisitorCallbacks &Callbacks) override { - if (TS.Age != Age || TS.Guid != GuidStr || TS.Name != Name) - return make_error(cv_error_code::corrupt_record, - "Invalid TypeServer record!"); - - if (Handled && !HandleAlways) - return false; - - Handled = true; - return true; - } - - bool Handled = false; - bool HandleAlways; -}; - -class MockTypeVisitorCallbacks : public TypeVisitorCallbacks { -public: - enum class State { - Ready, - VisitTypeBegin, - VisitKnownRecord, - VisitTypeEnd, - }; - Error visitTypeBegin(CVType &CVT) override { - if (S != State::Ready) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeBegin; - return Error::success(); - } - - Error visitKnownRecord(CVType &CVT, TypeServer2Record &TS) override { - if (S != State::VisitTypeBegin) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitKnownRecord; - return Error::success(); - } - - Error visitTypeEnd(CVType &CVT) override { - if (S != State::VisitKnownRecord) - return make_error(cv_error_code::unspecified, - "Invalid visitor state!"); - - S = State::VisitTypeEnd; - return Error::success(); - } - - State S = State::Ready; -}; - -class TypeServerHandlerTest : public testing::Test { -public: - void SetUp() override { - TypeServer2Record R(TypeRecordKind::TypeServer2); - R.Age = Age; - R.Guid = GuidStr; - R.Name = Name; - - TypeTableBuilder Builder(Allocator); - Builder.writeKnownType(R); - TypeServerRecord.RecordData = Builder.records().front(); - TypeServerRecord.Type = TypeLeafKind::LF_TYPESERVER2; - } - -protected: - BumpPtrAllocator Allocator; - CVType TypeServerRecord; -}; - -// Test that when no type server handler is registered, it gets handled by the -// normal -// visitor callbacks. 
-TEST_F(TypeServerHandlerTest, VisitRecordNoTypeServer) { - MockTypeVisitorCallbacks C2; - MockTypeVisitorCallbacks C1; - TypeVisitorCallbackPipeline Pipeline; - - Pipeline.addCallbackToPipeline(C1); - Pipeline.addCallbackToPipeline(C2); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, Pipeline), - Succeeded()); - - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C2.S); -} - -// Test that when a TypeServerHandler is registered, it gets consumed by the -// handler if and only if the handler returns true. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerOnce) { - MockTypeServerHandler Handler(false); - - MockTypeVisitorCallbacks C1; - - // Our mock server returns true the first time. - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - // And false the second time. - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, C1.S); -} - -// Test that when a type server handler is registered, if the handler keeps -// returning true, it will keep getting consumed by the handler and not go -// to the default processor. -TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerAlways) { - MockTypeServerHandler Handler(true); - - MockTypeVisitorCallbacks C1; - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); - - EXPECT_THAT_ERROR(codeview::visitTypeRecord(TypeServerRecord, C1, - codeview::VDS_BytesExternal, - &Handler), - Succeeded()); - EXPECT_TRUE(Handler.Handled); - EXPECT_EQ(MockTypeVisitorCallbacks::State::Ready, C1.S); -} - -} // end anonymous namespace diff --git a/unittests/IR/CFGBuilder.cpp b/unittests/IR/CFGBuilder.cpp new file mode 100644 index 000000000000..50494ab5c7ca --- /dev/null +++ b/unittests/IR/CFGBuilder.cpp @@ -0,0 +1,269 @@ +//===- llvm/Testing/Support/CFGBuilder.cpp --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "CFGBuilder.h" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" + +#define DEBUG_TYPE "cfg-builder" + +using namespace llvm; + +CFGHolder::CFGHolder(StringRef ModuleName, StringRef FunctionName) + : Context(llvm::make_unique()), + M(llvm::make_unique(ModuleName, *Context)) { + FunctionType *FTy = TypeBuilder::get(*Context); + F = cast(M->getOrInsertFunction(FunctionName, FTy)); +} +CFGHolder::~CFGHolder() = default; + +CFGBuilder::CFGBuilder(Function *F, const std::vector &InitialArcs, + std::vector Updates) + : F(F), Updates(std::move(Updates)) { + assert(F); + buildCFG(InitialArcs); +} + +static void ConnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Creating BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + auto *IntTy = IntegerType::get(From->getContext(), 32); + + if (isa(From->getTerminator())) + From->getTerminator()->eraseFromParent(); + if (!From->getTerminator()) { + IRBuilder<> IRB(From); + IRB.CreateSwitch(ConstantInt::get(IntTy, 0), To); + return; + } + + SwitchInst *SI = cast(From->getTerminator()); + const auto Last = SI->getNumCases(); + + auto *IntVal = ConstantInt::get(IntTy, Last); + SI->addCase(IntVal, To); +} + +static void DisconnectBlocks(BasicBlock *From, BasicBlock *To) { + DEBUG(dbgs() << "Deleting BB arc " << From->getName() << " -> " + << To->getName() << "\n"; + dbgs().flush()); + SwitchInst *SI = cast(From->getTerminator()); + + if (SI->getNumCases() == 0) { + SI->eraseFromParent(); + IRBuilder<> IRB(From); + IRB.CreateUnreachable(); + return; + } + + if (SI->getDefaultDest() == To) { + auto FirstC = SI->case_begin(); + SI->setDefaultDest(FirstC->getCaseSuccessor()); + SI->removeCase(FirstC); + return; + } + + for (auto CIt = SI->case_begin(); CIt != SI->case_end(); ++CIt) + if (CIt->getCaseSuccessor() == To) { + SI->removeCase(CIt); + return; + } +} + +BasicBlock *CFGBuilder::getOrAddBlock(StringRef BlockName) { + auto BIt = NameToBlock.find(BlockName); + if (BIt != NameToBlock.end()) + return BIt->second; + + auto *BB = BasicBlock::Create(F->getParent()->getContext(), BlockName, F); + IRBuilder<> IRB(BB); + IRB.CreateUnreachable(); + NameToBlock[BlockName] = BB; + return BB; +} + +bool CFGBuilder::connect(const Arc &A) { + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + if (Arcs.count(A) != 0) + return false; + + Arcs.insert(A); + ConnectBlocks(From, To); + return true; +} + +bool CFGBuilder::disconnect(const Arc &A) { + assert(NameToBlock.count(A.From) != 0 && "No block to disconnect (From)"); + assert(NameToBlock.count(A.To) != 0 && "No block to disconnect (To)"); + if (Arcs.count(A) == 0) + return false; + + BasicBlock *From = getOrAddBlock(A.From); + BasicBlock *To = getOrAddBlock(A.To); + Arcs.erase(A); + DisconnectBlocks(From, To); + return true; +} + +void CFGBuilder::buildCFG(const std::vector &NewArcs) { + for (const auto &A : NewArcs) { + const bool Connected = connect(A); + (void)Connected; + assert(Connected); + } +} + +Optional CFGBuilder::getNextUpdate() const { + if (UpdateIdx == Updates.size()) + return None; + return Updates[UpdateIdx]; +} + +Optional CFGBuilder::applyUpdate() { + if (UpdateIdx == Updates.size()) + return None; + Update NextUpdate = Updates[UpdateIdx++]; + if (NextUpdate.Action == 
ActionKind::Insert) + connect(NextUpdate.Edge); + else + disconnect(NextUpdate.Edge); + + return NextUpdate; +} + +void CFGBuilder::dump(raw_ostream &OS) const { + OS << "Arcs:\n"; + size_t i = 0; + for (const auto &A : Arcs) + OS << " " << i++ << ":\t" << A.From << " -> " << A.To << "\n"; + + OS << "Updates:\n"; + i = 0; + for (const auto &U : Updates) { + OS << (i + 1 == UpdateIdx ? "->" : " ") << i + << ((U.Action == ActionKind::Insert) ? "\tIns " : "\tDel ") + << U.Edge.From << " -> " << U.Edge.To << "\n"; + ++i; + } +} + +//---- CFGBuilder tests ---------------------------------------------------===// + +TEST(CFGBuilder, Construction) { + CFGHolder Holder; + std::vector Arcs = {{"entry", "a"}, {"a", "b"}, {"a", "c"}, + {"c", "d"}, {"d", "b"}, {"d", "e"}, + {"d", "f"}, {"e", "f"}}; + CFGBuilder B(Holder.F, Arcs, {}); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. +} + +TEST(CFGBuilder, Insertions) { + CFGHolder Holder; + const auto Insert = CFGBuilder::ActionKind::Insert; + std::vector Updates = { + {Insert, {"entry", "a"}}, {Insert, {"a", "b"}}, {Insert, {"a", "c"}}, + {Insert, {"c", "d"}}, {Insert, {"d", "b"}}, {Insert, {"d", "e"}}, + {Insert, {"d", "f"}}, {Insert, {"e", "f"}}}; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, {}, Updates); + + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(B.getOrAddBlock("entry") == &Holder.F->getEntryBlock()); + EXPECT_TRUE(isa(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("b")->getTerminator())); + EXPECT_TRUE(isa(B.getOrAddBlock("d")->getTerminator())); + + auto *DSwitch = cast(B.getOrAddBlock("d")->getTerminator()); + // d has 3 successors, but one of them if going to be a default case + EXPECT_EQ(DSwitch->getNumCases(), 2U); + EXPECT_FALSE(B.getNextUpdate()); // No updates to apply. 
+} + +TEST(CFGBuilder, Deletions) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector<CFGBuilder::Update> Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("d")->getTerminator())); + + auto UpdateC = B.applyUpdate(); + + EXPECT_TRUE(UpdateC); + EXPECT_EQ(UpdateC->Action, CFGBuilder::ActionKind::Delete); + EXPECT_EQ(UpdateC->Edge.From, "c"); + EXPECT_EQ(UpdateC->Edge.To, "d"); + EXPECT_TRUE(isa<UnreachableInst>(B.getOrAddBlock("c")->getTerminator())); + + size_t i = 1; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa<UnreachableInst>(B.getOrAddBlock("entry")->getTerminator())); +} + +TEST(CFGBuilder, Rebuild) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"entry", "a"}, {"a", "b"}, {"a", "c"}, {"c", "d"}, {"d", "b"}}; + const auto Insert = CFGBuilder::ActionKind::Insert; + const auto Delete = CFGBuilder::ActionKind::Delete; + std::vector<CFGBuilder::Update> Updates = { + {Delete, {"c", "d"}}, {Delete, {"a", "c"}}, {Delete, {"entry", "a"}}, + {Insert, {"c", "d"}}, {Insert, {"a", "c"}}, {Insert, {"entry", "a"}}, + }; + const size_t NumUpdates = Updates.size(); + + CFGBuilder B(Holder.F, Arcs, Updates); + size_t i = 0; + while (B.applyUpdate()) + ++i; + EXPECT_EQ(i, NumUpdates); + + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("entry")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("a")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("c")->getTerminator())); + EXPECT_TRUE(isa<SwitchInst>(B.getOrAddBlock("d")->getTerminator())); +} diff --git a/unittests/IR/CFGBuilder.h b/unittests/IR/CFGBuilder.h new file mode 100644 index 000000000000..d9d9c378e110 --- /dev/null +++ b/unittests/IR/CFGBuilder.h @@ -0,0 +1,94 @@ +//===- CFGBuilder.h - CFG building and updating utility ----------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// CFGBuilder provides utilities for building and updating CFGs for testing +/// purposes. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_CFG_BUILDER_H +#define LLVM_UNITTESTS_CFG_BUILDER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" + +#include <memory> +#include <set> +#include <tuple> +#include <vector> + +namespace llvm { + +class LLVMContext; +class Module; +class Function; +class BasicBlock; +class raw_ostream; + +struct CFGHolder { + std::unique_ptr<LLVMContext> Context; + std::unique_ptr<Module> M; + Function *F; + + CFGHolder(StringRef ModuleName = "m", StringRef FunctionName = "foo"); + ~CFGHolder(); // Defined in the .cpp file so we can use forward declarations. +}; + +/// \brief +/// CFGBuilder builds IR with specific CFG, based on the supplied list of arcs. +/// It's able to apply the provided updates and automatically modify the IR. +/// +/// Internally it makes every basic block end with either SwitchInst or with +/// UnreachableInst. When all arcs to a BB are deleted, the BB remains in the +/// function and doesn't get deleted. +///
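+/// A brief usage sketch (an editorial illustration of the API declared +/// below, not part of the original header): +/// \code +/// CFGHolder Holder; +/// std::vector<CFGBuilder::Arc> Arcs = {{"entry", "a"}}; +/// std::vector<CFGBuilder::Update> Updates = { +/// {CFGBuilder::ActionKind::Insert, {"a", "b"}}}; +/// CFGBuilder B(Holder.F, Arcs, Updates); // IR for entry -> a is built. +/// B.applyUpdate(); // The a -> b arc is added to the IR. +/// \endcode +///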
+class CFGBuilder { +public: + struct Arc { + StringRef From; + StringRef To; + + friend bool operator<(const Arc &LHS, const Arc &RHS) { + return std::tie(LHS.From, LHS.To) < + std::tie(RHS.From, RHS.To); + } + }; + + enum class ActionKind { Insert, Delete }; + struct Update { + ActionKind Action; + Arc Edge; + }; + + CFGBuilder(Function *F, const std::vector<Arc> &InitialArcs, + std::vector<Update> Updates); + + BasicBlock *getOrAddBlock(StringRef BlockName); + Optional<Update> getNextUpdate() const; + Optional<Update> applyUpdate(); + void dump(raw_ostream &OS = dbgs()) const; + +private: + void buildCFG(const std::vector<Arc> &Arcs); + bool connect(const Arc &A); + bool disconnect(const Arc &A); + + Function *F; + unsigned UpdateIdx = 0; + StringMap<BasicBlock *> NameToBlock; + std::set<Arc> Arcs; + std::vector<Update> Updates; +}; + +} // namespace llvm + +#endif diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index d76ebfa64d88..92c9f8d3788d 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -10,6 +10,7 @@ set(IRSources AsmWriterTest.cpp AttributesTest.cpp BasicBlockTest.cpp + CFGBuilder.cpp ConstantRangeTest.cpp ConstantsTest.cpp DebugInfoTest.cpp diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index fa3dad8a2ab1..df1e2993dc85 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include <random> #include "llvm/Analysis/PostDominators.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Constants.h" @@ -15,22 +16,24 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" +#include "CFGBuilder.h" #include "gtest/gtest.h" using namespace llvm; +struct PostDomTree : PostDomTreeBase<BasicBlock> { + PostDomTree(Function &F) { recalculate(F); } +}; + /// Build the dominator tree for the function and run the Test. -static void -runWithDomTree(Module &M, StringRef FuncName, - function_ref<void(Function &F, DominatorTree *DT, - DominatorTreeBase<BasicBlock> *PDT)> - Test) { +static void runWithDomTree( + Module &M, StringRef FuncName, + function_ref<void(Function &F, DominatorTree *DT, PostDomTree *PDT)> Test) { auto *F = M.getFunction(FuncName); ASSERT_NE(F, nullptr) << "Could not find " << FuncName; // Compute the dominator tree for the function.
DominatorTree DT(*F); - DominatorTreeBase<BasicBlock> PDT(/*isPostDom*/ true); - PDT.recalculate(*F); + PostDomTree PDT(*F); Test(*F, &DT, &PDT); } @@ -72,8 +75,7 @@ TEST(DominatorTree, Unreachable) { std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase<BasicBlock> *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -293,8 +295,7 @@ TEST(DominatorTree, NonUniqueEdges) { std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString); runWithDomTree( - *M, "f", - [&](Function &F, DominatorTree *DT, DominatorTreeBase<BasicBlock> *PDT) { + *M, "f", [&](Function &F, DominatorTree *DT, PostDomTree *PDT) { Function::iterator FI = F.begin(); BasicBlock *BB0 = &*FI++; @@ -324,3 +325,280 @@ TEST(DominatorTree, NonUniqueEdges) { EXPECT_FALSE(DT->dominates(Edge_BB0_BB1_b, BB2)); }); } + +namespace { +const auto Insert = CFGBuilder::ActionKind::Insert; +const auto Delete = CFGBuilder::ActionKind::Delete; + +bool CompUpdates(const CFGBuilder::Update &A, const CFGBuilder::Update &B) { + return std::tie(A.Action, A.Edge.From, A.Edge.To) < + std::tie(B.Action, B.Edge.From, B.Edge.To); +} +} // namespace + +TEST(DominatorTree, InsertReachable) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector<CFGBuilder::Update> Updates = {{Insert, {"12", "10"}}, + {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertReachable2) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "5"}, {"2", "8"}, {"8", "11"}, {"11", "12"}, {"12", "10"}, + {"10", "9"}, {"9", "10"}}; + + std::vector<CFGBuilder::Update> Updates = {{Insert, {"10", "7"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate = B.applyUpdate(); + EXPECT_TRUE(LastUpdate); + + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); +} + +TEST(DominatorTree, InsertUnreachable) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = {{"1", "2"}, {"2", "3"}, {"3", "4"}, + {"5", "6"}, {"5", "7"}, {"3", "8"}, + {"9", "10"}, {"11", "12"}}; + + std::vector<CFGBuilder::Update> Updates = {{Insert, {"4", "5"}}, + {Insert, {"8", "9"}}, + {Insert, {"10", "12"}}, + {Insert, {"10", "11"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + 
BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertMixed) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector<CFGBuilder::Update> Updates = { + {Insert, {"4", "5"}}, {Insert, {"2", "5"}}, {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}, {Insert, {"12", "10"}}, {Insert, {"7", "8"}}, + {Insert, {"7", "5"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertPermut) { + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"5", "6"}, {"5", "7"}, + {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}, {"7", "3"}}; + + std::vector<CFGBuilder::Update> Updates = {{Insert, {"4", "5"}}, + {Insert, {"2", "5"}}, + {Insert, {"10", "9"}}, + {Insert, {"12", "10"}}}; + + while (std::next_permutation(Updates.begin(), Updates.end(), CompUpdates)) { + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Insert); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.insertEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.insertEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } + } +} + +TEST(DominatorTree, DeleteReachable) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"2", "4"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, + {"5", "7"}, {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector<CFGBuilder::Update> Updates = { + {Delete, {"2", "4"}}, {Delete, {"7", "8"}}, {Delete, {"10", "2"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, DeleteUnreachable) { + CFGHolder Holder; + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"7", "8"}, {"3", "8"}, {"8", "9"}, {"9", "10"}, {"10", "2"}}; + + std::vector<CFGBuilder::Update> Updates = { + {Delete, {"8", "9"}}, {Delete, {"7", "8"}}, {Delete, {"3", "4"}}}; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + EXPECT_EQ(LastUpdate->Action, Delete); + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = 
B.getOrAddBlock(LastUpdate->Edge.To); + DT.deleteEdge(From, To); + EXPECT_TRUE(DT.verify()); + PDT.deleteEdge(From, To); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDelete) { + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector<CFGBuilder::Update> Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } +} + +TEST(DominatorTree, InsertDeleteExhaustive) { + std::vector<CFGBuilder::Arc> Arcs = { + {"1", "2"}, {"2", "3"}, {"3", "4"}, {"4", "5"}, {"5", "6"}, {"5", "7"}, + {"3", "8"}, {"8", "9"}, {"9", "10"}, {"8", "11"}, {"11", "12"}}; + + std::vector<CFGBuilder::Update> Updates = { + {Insert, {"2", "4"}}, {Insert, {"12", "10"}}, {Insert, {"10", "9"}}, + {Insert, {"7", "6"}}, {Insert, {"7", "5"}}, {Delete, {"3", "8"}}, + {Insert, {"10", "7"}}, {Insert, {"2", "8"}}, {Delete, {"3", "4"}}, + {Delete, {"8", "9"}}, {Delete, {"11", "12"}}}; + + std::mt19937 Generator(0); + for (unsigned i = 0; i < 16; ++i) { + std::shuffle(Updates.begin(), Updates.end(), Generator); + CFGHolder Holder; + CFGBuilder B(Holder.F, Arcs, Updates); + DominatorTree DT(*Holder.F); + EXPECT_TRUE(DT.verify()); + PostDomTree PDT(*Holder.F); + EXPECT_TRUE(PDT.verify()); + + Optional<CFGBuilder::Update> LastUpdate; + while ((LastUpdate = B.applyUpdate())) { + BasicBlock *From = B.getOrAddBlock(LastUpdate->Edge.From); + BasicBlock *To = B.getOrAddBlock(LastUpdate->Edge.To); + if (LastUpdate->Action == Insert) { + DT.insertEdge(From, To); + PDT.insertEdge(From, To); + } else { + DT.deleteEdge(From, To); + PDT.deleteEdge(From, To); + } + + EXPECT_TRUE(DT.verify()); + EXPECT_TRUE(PDT.verify()); + } + } +} diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 186330f10573..d361107cc0d2 100644 --- a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -463,13 +463,14 @@ TEST_F(IRBuilderTest, DebugLoc) { TEST_F(IRBuilderTest, DIImportedEntity) { IRBuilder<> Builder(BB); DIBuilder DIB(*M); + auto F = DIB.createFile("F.CBL", "/"); auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74, - DIB.createFile("F.CBL", "/"), "llvm-cobol74", + F, "llvm-cobol74", true, "", 0); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedDeclaration(CU, nullptr, 1); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); - DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, 2); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedDeclaration(CU, nullptr, F, 1); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); + DIB.createImportedModule(CU, (DIImportedEntity *)nullptr, F, 2); DIB.finalize(); EXPECT_TRUE(verifyModule(*M)); EXPECT_TRUE(CU->getImportedEntities().size() == 2); diff --git 
a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp index cb38b30f43e6..e47afca532d2 100644 --- a/unittests/IR/MetadataTest.cpp +++ b/unittests/IR/MetadataTest.cpp @@ -2116,29 +2116,35 @@ TEST_F(DIImportedEntityTest, get) { unsigned Tag = dwarf::DW_TAG_imported_module; DIScope *Scope = getSubprogram(); DINode *Entity = getCompositeType(); + DIFile *File = getFile(); unsigned Line = 5; StringRef Name = "name"; - auto *N = DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name); + auto *N = + DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name); EXPECT_EQ(Tag, N->getTag()); EXPECT_EQ(Scope, N->getScope()); EXPECT_EQ(Entity, N->getEntity()); + EXPECT_EQ(File, N->getFile()); EXPECT_EQ(Line, N->getLine()); EXPECT_EQ(Name, N->getName()); - EXPECT_EQ(N, DIImportedEntity::get(Context, Tag, Scope, Entity, Line, Name)); + EXPECT_EQ( + N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, dwarf::DW_TAG_imported_declaration, - Scope, Entity, Line, Name)); + Scope, Entity, File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, getSubprogram(), Entity, - Line, Name)); + File, Line, Name)); EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, getCompositeType(), - Line, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line + 1, Name)); - EXPECT_NE(N, - DIImportedEntity::get(Context, Tag, Scope, Entity, Line, "other")); + File, Line, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, nullptr, Line, + Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, + Line + 1, Name)); + EXPECT_NE(N, DIImportedEntity::get(Context, Tag, Scope, Entity, File, Line, + "other")); TempDIImportedEntity Temp = N->clone(); EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp))); diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp index b252641f1a13..b9b725f934b3 100644 --- a/unittests/Support/TargetParserTest.cpp +++ b/unittests/Support/TargetParserTest.cpp @@ -737,7 +737,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { unsigned Extensions = AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_SIMD | AArch64::AEK_FP16 | AArch64::AEK_PROFILE | - AArch64::AEK_RAS; + AArch64::AEK_RAS | AArch64::AEK_SVE; for (unsigned i = 0; i <= Extensions; i++) EXPECT_TRUE(i == 0 ? !AArch64::getExtensionFeatures(i, Features) @@ -762,7 +762,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"simd", "nosimd", "+neon", "-neon"}, {"fp16", "nofp16", "+fullfp16", "-fullfp16"}, {"profile", "noprofile", "+spe", "-spe"}, - {"ras", "noras", "+ras", "-ras"}}; + {"ras", "noras", "+ras", "-ras"}, + {"sve", "nosve", "+sve", "-sve"}}; for (unsigned i = 0; i < array_lengthof(ArchExt); i++) { EXPECT_EQ(StringRef(ArchExt[i][2]), diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 5cf0e9d0f5b3..120773a0c8dd 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -232,6 +232,22 @@ TEST(YAMLIO, TestSequenceMapWriteAndRead) { } } +// +// Test YAML filename handling. 
+// +static void testErrorFilename(const llvm::SMDiagnostic &Error, void *) { + EXPECT_EQ(Error.getFilename(), "foo.yaml"); +} + +TEST(YAMLIO, TestGivenFilename) { + auto Buffer = llvm::MemoryBuffer::getMemBuffer("{ x: 42 }", "foo.yaml"); + Input yin(*Buffer, nullptr, testErrorFilename); + FooBar Value; + yin >> Value; + + EXPECT_TRUE(!!yin.error()); +} + //===----------------------------------------------------------------------===// // Test built-in types diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp index a7a5ce8dd6d4..a75f446e4ba1 100644 --- a/unittests/Support/raw_ostream_test.cpp +++ b/unittests/Support/raw_ostream_test.cpp @@ -151,6 +151,11 @@ TEST(raw_ostreamTest, Justify) { EXPECT_EQ(" xyz", printToString(right_justify("xyz", 6), 6)); EXPECT_EQ("abc", printToString(right_justify("abc", 3), 3)); EXPECT_EQ("big", printToString(right_justify("big", 1), 3)); + EXPECT_EQ(" on ", printToString(center_justify("on", 9), 9)); + EXPECT_EQ(" off ", printToString(center_justify("off", 10), 10)); + EXPECT_EQ("single ", printToString(center_justify("single", 7), 7)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 4)); + EXPECT_EQ("none", printToString(center_justify("none", 1), 1)); } TEST(raw_ostreamTest, FormatHex) { diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index d4a21a986c58..6399fb5ec1dd 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -1268,12 +1268,12 @@ void CodeGenRegBank::computeSubRegLaneMasks() { CoveringLanes = LaneBitmask::getAll(); for (auto &Idx : SubRegIndices) { if (Idx.getComposites().empty()) { - if (Bit > 32) { + if (Bit > LaneBitmask::BitWidth) { PrintFatalError( Twine("Ran out of lanemask bits to represent subregister ") + Idx.getName()); } - Idx.LaneMask = LaneBitmask(1 << Bit); + Idx.LaneMask = LaneBitmask::getLane(Bit); ++Bit; } else { Idx.LaneMask = LaneBitmask::getNone(); @@ -1298,9 +1298,9 @@ void CodeGenRegBank::computeSubRegLaneMasks() { static_assert(sizeof(Idx.LaneMask.getAsInteger()) == 4, "Change Log2_32 to a proper one"); unsigned DstBit = Log2_32(Idx.LaneMask.getAsInteger()); - assert(Idx.LaneMask == LaneBitmask(1 << DstBit) && + assert(Idx.LaneMask == LaneBitmask::getLane(DstBit) && "Must be a leaf subregister"); - MaskRolPair MaskRol = { LaneBitmask(1), (uint8_t)DstBit }; + MaskRolPair MaskRol = { LaneBitmask::getLane(0), (uint8_t)DstBit }; LaneTransforms.push_back(MaskRol); } else { // Go through all leaf subregisters and find the ones that compose with @@ -1314,7 +1314,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { continue; // Replicate the behaviour from the lane mask generation loop above. unsigned SrcBit = NextBit; - LaneBitmask SrcMask = LaneBitmask(1 << SrcBit); + LaneBitmask SrcMask = LaneBitmask::getLane(SrcBit); if (NextBit < LaneBitmask::BitWidth-1) ++NextBit; assert(Idx2.LaneMask == SrcMask); @@ -1386,7 +1386,7 @@ void CodeGenRegBank::computeSubRegLaneMasks() { // For classes without any subregisters set LaneMask to 1 instead of 0. // This makes it easier for client code to handle classes uniformly. 
if (LaneMask.none()) - LaneMask = LaneBitmask(1); + LaneMask = LaneBitmask::getLane(0); RegClass.LaneMask = LaneMask; } diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index 2b680846e176..2ef0a8f77ec9 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -25,7 +25,8 @@ def __init__(self, progname, path, quiet, params, config_prefix = None, maxIndividualTestTime = 0, maxFailures = None, - parallelism_groups = []): + parallelism_groups = [], + echo_all_commands = False): # The name of the test runner. self.progname = progname # The items to add to the PATH environment variable. @@ -64,6 +65,7 @@ def __init__(self, progname, path, quiet, self.maxIndividualTestTime = maxIndividualTestTime self.maxFailures = maxFailures self.parallelism_groups = parallelism_groups + self.echo_all_commands = echo_all_commands @property def maxIndividualTestTime(self): diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 8260d3813345..46bcac4b306e 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -715,6 +715,8 @@ def executeScript(test, litConfig, tmpBase, commands, cwd): else: if test.config.pipefail: f.write('set -o pipefail;') + if litConfig.echo_all_commands: + f.write('set -x;') f.write('{ ' + '; } &&\n{ '.join(commands) + '; }') f.write('\n') f.close() diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py index 530f962d336d..f0162464ce33 100755 --- a/utils/lit/lit/main.py +++ b/utils/lit/lit/main.py @@ -199,6 +199,12 @@ def main_with_tmp(builtinParameters): format_group.add_argument("-v", "--verbose", dest="showOutput", help="Show test output for failures", action="store_true", default=False) + format_group.add_argument("-vv", "--echo-all-commands", + dest="echoAllCommands", + action="store_true", default=False, + help="Echo all commands as they are executed to stdout.\ + In case of failure, last command shown will be the\ + failing one.") format_group.add_argument("-a", "--show-all", dest="showAllOutput", help="Display all commandlines and output", action="store_true", default=False) @@ -303,6 +309,9 @@ def main_with_tmp(builtinParameters): if opts.maxFailures == 0: parser.error("Setting --max-failures to 0 does not have any effect.") + if opts.echoAllCommands: + opts.showOutput = True + inputs = args # Create the user defined parameters. @@ -338,7 +347,8 @@ def main_with_tmp(builtinParameters): config_prefix = opts.configPrefix, maxIndividualTestTime = maxIndividualTestTime, maxFailures = opts.maxFailures, - parallelism_groups = {}) + parallelism_groups = {}, + echo_all_commands = opts.echoAllCommands) # Perform test discovery. 
run = lit.run.Run(litConfig, diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim index e795c7f62133..e110da4329b5 100644 --- a/utils/vim/syntax/llvm.vim +++ b/utils/vim/syntax/llvm.vim @@ -1,7 +1,7 @@ " Vim syntax file " Language: llvm " Maintainer: The LLVM team, http://llvm.org/ -" Version: $Revision: 307419 $ +" Version: $Revision: 308208 $ if version < 600 syntax clear @@ -161,7 +161,7 @@ syn keyword llvmKeyword \ within \ writeonly \ x86_64_sysvcc - \ x86_64_win64cc + \ win64cc \ x86_fastcallcc \ x86_stdcallcc \ x86_thiscallcc From de51d671486b6ac9a2ad9ee5fcfdb1a23cc59238 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Wed, 19 Jul 2017 07:02:30 +0000 Subject: [PATCH 3/4] Vendor import of clang trunk r308421: https://llvm.org/svn/llvm-project/cfe/trunk@308421 --- CMakeLists.txt | 6 +- cmake/caches/Fuchsia-stage2.cmake | 33 +- cmake/caches/Fuchsia.cmake | 8 +- docs/LanguageExtensions.rst | 83 +- docs/ReleaseNotes.rst | 4 +- include/clang-c/Index.h | 2 +- include/clang/AST/ASTContext.h | 4 + include/clang/AST/DeclObjC.h | 10 +- include/clang/AST/OpenMPClause.h | 211 +++ include/clang/AST/RecursiveASTVisitor.h | 22 + include/clang/AST/StmtOpenMP.h | 34 +- include/clang/AST/Type.h | 1 + include/clang/Analysis/Analyses/Dominators.h | 6 +- include/clang/Basic/Attr.td | 24 +- include/clang/Basic/AttrDocs.td | 25 +- include/clang/Basic/Builtins.def | 5 + include/clang/Basic/BuiltinsHexagon.def | 6 + include/clang/Basic/BuiltinsSystemZ.def | 24 + include/clang/Basic/BuiltinsX86.def | 5 - include/clang/Basic/DiagnosticGroups.td | 2 + include/clang/Basic/DiagnosticIDs.h | 8 + include/clang/Basic/DiagnosticSemaKinds.td | 25 +- .../Basic/DiagnosticSerializationKinds.td | 70 +- include/clang/Basic/IdentifierTable.h | 11 +- include/clang/Basic/LangOptions.def | 1 + include/clang/Basic/OpenMPKinds.def | 8 + include/clang/Basic/Specifiers.h | 2 +- include/clang/Basic/TargetInfo.h | 14 + include/clang/Config/config.h.cmake | 5 + include/clang/Driver/Options.td | 1 + include/clang/Frontend/LangStandard.h | 15 +- include/clang/Frontend/LangStandards.def | 10 + include/clang/Index/IndexingAction.h | 10 +- include/clang/Lex/MacroInfo.h | 50 +- include/clang/Lex/Preprocessor.h | 19 +- include/clang/Sema/Sema.h | 32 +- include/clang/Tooling/DiagnosticsYaml.h | 15 +- .../Refactoring/RecursiveSymbolVisitor.h | 122 ++ .../Tooling/Refactoring/Rename/USRFinder.h | 34 - include/clang/module.modulemap | 1 - lib/AST/ASTContext.cpp | 6 + lib/AST/ASTDumper.cpp | 8 +- lib/AST/DeclObjC.cpp | 18 +- lib/AST/ItaniumMangle.cpp | 2 +- lib/AST/MicrosoftMangle.cpp | 2 +- lib/AST/ODRHash.cpp | 8 +- lib/AST/OpenMPClause.cpp | 57 + lib/AST/StmtOpenMP.cpp | 21 +- lib/AST/StmtPrinter.cpp | 25 +- lib/AST/StmtProfile.cpp | 24 + lib/AST/Type.cpp | 4 +- lib/AST/TypePrinter.cpp | 75 +- lib/Analysis/PrintfFormatString.cpp | 5 +- lib/Basic/CMakeLists.txt | 17 +- lib/Basic/DiagnosticIDs.cpp | 12 + lib/Basic/OpenMPKinds.cpp | 16 +- lib/Basic/Targets.cpp | 45 +- lib/CodeGen/CGBuiltin.cpp | 154 +- lib/CodeGen/CGCall.cpp | 8 +- lib/CodeGen/CGDebugInfo.cpp | 24 +- lib/CodeGen/CGExpr.cpp | 7 +- lib/CodeGen/CGExprScalar.cpp | 38 +- lib/CodeGen/CGOpenMPRuntime.cpp | 822 ++++++++- lib/CodeGen/CGOpenMPRuntime.h | 162 +- lib/CodeGen/CGStmtOpenMP.cpp | 531 ++---- lib/CodeGen/CodeGenFunction.h | 7 + lib/CodeGen/CodeGenModule.cpp | 9 +- lib/CodeGen/MacroPPCallbacks.cpp | 4 +- .../ObjectFilePCHContainerOperations.cpp | 3 + lib/CodeGen/TargetInfo.cpp | 21 +- lib/Driver/Driver.cpp | 7 + lib/Driver/ToolChains/Clang.cpp | 16 +- 
lib/Driver/ToolChains/Darwin.cpp | 68 +- lib/Driver/ToolChains/Fuchsia.cpp | 3 + lib/Driver/ToolChains/Gnu.cpp | 3 +- lib/Driver/ToolChains/Solaris.cpp | 2 +- lib/Driver/ToolChains/Solaris.h | 2 +- lib/Format/TokenAnnotator.cpp | 3 +- lib/Format/UnwrappedLineParser.cpp | 2 +- lib/Frontend/CompilerInvocation.cpp | 1 + lib/Frontend/InitPreprocessor.cpp | 5 +- lib/Frontend/PrintPreprocessedOutput.cpp | 4 +- lib/Frontend/Rewrite/FrontendActions.cpp | 1 + lib/Frontend/Rewrite/RewriteModernObjC.cpp | 1 + lib/Frontend/Rewrite/RewriteObjC.cpp | 1 + .../ExecuteCompilerInvocation.cpp | 1 + lib/Headers/vecintrin.h | 1572 ++++++++++++++++- lib/Index/IndexingAction.cpp | 12 + lib/Index/IndexingContext.cpp | 6 +- lib/Lex/MacroArgs.cpp | 10 +- lib/Lex/MacroInfo.cpp | 19 +- lib/Lex/PPDirectives.cpp | 96 +- lib/Lex/PPExpressions.cpp | 59 +- lib/Lex/PPMacroExpansion.cpp | 10 +- lib/Lex/Preprocessor.cpp | 8 - lib/Lex/TokenLexer.cpp | 12 +- lib/Parse/ParseObjc.cpp | 4 + lib/Parse/ParseOpenMP.cpp | 23 +- lib/Sema/DeclSpec.cpp | 6 +- lib/Sema/Sema.cpp | 3 +- lib/Sema/SemaChecking.cpp | 24 +- lib/Sema/SemaCodeComplete.cpp | 6 +- lib/Sema/SemaDeclAttr.cpp | 126 +- lib/Sema/SemaDeclObjC.cpp | 58 +- lib/Sema/SemaExpr.cpp | 79 +- lib/Sema/SemaObjCProperty.cpp | 187 +- lib/Sema/SemaOpenMP.cpp | 350 ++-- lib/Sema/SemaType.cpp | 39 +- lib/Sema/TreeTransform.h | 60 + lib/Serialization/ASTReader.cpp | 86 +- lib/Serialization/ASTReaderStmt.cpp | 42 +- lib/Serialization/ASTWriter.cpp | 6 +- lib/Serialization/ASTWriterStmt.cpp | 20 + .../Checkers/LocalizationChecker.cpp | 69 +- .../Checkers/RetainCountChecker.cpp | 35 +- lib/Tooling/Refactoring/Rename/USRFinder.cpp | 177 +- .../Refactoring/Rename/USRLocFinder.cpp | 90 +- lib/Tooling/Tooling.cpp | 1 + test/Analysis/localization-aggressive.m | 19 + test/Analysis/retain-release-inline.m | 56 +- test/Analysis/retain-release.m | 2 +- test/CodeGen/aarch64-type-sizes.c | 2 +- test/CodeGen/aarch64-varargs-ms.c | 11 + test/CodeGen/builtins-hexagon.c | 12 + test/CodeGen/builtins-systemz-vector2-error.c | 61 + test/CodeGen/builtins-systemz-vector2.c | 136 ++ test/CodeGen/builtins-systemz-zvector-error.c | 38 +- test/CodeGen/builtins-systemz-zvector.c | 88 + .../CodeGen/builtins-systemz-zvector2-error.c | 153 ++ test/CodeGen/builtins-systemz-zvector2.c | 545 ++++++ test/CodeGen/coff-aarch64-type-sizes.c | 88 + test/CodeGen/debug-info-imported-entity.cpp | 3 +- test/CodeGen/ms_abi.c | 6 +- test/CodeGen/ms_abi_aarch64.c | 68 + test/CodeGen/systemz-abi-vector.c | 4 + test/CodeGen/systemz-abi.c | 4 + test/CodeGen/target-data.c | 4 + test/CodeGen/ubsan-pointer-overflow.m | 15 +- test/CodeGen/zvector2.c | 194 ++ test/CodeGenCXX/amdgcn-automatic-variable.cpp | 43 +- test/CodeGenCXX/debug-info-anon-namespace.cpp | 4 +- test/CodeGenCXX/debug-info-namespace.cpp | 8 +- test/CodeGenCXX/implicit-exception-spec.cpp | 2 +- test/CodeGenObjC/arc-property.m | 20 + test/CodeGenObjC/attr-callconv.m | 2 +- test/CodeGenOpenCL/kernel-arg-info.cl | 37 +- test/Driver/autocomplete.c | 4 + test/Driver/constructors.c | 8 + test/Driver/darwin-version.c | 12 + test/Driver/emulated-tls.cpp | 4 +- test/Driver/fuchsia.c | 1 + test/Driver/fuchsia.cpp | 1 + test/Driver/lto-unit.c | 2 + test/Driver/pic.c | 3 + test/Driver/std.cpp | 10 + test/Driver/systemz-march.c | 4 + test/Driver/unknown-std.cpp | 2 + test/FixIt/format.m | 13 + test/Index/Core/index-source.m | 25 + .../Core/no-templated-canonical-decl.cpp | 4 + test/Index/complete-available.m | 4 +- test/Misc/ast-dump-decl.c | 2 +- test/Misc/ast-dump-decl.cpp | 2 +- 
test/Modules/DebugInfoTransitiveImport.m | 4 +- test/Modules/ExtDebugInfo.cpp | 4 +- test/Modules/Inputs/DebugObjCImport.h | 2 + test/Modules/Inputs/module.map | 6 + test/Modules/ModuleDebugInfo.m | 6 +- .../debug-info-moduleimport-in-module.m | 21 + test/Modules/debug-info-moduleimport.m | 5 +- test/Modules/odr_hash.cpp | 83 + .../distribute_parallel_for_if_codegen.cpp | 2 +- test/OpenMP/for_reduction_codegen.cpp | 46 +- test/OpenMP/for_reduction_codegen_UDR.cpp | 47 +- test/OpenMP/parallel_if_codegen.cpp | 2 +- test/OpenMP/taskgroup_ast_print.cpp | 60 +- test/OpenMP/taskgroup_messages.cpp | 2 + .../taskgroup_task_reduction_messages.cpp | 258 +++ test/OpenMP/taskloop_codegen.cpp | 8 +- test/OpenMP/taskloop_firstprivate_codegen.cpp | 8 +- test/OpenMP/taskloop_lastprivate_codegen.cpp | 8 +- test/OpenMP/taskloop_private_codegen.cpp | 8 +- test/OpenMP/taskloop_reduction_codegen.cpp | 197 +++ test/OpenMP/taskloop_simd_codegen.cpp | 8 +- .../taskloop_simd_firstprivate_codegen.cpp | 8 +- .../taskloop_simd_lastprivate_codegen.cpp | 8 +- test/OpenMP/taskloop_simd_private_codegen.cpp | 8 +- .../taskloop_simd_reduction_codegen.cpp | 197 +++ test/Parser/MicrosoftExtensions.cpp | 4 +- test/Preprocessor/aarch64-target-features.c | 5 + test/Preprocessor/cxx_oper_keyword.cpp | 12 + test/Preprocessor/init.c | 16 + test/Preprocessor/predefined-arch-macros.c | 21 +- test/Sema/tls.c | 6 +- test/Sema/varargs-aarch64.c | 11 + test/Sema/varargs-x86-32.c | 2 +- test/Sema/zvector2.c | 211 +++ .../attr-x86-no_caller_saved_registers.cpp | 2 +- test/SemaCXX/cxx1z-noexcept-function-type.cpp | 1 + test/SemaObjC/arc-property-decl-attrs.m | 104 ++ test/SemaObjC/attr-ns_returns_retained.m | 18 + test/SemaObjC/property-ambiguous-synthesis.m | 2 +- .../warn-deprecated-implementations.m | 19 +- tools/clang-fuzzer/CMakeLists.txt | 3 +- tools/clang-fuzzer/ClangFuzzer.cpp | 11 +- tools/libclang/ARCMigrate.cpp | 1 + tools/libclang/CIndex.cpp | 19 +- tools/libclang/CXType.cpp | 2 +- tools/scan-build-py/libscanbuild/analyze.py | 2 +- unittests/ASTMatchers/Dynamic/ParserTest.cpp | 4 +- unittests/Format/FormatTest.cpp | 11 + unittests/Format/FormatTestJS.cpp | 11 + unittests/Format/SortImportsTestJS.cpp | 17 + unittests/Lex/LexerTest.cpp | 6 +- unittests/Tooling/CMakeLists.txt | 1 + unittests/Tooling/DiagnosticsYamlTest.cpp | 167 ++ utils/bash-autocomplete.sh | 11 +- www/analyzer/alpha_checks.html | 280 ++- www/analyzer/available_checks.html | 323 +++- www/analyzer/implicit_checks.html | 4 +- www/cxx_status.html | 83 +- 221 files changed, 8813 insertions(+), 1827 deletions(-) create mode 100644 include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h create mode 100644 test/CodeGen/aarch64-varargs-ms.c create mode 100644 test/CodeGen/builtins-systemz-vector2-error.c create mode 100644 test/CodeGen/builtins-systemz-vector2.c create mode 100644 test/CodeGen/builtins-systemz-zvector2-error.c create mode 100644 test/CodeGen/builtins-systemz-zvector2.c create mode 100644 test/CodeGen/coff-aarch64-type-sizes.c create mode 100644 test/CodeGen/ms_abi_aarch64.c create mode 100644 test/CodeGen/zvector2.c create mode 100644 test/Index/Core/no-templated-canonical-decl.cpp create mode 100644 test/Modules/Inputs/DebugObjCImport.h create mode 100644 test/Modules/debug-info-moduleimport-in-module.m create mode 100644 test/OpenMP/taskgroup_task_reduction_messages.cpp create mode 100644 test/OpenMP/taskloop_reduction_codegen.cpp create mode 100644 test/OpenMP/taskloop_simd_reduction_codegen.cpp create mode 100644 test/Sema/varargs-aarch64.c create 
mode 100644 test/Sema/zvector2.c create mode 100644 test/SemaObjC/attr-ns_returns_retained.m create mode 100644 unittests/Tooling/DiagnosticsYamlTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c163a2b50429..2667b1d6892e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -389,11 +389,7 @@ if(CLANG_ANALYZER_BUILD_Z3) endif() if(CLANG_ENABLE_ARCMT) - add_definitions(-DCLANG_ENABLE_ARCMT) - add_definitions(-DCLANG_ENABLE_OBJC_REWRITER) -endif() -if(CLANG_ENABLE_STATIC_ANALYZER) - add_definitions(-DCLANG_ENABLE_STATIC_ANALYZER) + set(CLANG_ENABLE_OBJC_REWRITER ON) endif() # Clang version information diff --git a/cmake/caches/Fuchsia-stage2.cmake b/cmake/caches/Fuchsia-stage2.cmake index ca43e603a631..1b7b636fef3e 100644 --- a/cmake/caches/Fuchsia-stage2.cmake +++ b/cmake/caches/Fuchsia-stage2.cmake @@ -7,7 +7,6 @@ set(PACKAGE_VENDOR Fuchsia CACHE STRING "") set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "") set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "") -set(LLVM_TOOL_CLANG_TOOLS_EXTRA_BUILD OFF CACHE BOOL "") set(LLVM_ENABLE_ZLIB ON CACHE BOOL "") set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "") set(LLVM_EXTERNALIZE_DEBUGINFO ON CACHE BOOL "") @@ -27,11 +26,27 @@ set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -gline-tables-only -DNDEBUG" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -gline-tables-only -DNDEBUG" CACHE STRING "") -set(LLVM_BUILTIN_TARGETS "x86_64-fuchsia-none;aarch64-fuchsia-none" CACHE STRING "") -set(BUILTINS_x86_64-fuchsia-none_CMAKE_SYSROOT ${FUCHSIA_SYSROOT} CACHE STRING "") -set(BUILTINS_x86_64-fuchsia-none_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") -set(BUILTINS_aarch64-fuchsia-none_CMAKE_SYSROOT ${FUCHSIA_SYSROOT} CACHE STRING "") -set(BUILTINS_aarch64-fuchsia-none_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") +set(LLVM_BUILTIN_TARGETS "x86_64-fuchsia;aarch64-fuchsia" CACHE STRING "") +foreach(target x86_64;aarch64) + set(BUILTINS_${target}-fuchsia_CMAKE_SYSROOT ${FUCHSIA_${target}_SYSROOT} CACHE PATH "") + set(BUILTINS_${target}-fuchsia_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") +endforeach() +if(NOT APPLE) + list(APPEND LLVM_BUILTIN_TARGETS "default") +endif() + +set(LLVM_RUNTIME_TARGETS "default;x86_64-fuchsia;aarch64-fuchsia" CACHE STRING "") +foreach(target x86_64;aarch64) + set(RUNTIMES_${target}-fuchsia_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_CMAKE_SYSROOT ${FUCHSIA_${target}_SYSROOT} CACHE PATH "") + set(RUNTIMES_${target}-fuchsia_CMAKE_SYSTEM_NAME Fuchsia CACHE STRING "") + set(RUNTIMES_${target}-fuchsia_UNIX 1 CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_LLVM_ENABLE_LIBCXX ON CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_LIBUNWIND_USE_COMPILER_RT ON CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_LIBCXXABI_USE_COMPILER_RT ON CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") + set(RUNTIMES_${target}-fuchsia_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") +endforeach() # Setup toolchain. 
set(LLVM_INSTALL_TOOLCHAIN_ONLY ON CACHE BOOL "") @@ -47,6 +62,7 @@ set(LLVM_TOOLCHAIN_TOOLS llvm-objdump llvm-profdata llvm-ranlib + llvm-readelf llvm-readobj llvm-size llvm-symbolizer @@ -61,8 +77,9 @@ set(LLVM_DISTRIBUTION_COMPONENTS LTO clang-format clang-headers - builtins-x86_64-fuchsia-none - builtins-aarch64-fuchsia-none + clang-tidy + clangd + builtins runtimes ${LLVM_TOOLCHAIN_TOOLS} CACHE STRING "") diff --git a/cmake/caches/Fuchsia.cmake b/cmake/caches/Fuchsia.cmake index c8a8cf6d58b7..0932c046f628 100644 --- a/cmake/caches/Fuchsia.cmake +++ b/cmake/caches/Fuchsia.cmake @@ -38,9 +38,11 @@ set(CLANG_BOOTSTRAP_TARGETS install-distribution clang CACHE STRING "") -if(FUCHSIA_SYSROOT) - set(EXTRA_ARGS -DFUCHSIA_SYSROOT=${FUCHSIA_SYSROOT}) -endif() +foreach(target x86_64;aarch64) + if(FUCHSIA_${target}_SYSROOT) + list(APPEND EXTRA_ARGS -DFUCHSIA_${target}_SYSROOT=${FUCHSIA_${target}_SYSROOT}) + endif() +endforeach() # Setup the bootstrap build. set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") diff --git a/docs/LanguageExtensions.rst b/docs/LanguageExtensions.rst index dab538f3a983..78f987c4a8e8 100644 --- a/docs/LanguageExtensions.rst +++ b/docs/LanguageExtensions.rst @@ -1271,6 +1271,87 @@ Further examples of these attributes are available in the static analyzer's `lis Query for these features with ``__has_attribute(ns_consumed)``, ``__has_attribute(ns_returns_retained)``, etc. +Objective-C @available +---------------------- + +It is possible to use the newest SDK but still build a program that can run on +older versions of macOS and iOS by passing ``-mmacosx-version-min=`` / +``-miphoneos-version-min=``. + +Before LLVM 5.0, when calling a function that exists only in the OS that's +newer than the target OS (as determined by the minimum deployment version), +programmers had to carefully check if the function exists at runtime, using +null checks for weakly-linked C functions, ``+class`` for Objective-C classes, +and ``-respondsToSelector:`` or ``+instancesRespondToSelector:`` for +Objective-C methods. If such a check was missed, the program would compile +fine, run fine on newer systems, but crash on older systems. + +As of LLVM 5.0, ``-Wunguarded-availability`` uses the `availability attributes +<http://clang.llvm.org/docs/AttributeReference.html#availability>`_ together +with the new ``@available()`` keyword to assist with this issue. +When a method that's introduced in the OS newer than the target OS is called, a +-Wunguarded-availability warning is emitted if that call is not guarded: + +.. code-block:: objc + + void my_fun(NSSomeClass* var) { + // If fancyNewMethod was added in e.g. macOS 10.12, but the code is + // built with -mmacosx-version-min=10.11, then this unconditional call + // will emit a -Wunguarded-availability warning: + [var fancyNewMethod]; + } + +To fix the warning and to avoid the crash on macOS 10.11, wrap it in +``if(@available())``: + +.. code-block:: objc + + void my_fun(NSSomeClass* var) { + if (@available(macOS 10.12, *)) { + [var fancyNewMethod]; + } else { + // Put fallback behavior for old macOS versions (and for non-mac + // platforms) here. + } + } + +The ``*`` is required and means that platforms not explicitly listed will take +the true branch, and the compiler will emit ``-Wunguarded-availability`` +warnings for unlisted platforms based on those platform's deployment target. +More than one platform can be listed in ``@available()``: + +.. code-block:: objc + + void my_fun(NSSomeClass* var) { + if (@available(macOS 10.12, iOS 10, *)) { + [var fancyNewMethod]; + } + } + +If the caller of ``my_fun()`` already checks that ``my_fun()`` is only called +on 10.12, then add an `availability attribute +<http://clang.llvm.org/docs/AttributeReference.html#availability>`_ to it, +which will also suppress the warning and require that calls to my_fun() are +checked: + +.. code-block:: objc + + API_AVAILABLE(macos(10.12)) void my_fun(NSSomeClass* var) { + [var fancyNewMethod]; // Now ok. + } + +``@available()`` is only available in Objective-C code. To use the feature +in C and C++ code, use the ``__builtin_available()`` spelling instead. +
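+For example, the same guard can be written in plain C or C++ (an editorial +sketch; the function name here is illustrative): + +.. code-block:: c + + void my_c_fun(void) { + if (__builtin_available(macOS 10.12, *)) { + // Use an API that was introduced in macOS 10.12. + } else { + // Fall back for older deployment targets. + } + } +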
+If existing code uses null checks or ``-respondsToSelector:``, it should +be changed to use ``@available()`` (or ``__builtin_available``) instead. + +``-Wunguarded-availability`` is disabled by default, but +``-Wunguarded-availability-new``, which only emits this warning for APIs +that have been introduced in macOS >= 10.13, iOS >= 11, watchOS >= 4 and +tvOS >= 11, is enabled by default. + +.. _langext-overloading: Objective-C++ ABI: protocol-qualifier mangling of parameters ------------------------------------------------------------ @@ -1287,8 +1368,6 @@ parameters of protocol-qualified type. Query the presence of this new mangling with ``__has_feature(objc_protocol_qualifier_mangling)``. -.. _langext-overloading: - Initializer lists for complex numbers in C ========================================== diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 8f1515dafd97..deffa24c4e42 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -82,7 +82,9 @@ Clang now supports the ... Attribute Changes in Clang -------------------------- -- ... +- The ``overloadable`` attribute now allows at most one function with a given + name to lack the ``overloadable`` attribute. This unmarked function will not + have its name mangled. Windows Support --------------- diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h index 09f4403556c8..b35f436e91b6 100644 --- a/include/clang-c/Index.h +++ b/include/clang-c/Index.h @@ -3205,7 +3205,7 @@ enum CXCallingConv { CXCallingConv_AAPCS_VFP = 7, CXCallingConv_X86RegCall = 8, CXCallingConv_IntelOclBicc = 9, - CXCallingConv_X86_64Win64 = 10, + CXCallingConv_Win64 = 10, CXCallingConv_X86_64SysV = 11, CXCallingConv_X86VectorCall = 12, CXCallingConv_Swift = 13, diff --git a/include/clang/AST/ASTContext.h b/include/clang/AST/ASTContext.h index 3b46d31458ce..703f588c5663 100644 --- a/include/clang/AST/ASTContext.h +++ b/include/clang/AST/ASTContext.h @@ -1441,6 +1441,10 @@ class ASTContext : public RefCountedBase<ASTContext> { /// The sizeof operator requires this (C99 6.5.3.4p4). CanQualType getSizeType() const; + /// \brief Return the unique signed counterpart of + /// the integer type corresponding to size_t. + CanQualType getSignedSizeType() const; + /// \brief Return the unique type for "intmax_t" (C99 7.18.1.5), defined in /// <stdint.h>. CanQualType getIntMaxType() const; diff --git a/include/clang/AST/DeclObjC.h b/include/clang/AST/DeclObjC.h index 26c0cbe82d17..1cd6e004f751 100644 --- a/include/clang/AST/DeclObjC.h +++ b/include/clang/AST/DeclObjC.h @@ -1039,10 +1039,9 @@ class ObjCContainerDecl : public NamedDecl, public DeclContext { typedef llvm::DenseMap<std::pair<IdentifierInfo*, unsigned>, ObjCPropertyDecl*> PropertyMap; - - typedef llvm::DenseMap<const ObjCProtocolDecl *, ObjCPropertyDecl*> - ProtocolPropertyMap; - + + typedef llvm::SmallDenseSet<const ObjCProtocolDecl *, 8> ProtocolPropertySet; + typedef llvm::SmallVector<ObjCPropertyDecl *, 8> PropertyDeclOrder; /// This routine collects list of properties to be implemented in the class.
@@ -2159,7 +2158,8 @@ class ObjCProtocolDecl : public ObjCContainerDecl, PropertyDeclOrder &PO) const override; void collectInheritedProtocolProperties(const ObjCPropertyDecl *Property, - ProtocolPropertyMap &PM) const; + ProtocolPropertySet &PS, + PropertyDeclOrder &PO) const; static bool classof(const Decl *D) { return classofKind(D->getKind()); } static bool classofKind(Kind K) { return K == ObjCProtocol; } diff --git a/include/clang/AST/OpenMPClause.h b/include/clang/AST/OpenMPClause.h index 14e73819f53d..a1cae8e18f84 100644 --- a/include/clang/AST/OpenMPClause.h +++ b/include/clang/AST/OpenMPClause.h @@ -1890,6 +1890,217 @@ class OMPReductionClause final } }; +/// This represents clause 'task_reduction' in the '#pragma omp taskgroup' +/// directives. +/// +/// \code +/// #pragma omp taskgroup task_reduction(+:a,b) +/// \endcode +/// In this example directive '#pragma omp taskgroup' has clause +/// 'task_reduction' with operator '+' and the variables 'a' and 'b'. +/// +class OMPTaskReductionClause final + : public OMPVarListClause<OMPTaskReductionClause>, + public OMPClauseWithPostUpdate, + private llvm::TrailingObjects<OMPTaskReductionClause, Expr *> { + friend TrailingObjects; + friend OMPVarListClause<OMPTaskReductionClause>; + friend class OMPClauseReader; + /// Location of ':'. + SourceLocation ColonLoc; + /// Nested name specifier for C++. + NestedNameSpecifierLoc QualifierLoc; + /// Name of custom operator. + DeclarationNameInfo NameInfo; + + /// Build clause with number of variables \a N. + /// + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param ColonLoc Location of ':'. + /// \param N Number of the variables in the clause. + /// \param QualifierLoc The nested-name qualifier with location information + /// \param NameInfo The full name info for reduction identifier. + /// + OMPTaskReductionClause(SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ColonLoc, SourceLocation EndLoc, + unsigned N, NestedNameSpecifierLoc QualifierLoc, + const DeclarationNameInfo &NameInfo) + : OMPVarListClause<OMPTaskReductionClause>(OMPC_task_reduction, StartLoc, + LParenLoc, EndLoc, N), + OMPClauseWithPostUpdate(this), ColonLoc(ColonLoc), + QualifierLoc(QualifierLoc), NameInfo(NameInfo) {} + + /// Build an empty clause. + /// + /// \param N Number of variables. + /// + explicit OMPTaskReductionClause(unsigned N) + : OMPVarListClause<OMPTaskReductionClause>( + OMPC_task_reduction, SourceLocation(), SourceLocation(), + SourceLocation(), N), + OMPClauseWithPostUpdate(this), ColonLoc(), QualifierLoc(), NameInfo() {} + + /// Sets location of ':' symbol in clause. + void setColonLoc(SourceLocation CL) { ColonLoc = CL; } + /// Sets the name info for specified reduction identifier. + void setNameInfo(DeclarationNameInfo DNI) { NameInfo = DNI; } + /// Sets the nested name specifier. + void setQualifierLoc(NestedNameSpecifierLoc NSL) { QualifierLoc = NSL; } + + /// Set list of helper expressions, required for proper codegen of the clause. + /// These expressions represent private copy of the reduction variable. + void setPrivates(ArrayRef<Expr *> Privates); + + /// Get the list of helper privates. + MutableArrayRef<Expr *> getPrivates() { + return MutableArrayRef<Expr *>(varlist_end(), varlist_size()); + } + ArrayRef<const Expr *> getPrivates() const { + return llvm::makeArrayRef(varlist_end(), varlist_size()); + } + + /// Set list of helper expressions, required for proper codegen of the clause. + /// These expressions represent LHS expression in the final reduction + /// expression performed by the reduction clause.
+ void setLHSExprs(ArrayRef<Expr *> LHSExprs); + + /// Get the list of helper LHS expressions. + MutableArrayRef<Expr *> getLHSExprs() { + return MutableArrayRef<Expr *>(getPrivates().end(), varlist_size()); + } + ArrayRef<const Expr *> getLHSExprs() const { + return llvm::makeArrayRef(getPrivates().end(), varlist_size()); + } + + /// Set list of helper expressions, required for proper codegen of the clause. + /// These expressions represent RHS expression in the final reduction + /// expression performed by the reduction clause. Also, variables in these + /// expressions are used for proper initialization of reduction copies. + void setRHSExprs(ArrayRef<Expr *> RHSExprs); + + /// Get the list of helper destination expressions. + MutableArrayRef<Expr *> getRHSExprs() { + return MutableArrayRef<Expr *>(getLHSExprs().end(), varlist_size()); + } + ArrayRef<const Expr *> getRHSExprs() const { + return llvm::makeArrayRef(getLHSExprs().end(), varlist_size()); + } + + /// Set list of helper reduction expressions, required for proper + /// codegen of the clause. These expressions are binary expressions or + /// operator/custom reduction call that calculates new value from source + /// helper expressions to destination helper expressions. + void setReductionOps(ArrayRef<Expr *> ReductionOps); + + /// Get the list of helper reduction expressions. + MutableArrayRef<Expr *> getReductionOps() { + return MutableArrayRef<Expr *>(getRHSExprs().end(), varlist_size()); + } + ArrayRef<const Expr *> getReductionOps() const { + return llvm::makeArrayRef(getRHSExprs().end(), varlist_size()); + } + +public: + /// Creates clause with a list of variables \a VL. + /// + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param ColonLoc Location of ':'. + /// \param EndLoc Ending location of the clause. + /// \param VL The variables in the clause. + /// \param QualifierLoc The nested-name qualifier with location information + /// \param NameInfo The full name info for reduction identifier. + /// \param Privates List of helper expressions for proper generation of + /// private copies. + /// \param LHSExprs List of helper expressions for proper generation of + /// assignment operation required for copyprivate clause. This list represents + /// LHSs of the reduction expressions. + /// \param RHSExprs List of helper expressions for proper generation of + /// assignment operation required for copyprivate clause. This list represents + /// RHSs of the reduction expressions. + /// Also, variables in these expressions are used for proper initialization of + /// reduction copies. + /// \param ReductionOps List of helper expressions that represents reduction + /// expressions: + /// \code + /// LHSExprs binop RHSExprs; + /// operator binop(LHSExpr, RHSExpr); + /// <CutomReduction>(LHSExpr, RHSExpr); + /// \endcode + /// Required for proper codegen of final reduction operation performed by the + /// reduction clause. + /// \param PreInit Statement that must be executed before entering the OpenMP + /// region with this clause. + /// \param PostUpdate Expression that must be executed after exit from the + /// OpenMP region with this clause. + /// + static OMPTaskReductionClause * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef<Expr *> VL, + NestedNameSpecifierLoc QualifierLoc, + const DeclarationNameInfo &NameInfo, ArrayRef<Expr *> Privates, + ArrayRef<Expr *> LHSExprs, ArrayRef<Expr *> RHSExprs, + ArrayRef<Expr *> ReductionOps, Stmt *PreInit, Expr *PostUpdate); + + /// Creates an empty clause with the place for \a N variables. + /// + /// \param C AST context. + /// \param N The number of variables. + /// + static OMPTaskReductionClause *CreateEmpty(const ASTContext &C, unsigned N); + + /// Gets location of ':' symbol in clause. + SourceLocation getColonLoc() const { return ColonLoc; } + /// Gets the name info for specified reduction identifier. + const DeclarationNameInfo &getNameInfo() const { return NameInfo; } + /// Gets the nested name specifier. + NestedNameSpecifierLoc getQualifierLoc() const { return QualifierLoc; } + + typedef MutableArrayRef<Expr *>::iterator helper_expr_iterator; + typedef ArrayRef<const Expr *>::iterator helper_expr_const_iterator; + typedef llvm::iterator_range<helper_expr_iterator> helper_expr_range; + typedef llvm::iterator_range<helper_expr_const_iterator> + helper_expr_const_range; + + helper_expr_const_range privates() const { + return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); + } + helper_expr_range privates() { + return helper_expr_range(getPrivates().begin(), getPrivates().end()); + } + helper_expr_const_range lhs_exprs() const { + return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); + } + helper_expr_range lhs_exprs() { + return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); + } + helper_expr_const_range rhs_exprs() const { + return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); + } + helper_expr_range rhs_exprs() { + return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); + } + helper_expr_const_range reduction_ops() const { + return helper_expr_const_range(getReductionOps().begin(), + getReductionOps().end()); + } + helper_expr_range reduction_ops() { + return helper_expr_range(getReductionOps().begin(), + getReductionOps().end()); + } + + child_range children() { + return child_range(reinterpret_cast<Stmt **>(varlist_begin()), + reinterpret_cast<Stmt **>(varlist_end())); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == OMPC_task_reduction; + } +}; +
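// Editorial note (a sketch, not part of the original patch): the clause's trailing storage holds five consecutive arrays of length N — the variable list itself (varlist), then the privates, the LHS expressions, the RHS expressions, and the reduction ops — which is why each getter above starts at the end of the previous list.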
/// \brief This represents clause 'linear' in the '#pragma omp ...' /// directives. /// diff --git a/include/clang/AST/RecursiveASTVisitor.h b/include/clang/AST/RecursiveASTVisitor.h index 917b240428e7..e7f271cc0812 100644 --- a/include/clang/AST/RecursiveASTVisitor.h +++ b/include/clang/AST/RecursiveASTVisitor.h @@ -3016,6 +3016,28 @@ RecursiveASTVisitor<Derived>::VisitOMPReductionClause(OMPReductionClause *C) { return true; } +template <typename Derived> +bool RecursiveASTVisitor<Derived>::VisitOMPTaskReductionClause( + OMPTaskReductionClause *C) { + TRY_TO(TraverseNestedNameSpecifierLoc(C->getQualifierLoc())); + TRY_TO(TraverseDeclarationNameInfo(C->getNameInfo())); + TRY_TO(VisitOMPClauseList(C)); + TRY_TO(VisitOMPClauseWithPostUpdate(C)); + for (auto *E : C->privates()) { + TRY_TO(TraverseStmt(E)); + } + for (auto *E : C->lhs_exprs()) { + TRY_TO(TraverseStmt(E)); + } + for (auto *E : C->rhs_exprs()) { + TRY_TO(TraverseStmt(E)); + } + for (auto *E : C->reduction_ops()) { + TRY_TO(TraverseStmt(E)); + } + return true; +} + template <typename Derived> bool RecursiveASTVisitor<Derived>::VisitOMPFlushClause(OMPFlushClause *C) { TRY_TO(VisitOMPClauseList(C)); diff --git a/include/clang/AST/StmtOpenMP.h b/include/clang/AST/StmtOpenMP.h index 463af06fddab..09dd87fdc8bc 100644 --- a/include/clang/AST/StmtOpenMP.h +++ b/include/clang/AST/StmtOpenMP.h @@ -1895,7 +1895,7 @@ class OMPTaskwaitDirective : public OMPExecutableDirective { } }; -/// \brief This represents '#pragma omp taskgroup' directive. +/// This represents '#pragma omp taskgroup' directive.
/// /// \code /// #pragma omp taskgroup @@ -1903,39 +1903,45 @@ class OMPTaskwaitDirective : public OMPExecutableDirective { /// class OMPTaskgroupDirective : public OMPExecutableDirective { friend class ASTStmtReader; - /// \brief Build directive with the given start and end location. + /// Build directive with the given start and end location. /// /// \param StartLoc Starting location of the directive kind. /// \param EndLoc Ending location of the directive. + /// \param NumClauses Number of clauses. /// - OMPTaskgroupDirective(SourceLocation StartLoc, SourceLocation EndLoc) + OMPTaskgroupDirective(SourceLocation StartLoc, SourceLocation EndLoc, + unsigned NumClauses) : OMPExecutableDirective(this, OMPTaskgroupDirectiveClass, OMPD_taskgroup, - StartLoc, EndLoc, 0, 1) {} + StartLoc, EndLoc, NumClauses, 1) {} - /// \brief Build an empty directive. + /// Build an empty directive. + /// \param NumClauses Number of clauses. /// - explicit OMPTaskgroupDirective() + explicit OMPTaskgroupDirective(unsigned NumClauses) : OMPExecutableDirective(this, OMPTaskgroupDirectiveClass, OMPD_taskgroup, - SourceLocation(), SourceLocation(), 0, 1) {} + SourceLocation(), SourceLocation(), NumClauses, + 1) {} public: - /// \brief Creates directive. + /// Creates directive. /// /// \param C AST context. /// \param StartLoc Starting location of the directive kind. /// \param EndLoc Ending Location of the directive. + /// \param Clauses List of clauses. /// \param AssociatedStmt Statement, associated with the directive. /// - static OMPTaskgroupDirective *Create(const ASTContext &C, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AssociatedStmt); + static OMPTaskgroupDirective * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt); - /// \brief Creates an empty directive. + /// Creates an empty directive. /// /// \param C AST context. + /// \param NumClauses Number of clauses. /// - static OMPTaskgroupDirective *CreateEmpty(const ASTContext &C, EmptyShell); + static OMPTaskgroupDirective *CreateEmpty(const ASTContext &C, + unsigned NumClauses, EmptyShell); static bool classof(const Stmt *T) { return T->getStmtClass() == OMPTaskgroupDirectiveClass; diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h index 9eb6d81296d8..64bd3c701985 100644 --- a/include/clang/AST/Type.h +++ b/include/clang/AST/Type.h @@ -3878,6 +3878,7 @@ class AttributedType : public Type, public llvm::FoldingSetNode { attr_sptr, attr_uptr, attr_nonnull, + attr_ns_returns_retained, attr_nullable, attr_null_unspecified, attr_objc_kindof, diff --git a/include/clang/Analysis/Analyses/Dominators.h b/include/clang/Analysis/Analyses/Dominators.h index 1229f8a8efac..38010e1ee1d8 100644 --- a/include/clang/Analysis/Analyses/Dominators.h +++ b/include/clang/Analysis/Analyses/Dominators.h @@ -38,15 +38,15 @@ typedef llvm::DomTreeNodeBase<CFGBlock> DomTreeNode; class DominatorTree : public ManagedAnalysis { virtual void anchor(); public: - llvm::DominatorTreeBase<CFGBlock>* DT; + llvm::DomTreeBase<CFGBlock>* DT; DominatorTree() { - DT = new llvm::DominatorTreeBase<CFGBlock>(false); + DT = new llvm::DomTreeBase<CFGBlock>(); } ~DominatorTree() override { delete DT; } - llvm::DominatorTreeBase<CFGBlock>& getBase() { return *DT; } + llvm::DomTreeBase<CFGBlock>& getBase() { return *DT; } /// \brief This method returns the root CFGBlock of the dominators tree.
 ///
diff --git a/include/clang/Basic/Attr.td b/include/clang/Basic/Attr.td
index bc36fd8c8297..f13e13b0107b 100644
--- a/include/clang/Basic/Attr.td
+++ b/include/clang/Basic/Attr.td
@@ -1802,11 +1802,18 @@ def Target : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let Documentation = [TargetDocs];
   let AdditionalMembers = [{
-    typedef std::pair<std::vector<std::string>, StringRef> ParsedTargetAttr;
+    struct ParsedTargetAttr {
+      std::vector<std::string> Features;
+      StringRef Architecture;
+      bool DuplicateArchitecture = false;
+    };
     ParsedTargetAttr parse() const {
+      return parse(getFeaturesStr());
+    }
+    static ParsedTargetAttr parse(StringRef Features) {
      ParsedTargetAttr Ret;
      SmallVector<StringRef, 1> AttrFeatures;
-      getFeaturesStr().split(AttrFeatures, ",");
+      Features.split(AttrFeatures, ",");
 
      // Grab the various features and prepend a "+" to turn on the feature to
      // the backend and add them to our existing set of features.
@@ -1823,12 +1830,15 @@ def Target : InheritableAttr {
          continue;
 
        // While we're here iterating check for a different target cpu.
-        if (Feature.startswith("arch="))
-          Ret.second = Feature.split("=").second.trim();
-        else if (Feature.startswith("no-"))
-          Ret.first.push_back("-" + Feature.split("-").second.str());
+        if (Feature.startswith("arch=")) {
+          if (!Ret.Architecture.empty())
+            Ret.DuplicateArchitecture = true;
+          else
+            Ret.Architecture = Feature.split("=").second.trim();
+        } else if (Feature.startswith("no-"))
+          Ret.Features.push_back("-" + Feature.split("-").second.str());
        else
-          Ret.first.push_back("+" + Feature.str());
+          Ret.Features.push_back("+" + Feature.str());
      }
      return Ret;
    }
diff --git a/include/clang/Basic/AttrDocs.td b/include/clang/Basic/AttrDocs.td
index 2987f07d8bb4..33ef3ea4cade 100644
--- a/include/clang/Basic/AttrDocs.td
+++ b/include/clang/Basic/AttrDocs.td
@@ -910,13 +910,13 @@ the function declaration for a hypothetical function ``f``:
 
   void f(void) __attribute__((availability(macos,introduced=10.4,deprecated=10.6,obsoleted=10.7)));
 
-The availability attribute states that ``f`` was introduced in Mac OS X 10.4,
-deprecated in Mac OS X 10.6, and obsoleted in Mac OS X 10.7. This information
+The availability attribute states that ``f`` was introduced in macOS 10.4,
+deprecated in macOS 10.6, and obsoleted in macOS 10.7. This information
 is used by Clang to determine when it is safe to use ``f``: for example, if
-Clang is instructed to compile code for Mac OS X 10.5, a call to ``f()``
-succeeds. If Clang is instructed to compile code for Mac OS X 10.6, the call
+Clang is instructed to compile code for macOS 10.5, a call to ``f()``
+succeeds. If Clang is instructed to compile code for macOS 10.6, the call
 succeeds but Clang emits a warning specifying that the function is deprecated.
-Finally, if Clang is instructed to compile code for Mac OS X 10.7, the call
+Finally, if Clang is instructed to compile code for macOS 10.7, the call
 fails because ``f()`` is no longer available.
 
 The availability attribute is a comma-separated list starting with the
@@ -961,7 +961,7 @@ are:
   command-line arguments.
 
 ``macos``
-  Apple's Mac OS X operating system. The minimum deployment target is
+  Apple's macOS operating system. The minimum deployment target is
   specified by the ``-mmacosx-version-min=*version*`` command-line argument.
   ``macosx`` is supported for backward-compatibility reasons, but it is
   deprecated.
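To make the deployment-target behavior described above concrete, here is a small illustrative sketch; it is not part of this patch, and the caller function and version-min values are assumed purely for illustration:

  void f(void) __attribute__((availability(macos,introduced=10.4,deprecated=10.6,obsoleted=10.7)));

  void caller(void) {
    f(); // -mmacosx-version-min=10.5: the call compiles cleanly
         // -mmacosx-version-min=10.6: warning, 'f' is deprecated
         // -mmacosx-version-min=10.7: error, 'f' is unavailable
  }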
@@ -1015,6 +1015,19 @@ When one method overrides another, the overriding method can be more widely avai
   - (id)method __attribute__((availability(macos,introduced=10.3)));  // okay: method moved into base class later
   - (id)method __attribute__((availability(macos,introduced=10.5)));  // error: this method was available via the base class in 10.4
   @end
+
+Starting with the macOS 10.12 SDK, the ``API_AVAILABLE`` macro from
+``<os/availability.h>`` can simplify the spelling:
+
+.. code-block:: objc
+
+  @interface A
+  - (id)method API_AVAILABLE(macos(10.11));
+  - (id)otherMethod API_AVAILABLE(macos(10.11), ios(11.0));
+  @end
+
+Also see the documentation for `@available
+<https://clang.llvm.org/docs/LanguageExtensions.html#objective-c-available>`_
 }];
 }
diff --git a/include/clang/Basic/Builtins.def b/include/clang/Basic/Builtins.def
index 75781dc7491d..1ddb9beaf913 100644
--- a/include/clang/Basic/Builtins.def
+++ b/include/clang/Basic/Builtins.def
@@ -1413,6 +1413,11 @@ BUILTIN(__builtin_os_log_format, "v*v*cC*.", "p:0:nt")
 // Builtins for XRay
 BUILTIN(__xray_customevent, "vcC*z", "")
 
+// Win64-compatible va_list functions
+BUILTIN(__builtin_ms_va_start, "vc*&.", "nt")
+BUILTIN(__builtin_ms_va_end, "vc*&", "n")
+BUILTIN(__builtin_ms_va_copy, "vc*&c*&", "n")
+
 #undef BUILTIN
 #undef LIBBUILTIN
 #undef LANGBUILTIN
diff --git a/include/clang/Basic/BuiltinsHexagon.def b/include/clang/Basic/BuiltinsHexagon.def
index 85936cbfc08e..14fc4adc25bc 100644
--- a/include/clang/Basic/BuiltinsHexagon.def
+++ b/include/clang/Basic/BuiltinsHexagon.def
@@ -882,6 +882,12 @@ BUILTIN(__builtin_HEXAGON_S2_ct0p,"iLLi","")
 BUILTIN(__builtin_HEXAGON_S2_ct1p,"iLLi","")
 BUILTIN(__builtin_HEXAGON_S2_interleave,"LLiLLi","")
 BUILTIN(__builtin_HEXAGON_S2_deinterleave,"LLiLLi","")
+BUILTIN(__builtin_HEXAGON_Y2_dccleana,"vv*","")
+BUILTIN(__builtin_HEXAGON_Y2_dccleaninva,"vv*","")
+BUILTIN(__builtin_HEXAGON_Y2_dcinva,"vv*","")
+BUILTIN(__builtin_HEXAGON_Y2_dczeroa,"vv*","")
+BUILTIN(__builtin_HEXAGON_Y4_l2fetch,"vv*Ui","")
+BUILTIN(__builtin_HEXAGON_Y5_l2fetch,"vv*LLUi","")
 
 BUILTIN(__builtin_HEXAGON_S6_rol_i_r,"iii","v:60:")
 BUILTIN(__builtin_HEXAGON_S6_rol_i_p,"LLiLLii","v:60:")
diff --git a/include/clang/Basic/BuiltinsSystemZ.def b/include/clang/Basic/BuiltinsSystemZ.def
index fa96e10b3990..ac92286af0b5 100644
--- a/include/clang/Basic/BuiltinsSystemZ.def
+++ b/include/clang/Basic/BuiltinsSystemZ.def
@@ -253,5 +253,29 @@ TARGET_BUILTIN(__builtin_s390_vfmsdb, "V2dV2dV2dV2d", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vfsqdb, "V2dV2d", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vftcidb, "V2SLLiV2dIii*", "nc", "vector")
 
+// Vector-enhancements facility 1 intrinsics.
+TARGET_BUILTIN(__builtin_s390_vlrl, "V16ScUivC*", "", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vstrl, "vV16ScUiv*", "", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vbperm, "V2ULLiV16UcV16Uc", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vmslg, "V16UcV2ULLiV2ULLiV16UcIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfmaxdb, "V2dV2dV2dIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfmindb, "V2dV2dV2dIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfnmadb, "V2dV2dV2dV2d", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfnmsdb, "V2dV2dV2dV2d", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfcesbs, "V4SiV4fV4fi*", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfchsbs, "V4SiV4fV4fi*", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfchesbs, "V4SiV4fV4fi*", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfisb, "V4fV4fIiIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfmaxsb, "V4fV4fV4fIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfminsb, "V4fV4fV4fIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vflnsb, "V4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vflpsb, "V4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfmasb, "V4fV4fV4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfmssb, "V4fV4fV4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfnmasb, "V4fV4fV4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfnmssb, "V4fV4fV4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vfsqsb, "V4fV4f", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vftcisb, "V4SiV4fIii*", "nc", "vector-enhancements-1")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index 4cd3f1d46473..a516bf6bf06c 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -34,11 +34,6 @@
 // can use it?
 BUILTIN(__builtin_cpu_supports, "bcC*", "nc")
 
-// Win64-compatible va_list functions
-BUILTIN(__builtin_ms_va_start, "vc*&.", "nt")
-BUILTIN(__builtin_ms_va_end, "vc*&", "n")
-BUILTIN(__builtin_ms_va_copy, "vc*&c*&", "n")
-
 // Undefined Values
 //
 TARGET_BUILTIN(__builtin_ia32_undef128, "V2d", "nc", "")
diff --git a/include/clang/Basic/DiagnosticGroups.td b/include/clang/Basic/DiagnosticGroups.td
index 3a0564806b32..53d8f36ecd00 100644
--- a/include/clang/Basic/DiagnosticGroups.td
+++ b/include/clang/Basic/DiagnosticGroups.td
@@ -152,6 +152,8 @@ def GNUFoldingConstant : DiagGroup<"gnu-folding-constant">;
 def FormatExtraArgs : DiagGroup<"format-extra-args">;
 def FormatZeroLength : DiagGroup<"format-zero-length">;
 def CXX1zCompatMangling : DiagGroup<"c++1z-compat-mangling">;
+// Name of this warning in GCC.
+def NoexceptType : DiagGroup<"noexcept-type", [CXX1zCompatMangling]>;
 
 // Warnings for C++1y code which is not compatible with prior C++ standards.
 def CXXPre14Compat : DiagGroup<"c++98-c++11-compat">;
diff --git a/include/clang/Basic/DiagnosticIDs.h b/include/clang/Basic/DiagnosticIDs.h
index 479d1978c62d..cdd358542a0d 100644
--- a/include/clang/Basic/DiagnosticIDs.h
+++ b/include/clang/Basic/DiagnosticIDs.h
@@ -18,6 +18,7 @@
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/StringRef.h"
+#include <vector>
 
 namespace clang {
   class DiagnosticsEngine;
@@ -263,6 +264,13 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
   /// are not SFINAE errors.
   static SFINAEResponse getDiagnosticSFINAEResponse(unsigned DiagID);
 
+  /// \brief Get the string of all diagnostic flags.
+  ///
+  /// \returns A list of all diagnostics flags as they would be written in a
+  /// command line invocation including their `no-` variants. For example:
+  /// `{"-Wempty-body", "-Wno-empty-body", ...}`
+  static std::vector<std::string> getDiagnosticFlags();
+
   /// \brief Get the set of all diagnostic IDs in the group with the given name.
   ///
   /// \param[out] Diags - On return, the diagnostics in the group.
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 5a8750e4dab6..af14638e1d61 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -808,8 +808,10 @@ def warn_property_types_are_incompatible : Warning<
   "property type %0 is incompatible with type %1 inherited from %2">,
   InGroup<DiagGroup<"incompatible-property-type">>;
 def warn_protocol_property_mismatch : Warning<
-  "property of type %0 was selected for synthesis">,
+  "property %select{of type %1|with attribute '%1'|without attribute '%1'|with "
+  "getter %1|with setter %1}0 was selected for synthesis">,
   InGroup<DiagGroup<"protocol-property-synthesis-ambiguity">>;
+def err_protocol_property_mismatch: Error<warn_protocol_property_mismatch.Text>;
 def err_undef_interface : Error<"cannot find interface declaration for %0">;
 def err_category_forward_interface : Error<
   "cannot define %select{category|class extension}0 for undefined class %1">;
@@ -1088,7 +1090,9 @@ def err_category_property : Error<
 def note_property_declare : Note<
   "property declared here">;
 def note_protocol_property_declare : Note<
-  "it could also be property of type %0 declared here">;
+  "it could also be property "
+  "%select{of type %1|without attribute '%1'|with attribute '%1'|with getter "
+  "%1|with setter %1}0 declared here">;
 def note_property_synthesize : Note<
   "property synthesized here">;
 def err_synthesize_category_decl : Error<
@@ -4575,8 +4579,11 @@ def warn_deprecated_fwdclass_message : Warning<
     "%0 may be deprecated because the receiver type is unknown">,
     InGroup<DeprecatedDeclarations>;
 def warn_deprecated_def : Warning<
-    "Implementing deprecated %select{method|class|category}0">,
-    InGroup<DeprecatedImplementations>, DefaultIgnore;
+    "implementing deprecated %select{method|class|category}0">,
+    InGroup<DeprecatedImplementations>, DefaultIgnore;
+def warn_unavailable_def : Warning<
+    "implementing unavailable method">,
+    InGroup<DeprecatedImplementations>, DefaultIgnore;
 def err_unavailable : Error<"%0 is unavailable">;
 def err_property_method_unavailable :
     Error<"property access is using %0 method which is unavailable">;
@@ -8106,10 +8113,10 @@ def err_systemz_invalid_tabort_code : Error<
   "invalid transaction abort code">;
 def err_64_bit_builtin_32_bit_tgt : Error<
   "this builtin is only available on 64-bit targets">;
+def err_builtin_x64_aarch64_only : Error<
+  "this builtin is only available on x86-64 and aarch64 targets">;
 def err_ppc_builtin_only_on_pwr7 : Error<
   "this builtin is only valid on POWER7 or later CPUs">;
-def err_x86_builtin_64_only : Error<
-  "this builtin is only available on x86-64 targets">;
 def err_x86_builtin_invalid_rounding : Error<
   "invalid rounding argument">;
 def err_x86_builtin_invalid_scale : Error<
   "invalid scale argument">;
@@ -8648,11 +8655,11 @@ def err_omp_unknown_reduction_identifier : Error<
 def err_omp_not_resolved_reduction_identifier : Error<
   "unable to resolve declare reduction construct for type %0">;
 def err_omp_reduction_ref_type_arg : Error<
-  "argument of OpenMP clause 'reduction' must reference the same object in all threads">;
+  "argument of OpenMP clause '%0' must reference the same object in all threads">;
 def err_omp_clause_not_arithmetic_type_arg : Error<
-  "arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of %select{scalar|arithmetic}0 type">;
+  "arguments of OpenMP clause '%0' for 'min' or 'max' must be of %select{scalar|arithmetic}1 type">;
 def err_omp_clause_floating_type_arg : Error<
-  "arguments of OpenMP clause 'reduction' with bitwise operators cannot be of floating type">;
+  "arguments of OpenMP clause '%0' with bitwise operators cannot be of floating type">;
 def err_omp_once_referenced : Error<
   "variable can appear only once in OpenMP '%0' clause">;
 def err_omp_once_referenced_in_target_update : Error<
diff --git a/include/clang/Basic/DiagnosticSerializationKinds.td b/include/clang/Basic/DiagnosticSerializationKinds.td
index 0fc54848581c..420ccebbfaf0 100644
--- a/include/clang/Basic/DiagnosticSerializationKinds.td
+++ b/include/clang/Basic/DiagnosticSerializationKinds.td
@@ -147,18 +147,29 @@ def err_module_odr_violation_mismatch_decl_diff : Error<
   "%select{non-|}5mutable field %4|"
   "field %4 with %select{no|an}5 initalizer|"
   "field %4 with an initializer|"
-  "method %4|"
-  "method %4 is %select{not deleted|deleted}5|"
-  "method %4 is %select{|pure }5%select{not virtual|virtual}6|"
-  "method %4 is %select{not static|static}5|"
-  "method %4 is %select{not volatile|volatile}5|"
-  "method %4 is %select{not const|const}5|"
-  "method %4 is %select{not inline|inline}5|"
-  "method %4 that has %5 parameter%s5|"
-  "method %4 with %ordinal5 parameter of type %6%select{| decayed from %8}7|"
-  "method %4 with %ordinal5 parameter named %6|"
-  "method %4 with %ordinal5 parameter with%select{out|}6 a default argument|"
-  "method %4 with %ordinal5 parameter with a default argument|"
+  "%select{method %5|constructor|destructor}4|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{not deleted|deleted}6|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{|pure }6%select{not virtual|virtual}7|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{not static|static}6|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{not volatile|volatile}6|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{not const|const}6|"
+  "%select{method %5|constructor|destructor}4 "
+  "is %select{not inline|inline}6|"
+  "%select{method %5|constructor|destructor}4 "
+  "that has %6 parameter%s6|"
+  "%select{method %5|constructor|destructor}4 "
+  "with %ordinal6 parameter of type %7%select{| decayed from %9}8|"
+  "%select{method %5|constructor|destructor}4 "
+  "with %ordinal6 parameter named %7|"
+  "%select{method %5|constructor|destructor}4 "
+  "with %ordinal6 parameter with%select{out|}7 a default argument|"
+  "%select{method %5|constructor|destructor}4 "
+  "with %ordinal6 parameter with a default argument|"
   "%select{typedef|type alias}4 name %5|"
   "%select{typedef|type alias}4 %5 with underlying type %6|"
   "data member with name %4|"
@@ -183,18 +194,29 @@ def note_module_odr_violation_mismatch_decl_diff : Note<"but in '%0' found "
"%select{non-|}3mutable field %2|" "field %2 with %select{no|an}3 initializer|" "field %2 with a different initializer|" - "method %2|" - "method %2 is %select{not deleted|deleted}3|" - "method %2 is %select{|pure }3%select{not virtual|virtual}4|" - "method %2 is %select{not static|static}3|" - "method %2 is %select{not volatile|volatile}3|" - "method %2 is %select{not const|const}3|" - "method %2 is %select{not inline|inline}3|" - "method %2 that has %3 parameter%s3|" - "method %2 with %ordinal3 parameter of type %4%select{| decayed from %6}5|" - "method %2 with %ordinal3 parameter named %4|" - "method %2 with %ordinal3 parameter with%select{out|}4 a default argument|" - "method %2 with %ordinal3 parameter with a different default argument|" + "%select{method %3|constructor|destructor}2|" + "%select{method %3|constructor|destructor}2 " + "is %select{not deleted|deleted}4|" + "%select{method %3|constructor|destructor}2 " + "is %select{|pure }4%select{not virtual|virtual}5|" + "%select{method %3|constructor|destructor}2 " + "is %select{not static|static}4|" + "%select{method %3|constructor|destructor}2 " + "is %select{not volatile|volatile}4|" + "%select{method %3|constructor|destructor}2 " + "is %select{not const|const}4|" + "%select{method %3|constructor|destructor}2 " + "is %select{not inline|inline}4|" + "%select{method %3|constructor|destructor}2 " + "that has %4 parameter%s4|" + "%select{method %3|constructor|destructor}2 " + "with %ordinal4 parameter of type %5%select{| decayed from %7}6|" + "%select{method %3|constructor|destructor}2 " + "with %ordinal4 parameter named %5|" + "%select{method %3|constructor|destructor}2 " + "with %ordinal4 parameter with%select{out|}5 a default argument|" + "%select{method %3|constructor|destructor}2 " + "with %ordinal4 parameter with a different default argument|" "%select{typedef|type alias}2 name %3|" "%select{typedef|type alias}2 %3 with different underlying type %4|" "data member with name %2|" diff --git a/include/clang/Basic/IdentifierTable.h b/include/clang/Basic/IdentifierTable.h index 9b1ba4a98e6f..f94b2c9b2f42 100644 --- a/include/clang/Basic/IdentifierTable.h +++ b/include/clang/Basic/IdentifierTable.h @@ -272,10 +272,6 @@ class IdentifierInfo { /// this identifier is a C++ alternate representation of an operator. void setIsCPlusPlusOperatorKeyword(bool Val = true) { IsCPPOperatorKeyword = Val; - if (Val) - NeedsHandleIdentifier = true; - else - RecomputeNeedsHandleIdentifier(); } bool isCPlusPlusOperatorKeyword() const { return IsCPPOperatorKeyword; } @@ -381,10 +377,9 @@ class IdentifierInfo { /// This method is very tied to the definition of HandleIdentifier. Any /// change to it should be reflected here. 
   void RecomputeNeedsHandleIdentifier() {
-    NeedsHandleIdentifier =
-        (isPoisoned() | hasMacroDefinition() | isCPlusPlusOperatorKeyword() |
-         isExtensionToken() | isFutureCompatKeyword() || isOutOfDate() ||
-         isModulesImport());
+    NeedsHandleIdentifier = isPoisoned() || hasMacroDefinition() ||
+                            isExtensionToken() || isFutureCompatKeyword() ||
+                            isOutOfDate() || isModulesImport();
   }
 };
diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def
index dfdad108922a..c9230e0aaa6f 100644
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -90,6 +90,7 @@ LANGOPT(CPlusPlus , 1, 0, "C++")
 LANGOPT(CPlusPlus11       , 1, 0, "C++11")
 LANGOPT(CPlusPlus14       , 1, 0, "C++14")
 LANGOPT(CPlusPlus1z       , 1, 0, "C++1z")
+LANGOPT(CPlusPlus2a       , 1, 0, "C++2a")
 LANGOPT(ObjC1             , 1, 0, "Objective-C 1")
 LANGOPT(ObjC2             , 1, 0, "Objective-C 2")
 BENIGN_LANGOPT(ObjCDefaultSynthProperties , 1, 0,
diff --git a/include/clang/Basic/OpenMPKinds.def b/include/clang/Basic/OpenMPKinds.def
index aae1c3a9b8c5..645ed52b59ca 100644
--- a/include/clang/Basic/OpenMPKinds.def
+++ b/include/clang/Basic/OpenMPKinds.def
@@ -168,6 +168,9 @@
 #ifndef OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE
 #define OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE(Name)
 #endif
+#ifndef OPENMP_TASKGROUP_CLAUSE
+#define OPENMP_TASKGROUP_CLAUSE(Name)
+#endif
 
 // OpenMP directives.
 OPENMP_DIRECTIVE(threadprivate)
@@ -270,6 +273,7 @@ OPENMP_CLAUSE(to, OMPToClause)
 OPENMP_CLAUSE(from, OMPFromClause)
 OPENMP_CLAUSE(use_device_ptr, OMPUseDevicePtrClause)
 OPENMP_CLAUSE(is_device_ptr, OMPIsDevicePtrClause)
+OPENMP_CLAUSE(task_reduction, OMPTaskReductionClause)
 
 // Clauses allowed for OpenMP directive 'parallel'.
 OPENMP_PARALLEL_CLAUSE(if)
@@ -848,6 +852,10 @@ OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE(aligned)
 OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE(safelen)
 OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE(simdlen)
 
+// Clauses allowed for OpenMP directive 'taskgroup'.
+OPENMP_TASKGROUP_CLAUSE(task_reduction)
+
+#undef OPENMP_TASKGROUP_CLAUSE
 #undef OPENMP_TASKLOOP_SIMD_CLAUSE
 #undef OPENMP_TASKLOOP_CLAUSE
 #undef OPENMP_LINEAR_KIND
diff --git a/include/clang/Basic/Specifiers.h b/include/clang/Basic/Specifiers.h
index 33952f83ff23..50fb936e01d1 100644
--- a/include/clang/Basic/Specifiers.h
+++ b/include/clang/Basic/Specifiers.h
@@ -236,7 +236,7 @@ namespace clang {
     CC_X86ThisCall,   // __attribute__((thiscall))
     CC_X86VectorCall, // __attribute__((vectorcall))
     CC_X86Pascal,     // __attribute__((pascal))
-    CC_X86_64Win64,   // __attribute__((ms_abi))
+    CC_Win64,         // __attribute__((ms_abi))
     CC_X86_64SysV,    // __attribute__((sysv_abi))
     CC_X86RegCall,    // __attribute__((regcall))
     CC_AAPCS,         // __attribute__((pcs("aapcs")))
diff --git a/include/clang/Basic/TargetInfo.h b/include/clang/Basic/TargetInfo.h
index 5885532b91db..d1a9ea85dbe9 100644
--- a/include/clang/Basic/TargetInfo.h
+++ b/include/clang/Basic/TargetInfo.h
@@ -226,6 +226,20 @@ class TargetInfo : public RefCountedBase<TargetInfo> {
 
 public:
   IntType getSizeType() const { return SizeType; }
+  IntType getSignedSizeType() const {
+    switch (SizeType) {
+    case UnsignedShort:
+      return SignedShort;
+    case UnsignedInt:
+      return SignedInt;
+    case UnsignedLong:
+      return SignedLong;
+    case UnsignedLongLong:
+      return SignedLongLong;
+    default:
+      llvm_unreachable("Invalid SizeType");
+    }
+  }
   IntType getIntMaxType() const { return IntMaxType; }
   IntType getUIntMaxType() const {
     return getCorrespondingUnsignedType(IntMaxType);
diff --git a/include/clang/Config/config.h.cmake b/include/clang/Config/config.h.cmake
index 6971b4e9f06d..b138b5fcd828 100644
--- a/include/clang/Config/config.h.cmake
+++ b/include/clang/Config/config.h.cmake
@@ -56,4 +56,9 @@
 /* enable x86 relax relocations by default */
 #cmakedefine01 ENABLE_X86_RELAX_RELOCATIONS
 
+/* Enable each functionality of modules */
+#cmakedefine CLANG_ENABLE_ARCMT
+#cmakedefine CLANG_ENABLE_OBJC_REWRITER
+#cmakedefine CLANG_ENABLE_STATIC_ANALYZER
+
 #endif
diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index 861dfbf1916e..753c178eec6a 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -2119,6 +2119,7 @@ def nofixprebinding : Flag<["-"], "nofixprebinding">;
 def nolibc : Flag<["-"], "nolibc">;
 def nomultidefs : Flag<["-"], "nomultidefs">;
 def nopie : Flag<["-"], "nopie">;
+def no_pie : Flag<["-"], "no-pie">, Alias<nopie>;
 def noprebind : Flag<["-"], "noprebind">;
 def noseglinkedit : Flag<["-"], "noseglinkedit">;
 def nostartfiles : Flag<["-"], "nostartfiles">;
diff --git a/include/clang/Frontend/LangStandard.h b/include/clang/Frontend/LangStandard.h
index ec32aa8d161f..6731e08bcae8 100644
--- a/include/clang/Frontend/LangStandard.h
+++ b/include/clang/Frontend/LangStandard.h
@@ -26,11 +26,12 @@ enum LangFeatures {
   CPlusPlus11 = (1 << 4),
   CPlusPlus14 = (1 << 5),
   CPlusPlus1z = (1 << 6),
-  Digraphs = (1 << 7),
-  GNUMode = (1 << 8),
-  HexFloat = (1 << 9),
-  ImplicitInt = (1 << 10),
-  OpenCL = (1 << 11)
+  CPlusPlus2a = (1 << 7),
+  Digraphs = (1 << 8),
+  GNUMode = (1 << 9),
+  HexFloat = (1 << 10),
+  ImplicitInt = (1 << 11),
+  OpenCL = (1 << 12)
 };
 
 }
@@ -81,6 +82,10 @@ struct LangStandard {
   /// isCPlusPlus1z - Language is a C++17 variant (or later).
   bool isCPlusPlus1z() const { return Flags & frontend::CPlusPlus1z; }
 
+  /// isCPlusPlus2a - Language is a post-C++17 variant (or later).
+  bool isCPlusPlus2a() const { return Flags & frontend::CPlusPlus2a; }
+
   /// hasDigraphs - Language supports digraphs.
   bool hasDigraphs() const { return Flags & frontend::Digraphs; }
diff --git a/include/clang/Frontend/LangStandards.def b/include/clang/Frontend/LangStandards.def
index 1d214fd2a2be..669e487023a5 100644
--- a/include/clang/Frontend/LangStandards.def
+++ b/include/clang/Frontend/LangStandards.def
@@ -119,6 +119,16 @@ LANGSTANDARD(gnucxx1z, "gnu++1z",
              LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus1z |
              Digraphs | HexFloat | GNUMode)
 
+LANGSTANDARD(cxx2a, "c++2a",
+             CXX, "Working draft for ISO C++ 2020",
+             LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus1z |
+             CPlusPlus2a | Digraphs | HexFloat)
+
+LANGSTANDARD(gnucxx2a, "gnu++2a",
+             CXX, "Working draft for ISO C++ 2020 with GNU extensions",
+             LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus1z |
+             CPlusPlus2a | Digraphs | HexFloat | GNUMode)
+
 // OpenCL
 LANGSTANDARD(opencl10, "cl1.0",
              OpenCL, "OpenCL 1.0",
diff --git a/include/clang/Index/IndexingAction.h b/include/clang/Index/IndexingAction.h
index 8eed33c61227..fb703be4e5f5 100644
--- a/include/clang/Index/IndexingAction.h
+++ b/include/clang/Index/IndexingAction.h
@@ -11,11 +11,14 @@
 #define LLVM_CLANG_INDEX_INDEXINGACTION_H
 
 #include "clang/Basic/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
 #include <memory>
 
 namespace clang {
+  class ASTContext;
   class ASTReader;
   class ASTUnit;
+  class Decl;
   class FrontendAction;
 
 namespace serialization {
@@ -47,8 +50,11 @@ void indexASTUnit(ASTUnit &Unit,
                   std::shared_ptr<IndexDataConsumer> DataConsumer,
                   IndexingOptions Opts);
 
-void indexModuleFile(serialization::ModuleFile &Mod,
-                     ASTReader &Reader,
+void indexTopLevelDecls(ASTContext &Ctx, ArrayRef<const Decl *> Decls,
+                        std::shared_ptr<IndexDataConsumer> DataConsumer,
+                        IndexingOptions Opts);
+
+void indexModuleFile(serialization::ModuleFile &Mod, ASTReader &Reader,
                      std::shared_ptr<IndexDataConsumer> DataConsumer,
                      IndexingOptions Opts);
 
diff --git a/include/clang/Lex/MacroInfo.h b/include/clang/Lex/MacroInfo.h
index 7da1e7b41ab8..d25431b55fdc 100644
--- a/include/clang/Lex/MacroInfo.h
+++ b/include/clang/Lex/MacroInfo.h
@@ -42,14 +42,14 @@ class MacroInfo {
 
   /// \brief The list of arguments for a function-like macro.
   ///
-  /// ArgumentList points to the first of NumArguments pointers.
+  /// ParameterList points to the first of NumParameters pointers.
   ///
   /// This can be empty, for, e.g. "#define X()".  In a C99-style variadic
   /// macro, this includes the \c __VA_ARGS__ identifier on the list.
-  IdentifierInfo **ArgumentList;
+  IdentifierInfo **ParameterList;
 
-  /// \see ArgumentList
-  unsigned NumArguments;
+  /// \see ParameterList
+  unsigned NumParameters;
 
   /// \brief This is the list of tokens that the macro is defined to.
   SmallVector<Token, 8> ReplacementTokens;
 
@@ -153,37 +153,37 @@ class MacroInfo {
   /// \brief Set the value of the IsWarnIfUnused flag.
   void setIsWarnIfUnused(bool val) { IsWarnIfUnused = val; }
 
-  /// \brief Set the specified list of identifiers as the argument list for
+  /// \brief Set the specified list of identifiers as the parameter list for
   /// this macro.
-  void setArgumentList(ArrayRef<IdentifierInfo *> List,
+  void setParameterList(ArrayRef<IdentifierInfo *> List,
                        llvm::BumpPtrAllocator &PPAllocator) {
-    assert(ArgumentList == nullptr && NumArguments == 0 &&
-           "Argument list already set!");
+    assert(ParameterList == nullptr && NumParameters == 0 &&
+           "Parameter list already set!");
     if (List.empty())
       return;
 
-    NumArguments = List.size();
-    ArgumentList = PPAllocator.Allocate<IdentifierInfo *>(List.size());
-    std::copy(List.begin(), List.end(), ArgumentList);
+    NumParameters = List.size();
+    ParameterList = PPAllocator.Allocate<IdentifierInfo *>(List.size());
+    std::copy(List.begin(), List.end(), ParameterList);
   }
 
-  /// Arguments - The list of arguments for a function-like macro.  This can be
-  /// empty, for, e.g. "#define X()".
-  typedef IdentifierInfo *const *arg_iterator;
-  bool arg_empty() const { return NumArguments == 0; }
-  arg_iterator arg_begin() const { return ArgumentList; }
-  arg_iterator arg_end() const { return ArgumentList + NumArguments; }
-  unsigned getNumArgs() const { return NumArguments; }
-  ArrayRef<IdentifierInfo *> args() const {
-    return ArrayRef<IdentifierInfo *>(ArgumentList, NumArguments);
+  /// Parameters - The list of parameters for a function-like macro.  This can
+  /// be empty, for, e.g. "#define X()".
+  typedef IdentifierInfo *const *param_iterator;
+  bool param_empty() const { return NumParameters == 0; }
+  param_iterator param_begin() const { return ParameterList; }
+  param_iterator param_end() const { return ParameterList + NumParameters; }
+  unsigned getNumParams() const { return NumParameters; }
+  ArrayRef<IdentifierInfo *> params() const {
+    return ArrayRef<IdentifierInfo *>(ParameterList, NumParameters);
   }
 
-  /// \brief Return the argument number of the specified identifier,
-  /// or -1 if the identifier is not a formal argument identifier.
-  int getArgumentNum(const IdentifierInfo *Arg) const {
-    for (arg_iterator I = arg_begin(), E = arg_end(); I != E; ++I)
+  /// \brief Return the parameter number of the specified identifier,
+  /// or -1 if the identifier is not a formal parameter identifier.
+  int getParameterNum(const IdentifierInfo *Arg) const {
+    for (param_iterator I = param_begin(), E = param_end(); I != E; ++I)
       if (*I == Arg)
-        return I - arg_begin();
+        return I - param_begin();
     return -1;
   }
diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h
index 62090d6496ed..a058fbfbb4cf 100644
--- a/include/clang/Lex/Preprocessor.h
+++ b/include/clang/Lex/Preprocessor.h
@@ -1813,11 +1813,24 @@ class Preprocessor {
   void ReadMacroName(Token &MacroNameTok, MacroUse IsDefineUndef = MU_Other,
                      bool *ShadowFlag = nullptr);
 
+  /// ReadOptionalMacroParameterListAndBody - This consumes all (i.e. the
+  /// entire line) of the macro's tokens and adds them to MacroInfo, and while
+  /// doing so performs certain validity checks including (but not limited to):
+  ///   - # (stringization) is followed by a macro parameter
+  /// \param MacroNameTok - Token that represents the macro name
+  /// \param ImmediatelyAfterHeaderGuard - Macro follows an #ifdef header guard
+  ///
+  /// Either returns a pointer to a MacroInfo object OR emits a diagnostic and
+  /// returns a nullptr if an invalid sequence of tokens is encountered.
+
+  MacroInfo *ReadOptionalMacroParameterListAndBody(
+      const Token &MacroNameTok, bool ImmediatelyAfterHeaderGuard);
+
   /// The ( starting an argument list of a macro definition has just been read.
-  /// Lex the rest of the arguments and the closing ), updating \p MI with
+  /// Lex the rest of the parameters and the closing ), updating \p MI with
   /// what we learn and saving in \p LastTok the last token read.
   /// Return true if an error occurs parsing the arg list.
-  bool ReadMacroDefinitionArgList(MacroInfo *MI, Token& LastTok);
+  bool ReadMacroParameterList(MacroInfo *MI, Token& LastTok);
 
   /// We just read a \#if or related directive and decided that the
   /// subsequent tokens are in the \#if'd out portion of the
@@ -1878,7 +1891,7 @@ class Preprocessor {
 
   /// After reading "MACRO(", this method is invoked to read all of the formal
   /// arguments specified for the macro invocation.  Returns null on error.
-  MacroArgs *ReadFunctionLikeMacroArgs(Token &MacroName, MacroInfo *MI,
+  MacroArgs *ReadMacroCallArgumentList(Token &MacroName, MacroInfo *MI,
                                        SourceLocation &ExpansionEnd);
 
   /// \brief If an identifier token is read that is to be expanded
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 95629a2591cf..5a708545705c 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -3881,13 +3881,10 @@ class Sema {
 
   void redelayDiagnostics(sema::DelayedDiagnosticPool &pool);
 
-  void EmitAvailabilityWarning(AvailabilityResult AR,
-                               const NamedDecl *ReferringDecl,
-                               const NamedDecl *OffendingDecl,
-                               StringRef Message, SourceLocation Loc,
-                               const ObjCInterfaceDecl *UnknownObjCClass,
-                               const ObjCPropertyDecl *ObjCProperty,
-                               bool ObjCPropertyAccess);
+  void DiagnoseAvailabilityOfDecl(NamedDecl *D, SourceLocation Loc,
+                                  const ObjCInterfaceDecl *UnknownObjCClass,
+                                  bool ObjCPropertyAccess,
+                                  bool AvoidPartialAvailabilityChecks = false);
 
   bool makeUnavailableInSystemHeader(SourceLocation loc,
                                      UnavailableAttr::ImplicitReason reason);
@@ -8380,6 +8377,8 @@ class Sema {
                                       unsigned SpellingListIndex, bool isNSConsumed,
                                       bool isTemplateInstantiation);
 
+  bool checkNSReturnsRetainedReturnType(SourceLocation loc, QualType type);
+
   //===--------------------------------------------------------------------===//
   // C++ Coroutines TS
   //
@@ -8680,7 +8679,8 @@ class Sema {
   StmtResult ActOnOpenMPTaskwaitDirective(SourceLocation StartLoc,
                                           SourceLocation EndLoc);
   /// \brief Called on well-formed '\#pragma omp taskgroup'.
-  StmtResult ActOnOpenMPTaskgroupDirective(Stmt *AStmt, SourceLocation StartLoc,
+  StmtResult ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
+                                           Stmt *AStmt, SourceLocation StartLoc,
                                            SourceLocation EndLoc);
   /// \brief Called on well-formed '\#pragma omp flush'.
   StmtResult ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
@@ -9022,6 +9022,13 @@ class Sema {
                                       CXXScopeSpec &ReductionIdScopeSpec,
                                       const DeclarationNameInfo &ReductionId,
                                       ArrayRef<Expr *> UnresolvedReductions = llvm::None);
+  /// Called on well-formed 'task_reduction' clause.
+  OMPClause *ActOnOpenMPTaskReductionClause(
+      ArrayRef<Expr *> VarList, SourceLocation StartLoc,
+      SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc,
+      CXXScopeSpec &ReductionIdScopeSpec,
+      const DeclarationNameInfo &ReductionId,
+      ArrayRef<Expr *> UnresolvedReductions = llvm::None);
   /// \brief Called on well-formed 'linear' clause.
   OMPClause *
   ActOnOpenMPLinearClause(ArrayRef<Expr *> VarList, Expr *Step,
@@ -10416,15 +10423,6 @@ class Sema {
     return OriginalLexicalContext ? OriginalLexicalContext : CurContext;
   }
 
-  /// The diagnostic we should emit for \c D, and the declaration that
-  /// originated it, or \c AR_Available.
-  ///
-  /// \param D The declaration to check.
-  /// \param Message If non-null, this will be populated with the message from
-  /// the availability attribute that is selected.
-  std::pair<AvailabilityResult, const NamedDecl *>
-  ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D, std::string *Message);
-
   const DeclContext *getCurObjCLexicalContext() const {
     const DeclContext *DC = getCurLexicalContext();
     // A category implicitly has the attribute of the interface.
diff --git a/include/clang/Tooling/DiagnosticsYaml.h b/include/clang/Tooling/DiagnosticsYaml.h
index f32b9fa9c94b..4d6ff063641b 100644
--- a/include/clang/Tooling/DiagnosticsYaml.h
+++ b/include/clang/Tooling/DiagnosticsYaml.h
@@ -56,6 +56,9 @@ template <> struct MappingTraits<clang::tooling::Diagnostic> {
     MappingNormalization<NormalizedDiagnostic, clang::tooling::Diagnostic> Keys(
        Io, D);
     Io.mapRequired("DiagnosticName", Keys->DiagnosticName);
+    Io.mapRequired("Message", Keys->Message.Message);
+    Io.mapRequired("FileOffset", Keys->Message.FileOffset);
+    Io.mapRequired("FilePath", Keys->Message.FilePath);
 
     // FIXME: Export properly all the different fields.
 
@@ -82,17 +85,7 @@ template <> struct MappingTraits<clang::tooling::TranslationUnitDiagnostics> {
   static void mapping(IO &Io, clang::tooling::TranslationUnitDiagnostics &Doc) {
     Io.mapRequired("MainSourceFile", Doc.MainSourceFile);
-
-    std::vector<clang::tooling::Diagnostic> Diagnostics;
-    for (auto &Diagnostic : Doc.Diagnostics) {
-      // FIXME: Export all diagnostics, not just the ones with fixes.
-      // Update MappingTraits<clang::tooling::Diagnostic>::mapping.
-      if (Diagnostic.Fix.size() > 0) {
-        Diagnostics.push_back(Diagnostic);
-      }
-    }
-    Io.mapRequired("Diagnostics", Diagnostics);
-    Doc.Diagnostics = Diagnostics;
+    Io.mapRequired("Diagnostics", Doc.Diagnostics);
   }
 };
 } // end namespace yaml
diff --git a/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h b/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h
new file mode 100644
index 000000000000..8b01a61256f6
--- /dev/null
+++ b/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h
@@ -0,0 +1,122 @@
+//===--- RecursiveSymbolVisitor.h - Clang refactoring library -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief A wrapper class around \c RecursiveASTVisitor that visits each
+/// occurrence of a named symbol.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H
+#define LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H
+
+#include "clang/AST/AST.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Lex/Lexer.h"
+
+namespace clang {
+namespace tooling {
+
+/// Traverses the AST and visits the occurrence of each named symbol in the
+/// given nodes.
+template <typename T>
+class RecursiveSymbolVisitor
+    : public RecursiveASTVisitor<RecursiveSymbolVisitor<T>> {
+  using BaseType = RecursiveASTVisitor<RecursiveSymbolVisitor<T>>;
+
+public:
+  RecursiveSymbolVisitor(const SourceManager &SM, const LangOptions &LangOpts)
+      : SM(SM), LangOpts(LangOpts) {}
+
+  bool visitSymbolOccurrence(const NamedDecl *ND,
+                             ArrayRef<SourceRange> NameRanges) {
+    return true;
+  }
+
+  // Declaration visitors:
+
+  bool VisitNamedDecl(const NamedDecl *D) {
+    return isa<CXXConversionDecl>(D) ? true : visit(D, D->getLocation());
+  }
+
+  bool VisitCXXConstructorDecl(const CXXConstructorDecl *CD) {
+    for (const auto *Initializer : CD->inits()) {
+      // Ignore implicit initializers.
+      if (!Initializer->isWritten())
+        continue;
+      if (const FieldDecl *FD = Initializer->getMember()) {
+        if (!visit(FD, Initializer->getSourceLocation(),
+                   Lexer::getLocForEndOfToken(Initializer->getSourceLocation(),
+                                              0, SM, LangOpts)))
+          return false;
+      }
+    }
+    return true;
+  }
+
+  // Expression visitors:
+
+  bool VisitDeclRefExpr(const DeclRefExpr *Expr) {
+    return visit(Expr->getFoundDecl(), Expr->getLocation());
+  }
+
+  bool VisitMemberExpr(const MemberExpr *Expr) {
+    return visit(Expr->getFoundDecl().getDecl(), Expr->getMemberLoc());
+  }
+
+  // Other visitors:
+
+  bool VisitTypeLoc(const TypeLoc Loc) {
+    const SourceLocation TypeBeginLoc = Loc.getBeginLoc();
+    const SourceLocation TypeEndLoc =
+        Lexer::getLocForEndOfToken(TypeBeginLoc, 0, SM, LangOpts);
+    if (const auto *TemplateTypeParm =
+            dyn_cast<TemplateTypeParmType>(Loc.getType())) {
+      if (!visit(TemplateTypeParm->getDecl(), TypeBeginLoc, TypeEndLoc))
+        return false;
+    }
+    if (const auto *TemplateSpecType =
+            dyn_cast<TemplateSpecializationType>(Loc.getType())) {
+      if (!visit(TemplateSpecType->getTemplateName().getAsTemplateDecl(),
+                 TypeBeginLoc, TypeEndLoc))
+        return false;
+    }
+    return visit(Loc.getType()->getAsCXXRecordDecl(), TypeBeginLoc, TypeEndLoc);
+  }
+
+  bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) {
+    // The base visitor will visit NNSL prefixes, so we should only look at
+    // the current NNS.
+    if (NNS) {
+      const NamespaceDecl *ND = NNS.getNestedNameSpecifier()->getAsNamespace();
+      if (!visit(ND, NNS.getLocalBeginLoc(), NNS.getLocalEndLoc()))
+        return false;
+    }
+    return BaseType::TraverseNestedNameSpecifierLoc(NNS);
+  }
+
+private:
+  const SourceManager &SM;
+  const LangOptions &LangOpts;
+
+  bool visit(const NamedDecl *ND, SourceLocation BeginLoc,
+             SourceLocation EndLoc) {
+    return static_cast<T *>(this)->visitSymbolOccurrence(
+        ND, SourceRange(BeginLoc, EndLoc));
+  }
+  bool visit(const NamedDecl *ND, SourceLocation Loc) {
+    return visit(ND, Loc,
+                 Loc.getLocWithOffset(ND->getNameAsString().length() - 1));
+  }
+};
+
+} // end namespace tooling
+} // end namespace clang
+
+#endif // LLVM_CLANG_TOOLING_REFACTOR_RECURSIVE_SYMBOL_VISITOR_H
diff --git a/include/clang/Tooling/Refactoring/Rename/USRFinder.h b/include/clang/Tooling/Refactoring/Rename/USRFinder.h
index 28d541af43c0..b74a5d7f70af 100644
--- a/include/clang/Tooling/Refactoring/Rename/USRFinder.h
+++ b/include/clang/Tooling/Refactoring/Rename/USRFinder.h
@@ -18,13 +18,9 @@
 
 #include "clang/AST/AST.h"
 #include "clang/AST/ASTContext.h"
-#include "clang/ASTMatchers/ASTMatchFinder.h"
 #include <string>
 #include <vector>
 
-using namespace llvm;
-using namespace clang::ast_matchers;
-
 namespace clang {
 
 class ASTContext;
 
@@ -48,36 +44,6 @@ const NamedDecl *getNamedDeclFor(const ASTContext &Context,
 // Converts a Decl into a USR.
 std::string getUSRForDecl(const Decl *Decl);
 
-// FIXME: Implement RecursiveASTVisitor::VisitNestedNameSpecifier instead.
-class NestedNameSpecifierLocFinder : public MatchFinder::MatchCallback {
-public:
-  explicit NestedNameSpecifierLocFinder(ASTContext &Context)
-      : Context(Context) {}
-
-  std::vector<NestedNameSpecifierLoc> getNestedNameSpecifierLocations() {
-    addMatchers();
-    Finder.matchAST(Context);
-    return Locations;
-  }
-
-private:
-  void addMatchers() {
-    const auto NestedNameSpecifierLocMatcher =
-        nestedNameSpecifierLoc().bind("nestedNameSpecifierLoc");
-    Finder.addMatcher(NestedNameSpecifierLocMatcher, this);
-  }
-
-  void run(const MatchFinder::MatchResult &Result) override {
-    const auto *NNS = Result.Nodes.getNodeAs<NestedNameSpecifierLoc>(
-        "nestedNameSpecifierLoc");
-    Locations.push_back(*NNS);
-  }
-
-  ASTContext &Context;
-  std::vector<NestedNameSpecifierLoc> Locations;
-  MatchFinder Finder;
-};
-
 } // end namespace tooling
 } // end namespace clang
 
diff --git a/include/clang/module.modulemap b/include/clang/module.modulemap
index f7e338d93399..d850bd552e1f 100644
--- a/include/clang/module.modulemap
+++ b/include/clang/module.modulemap
@@ -138,5 +138,4 @@ module Clang_Tooling {
   // importing the AST matchers library gives a link dependency on the AST
   // matchers (and thus the AST), which clang-format should not have.
   exclude header "Tooling/RefactoringCallbacks.h"
-  exclude header "Tooling/Refactoring/Rename/USRFinder.h"
 }
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index fd9723298fca..c60373c5a90a 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -4525,6 +4525,12 @@ CanQualType ASTContext::getSizeType() const {
   return getFromTargetType(Target->getSizeType());
 }
 
+/// Return the unique signed counterpart of the integer type
+/// corresponding to size_t.
+CanQualType ASTContext::getSignedSizeType() const {
+  return getFromTargetType(Target->getSignedSizeType());
+}
+
 /// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5).
 CanQualType ASTContext::getIntMaxType() const {
   return getFromTargetType(Target->getIntMaxType());
diff --git a/lib/AST/ASTDumper.cpp b/lib/AST/ASTDumper.cpp
index 4758109fbcf7..92ed7da94d8e 100644
--- a/lib/AST/ASTDumper.cpp
+++ b/lib/AST/ASTDumper.cpp
@@ -1189,12 +1189,8 @@ void ASTDumper::VisitFunctionDecl(const FunctionDecl *D) {
 
   auto dumpOverride = [=](const CXXMethodDecl *D) {
     SplitQualType T_split = D->getType().split();
-    OS << D << " " << D->getParent()->getName() << "::";
-    if (isa<CXXDestructorDecl>(D))
-      OS << "~" << D->getParent()->getName();
-    else
-      OS << D->getName();
-    OS << " '" << QualType::getAsString(T_split) << "'";
+    OS << D << " " << D->getParent()->getName() << "::"
+       << D->getNameAsString() << " '" << QualType::getAsString(T_split) << "'";
  };
 
   dumpChild([=] {
diff --git a/lib/AST/DeclObjC.cpp b/lib/AST/DeclObjC.cpp
index a0ec0c2b251e..d8bdb6369e94 100644
--- a/lib/AST/DeclObjC.cpp
+++ b/lib/AST/DeclObjC.cpp
@@ -1889,25 +1889,23 @@ void ObjCProtocolDecl::collectPropertiesToImplement(PropertyMap &PM,
   }
 }
 
-
 void ObjCProtocolDecl::collectInheritedProtocolProperties(
-    const ObjCPropertyDecl *Property,
-    ProtocolPropertyMap &PM) const {
+    const ObjCPropertyDecl *Property, ProtocolPropertySet &PS,
+    PropertyDeclOrder &PO) const {
   if (const ObjCProtocolDecl *PDecl = getDefinition()) {
-    bool MatchFound = false;
+    if (!PS.insert(PDecl).second)
+      return;
     for (auto *Prop : PDecl->properties()) {
       if (Prop == Property)
         continue;
       if (Prop->getIdentifier() == Property->getIdentifier()) {
-        PM[PDecl] = Prop;
-        MatchFound = true;
-        break;
+        PO.push_back(Prop);
+        return;
       }
     }
    // Scan through protocol's protocols which did not have a matching property.
-    if (!MatchFound)
-      for (const auto *PI : PDecl->protocols())
-        PI->collectInheritedProtocolProperties(Property, PM);
+    for (const auto *PI : PDecl->protocols())
+      PI->collectInheritedProtocolProperties(Property, PS, PO);
   }
 }
diff --git a/lib/AST/ItaniumMangle.cpp b/lib/AST/ItaniumMangle.cpp
index dc25e5213bae..4e7c6c4edf37 100644
--- a/lib/AST/ItaniumMangle.cpp
+++ b/lib/AST/ItaniumMangle.cpp
@@ -2529,7 +2529,7 @@ StringRef CXXNameMangler::getCallingConvQualifierName(CallingConv CC) {
   case CC_X86ThisCall:
   case CC_X86VectorCall:
   case CC_X86Pascal:
-  case CC_X86_64Win64:
+  case CC_Win64:
   case CC_X86_64SysV:
   case CC_X86RegCall:
   case CC_AAPCS:
diff --git a/lib/AST/MicrosoftMangle.cpp b/lib/AST/MicrosoftMangle.cpp
index 3a899bdbb6d2..24b16f892e7a 100644
--- a/lib/AST/MicrosoftMangle.cpp
+++ b/lib/AST/MicrosoftMangle.cpp
@@ -2122,7 +2122,7 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) {
   switch (CC) {
     default:
       llvm_unreachable("Unsupported CC for mangling");
-    case CC_X86_64Win64:
+    case CC_Win64:
     case CC_X86_64SysV:
     case CC_C: Out << 'A'; break;
     case CC_X86Pascal: Out << 'C'; break;
diff --git a/lib/AST/ODRHash.cpp b/lib/AST/ODRHash.cpp
index 66b9940b8b08..b19135384cfd 100644
--- a/lib/AST/ODRHash.cpp
+++ b/lib/AST/ODRHash.cpp
@@ -246,7 +246,9 @@ class ODRDeclVisitor : public ConstDeclVisitor<ODRDeclVisitor> {
   }
 
   void VisitValueDecl(const ValueDecl *D) {
-    AddQualType(D->getType());
+    if (!isa<FunctionDecl>(D)) {
+      AddQualType(D->getType());
+    }
     Inherited::VisitValueDecl(D);
   }
 
@@ -305,6 +307,8 @@ class ODRDeclVisitor : public ConstDeclVisitor<ODRDeclVisitor> {
       Hash.AddSubDecl(Param);
     }
 
+    AddQualType(D->getReturnType());
+
     Inherited::VisitFunctionDecl(D);
   }
 
@@ -350,6 +354,8 @@ bool ODRHash::isWhitelistedDecl(const Decl *D, const CXXRecordDecl *Parent) {
     default:
       return false;
     case Decl::AccessSpec:
+    case Decl::CXXConstructor:
+    case Decl::CXXDestructor:
    case Decl::CXXMethod:
    case Decl::Field:
    case Decl::Friend:
diff --git a/lib/AST/OpenMPClause.cpp b/lib/AST/OpenMPClause.cpp
index 77470a9b76d0..2c4d159a1bc8 100644
--- a/lib/AST/OpenMPClause.cpp
+++ b/lib/AST/OpenMPClause.cpp
@@ -46,6 +46,8 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
     return static_cast<const OMPLastprivateClause *>(C);
   case OMPC_reduction:
     return static_cast<const OMPReductionClause *>(C);
+  case OMPC_task_reduction:
+    return static_cast<const OMPTaskReductionClause *>(C);
   case OMPC_linear:
     return static_cast<const OMPLinearClause *>(C);
   case OMPC_if:
@@ -112,6 +114,8 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C)
     return static_cast<const OMPLastprivateClause *>(C);
   case OMPC_reduction:
     return static_cast<const OMPReductionClause *>(C);
+  case OMPC_task_reduction:
+    return static_cast<const OMPTaskReductionClause *>(C);
   case OMPC_linear:
     return static_cast<const OMPLinearClause *>(C);
   case OMPC_schedule:
@@ -505,6 +509,59 @@ OMPReductionClause *OMPReductionClause::CreateEmpty(const ASTContext &C,
   return new (Mem) OMPReductionClause(N);
 }
 
+void OMPTaskReductionClause::setPrivates(ArrayRef<Expr *> Privates) {
+  assert(Privates.size() == varlist_size() &&
+         "Number of private copies is not the same as the preallocated buffer");
+  std::copy(Privates.begin(), Privates.end(), varlist_end());
+}
+
+void OMPTaskReductionClause::setLHSExprs(ArrayRef<Expr *> LHSExprs) {
+  assert(
+      LHSExprs.size() == varlist_size() &&
+      "Number of LHS expressions is not the same as the preallocated buffer");
+  std::copy(LHSExprs.begin(), LHSExprs.end(), getPrivates().end());
+}
+
+void OMPTaskReductionClause::setRHSExprs(ArrayRef<Expr *> RHSExprs) {
+  assert(
+      RHSExprs.size() == varlist_size() &&
+      "Number of RHS expressions is not the same as the preallocated buffer");
+  std::copy(RHSExprs.begin(), RHSExprs.end(), getLHSExprs().end());
+}
+
+void
+OMPTaskReductionClause::setReductionOps(ArrayRef<Expr *> ReductionOps) {
+  assert(ReductionOps.size() == varlist_size() && "Number of task reduction "
+                                                  "expressions is not the same "
+                                                  "as the preallocated buffer");
+  std::copy(ReductionOps.begin(), ReductionOps.end(), getRHSExprs().end());
+}
+
+OMPTaskReductionClause *OMPTaskReductionClause::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation EndLoc, SourceLocation ColonLoc, ArrayRef<Expr *> VL,
+    NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo,
+    ArrayRef<Expr *> Privates, ArrayRef<Expr *> LHSExprs,
+    ArrayRef<Expr *> RHSExprs, ArrayRef<Expr *> ReductionOps, Stmt *PreInit,
+    Expr *PostUpdate) {
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * VL.size()));
+  OMPTaskReductionClause *Clause = new (Mem) OMPTaskReductionClause(
+      StartLoc, LParenLoc, EndLoc, ColonLoc, VL.size(), QualifierLoc, NameInfo);
+  Clause->setVarRefs(VL);
+  Clause->setPrivates(Privates);
+  Clause->setLHSExprs(LHSExprs);
+  Clause->setRHSExprs(RHSExprs);
+  Clause->setReductionOps(ReductionOps);
+  Clause->setPreInitStmt(PreInit);
+  Clause->setPostUpdateExpr(PostUpdate);
+  return Clause;
+}
+
+OMPTaskReductionClause *OMPTaskReductionClause::CreateEmpty(const ASTContext &C,
+                                                            unsigned N) {
+  void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(5 * N));
+  return new (Mem) OMPTaskReductionClause(N);
+}
+
 OMPFlushClause *OMPFlushClause::Create(const ASTContext &C,
                                        SourceLocation StartLoc,
                                        SourceLocation LParenLoc,
diff --git a/lib/AST/StmtOpenMP.cpp b/lib/AST/StmtOpenMP.cpp
index cccb2f075b65..1dcb4fd5196b 100644
--- a/lib/AST/StmtOpenMP.cpp
+++ b/lib/AST/StmtOpenMP.cpp
@@ -522,23 +522,28 @@ OMPTaskwaitDirective *OMPTaskwaitDirective::CreateEmpty(const ASTContext &C,
   return new (Mem) OMPTaskwaitDirective();
 }
 
-OMPTaskgroupDirective *OMPTaskgroupDirective::Create(const ASTContext &C,
-                                                     SourceLocation StartLoc,
-                                                     SourceLocation EndLoc,
-                                                     Stmt *AssociatedStmt) {
-  unsigned Size = llvm::alignTo(sizeof(OMPTaskgroupDirective), alignof(Stmt *));
+OMPTaskgroupDirective *OMPTaskgroupDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt) {
+  unsigned Size = llvm::alignTo(sizeof(OMPTaskgroupDirective) +
+                                    sizeof(OMPClause *) * Clauses.size(),
+                                alignof(Stmt *));
   void *Mem = C.Allocate(Size + sizeof(Stmt *));
   OMPTaskgroupDirective *Dir =
-      new (Mem) OMPTaskgroupDirective(StartLoc, EndLoc);
+      new (Mem) OMPTaskgroupDirective(StartLoc, EndLoc, Clauses.size());
   Dir->setAssociatedStmt(AssociatedStmt);
+  Dir->setClauses(Clauses);
   return Dir;
 }
 
 OMPTaskgroupDirective *OMPTaskgroupDirective::CreateEmpty(const ASTContext &C,
+                                                          unsigned NumClauses,
                                                           EmptyShell) {
-  unsigned Size = llvm::alignTo(sizeof(OMPTaskgroupDirective), alignof(Stmt *));
+  unsigned Size = llvm::alignTo(sizeof(OMPTaskgroupDirective) +
+                                    sizeof(OMPClause *) * NumClauses,
+                                alignof(Stmt *));
   void *Mem = C.Allocate(Size + sizeof(Stmt *));
-  return new (Mem) OMPTaskgroupDirective();
+  return new (Mem) OMPTaskgroupDirective(NumClauses);
 }
 
 OMPCancellationPointDirective *OMPCancellationPointDirective::Create(
diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index 21f5259c3ca8..5ebaa32b49c8 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp
@@ -836,6 +836,29 @@ void OMPClausePrinter::VisitOMPReductionClause(OMPReductionClause *Node) {
   }
 }
 
+void OMPClausePrinter::VisitOMPTaskReductionClause(
+    OMPTaskReductionClause *Node) {
+  if (!Node->varlist_empty()) {
+    OS << "task_reduction(";
+    NestedNameSpecifier *QualifierLoc =
+        Node->getQualifierLoc().getNestedNameSpecifier();
+    OverloadedOperatorKind OOK =
+        Node->getNameInfo().getName().getCXXOverloadedOperator();
+    if (QualifierLoc == nullptr && OOK != OO_None) {
+      // Print reduction identifier in C format
+      OS << getOperatorSpelling(OOK);
+    } else {
+      // Use C++ format
+      if (QualifierLoc != nullptr)
+        QualifierLoc->print(OS, Policy);
+      OS << Node->getNameInfo();
+    }
+    OS << ":";
+    VisitOMPClauseList(Node, ' ');
+    OS << ")";
+  }
+}
+
 void OMPClausePrinter::VisitOMPLinearClause(OMPLinearClause *Node) {
   if (!Node->varlist_empty()) {
     OS << "linear";
@@ -1081,7 +1104,7 @@ void StmtPrinter::VisitOMPTaskwaitDirective(OMPTaskwaitDirective *Node) {
 }
 
 void StmtPrinter::VisitOMPTaskgroupDirective(OMPTaskgroupDirective *Node) {
-  Indent() << "#pragma omp taskgroup";
+  Indent() << "#pragma omp taskgroup ";
   PrintOMPExecutableDirective(Node);
 }
 
diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp
index f1fbe2806b5d..e06386018d68 100644
--- a/lib/AST/StmtProfile.cpp
+++ b/lib/AST/StmtProfile.cpp
@@ -549,6 +549,30 @@ void OMPClauseProfiler::VisitOMPReductionClause(
     Profiler->VisitStmt(E);
   }
 }
+void OMPClauseProfiler::VisitOMPTaskReductionClause(
+    const OMPTaskReductionClause *C) {
+  Profiler->VisitNestedNameSpecifier(
+      C->getQualifierLoc().getNestedNameSpecifier());
+  Profiler->VisitName(C->getNameInfo().getName());
+  VisitOMPClauseList(C);
+  VistOMPClauseWithPostUpdate(C);
+  for (auto *E : C->privates()) {
+    if (E)
+      Profiler->VisitStmt(E);
+  }
+  for (auto *E : C->lhs_exprs()) {
+    if (E)
+      Profiler->VisitStmt(E);
+  }
+  for (auto *E : C->rhs_exprs()) {
+    if (E)
+      Profiler->VisitStmt(E);
+  }
+  for (auto *E : C->reduction_ops()) {
+    if (E)
+      Profiler->VisitStmt(E);
+  }
+}
 void OMPClauseProfiler::VisitOMPLinearClause(const OMPLinearClause *C) {
   VisitOMPClauseList(C);
   VistOMPClauseWithPostUpdate(C);
diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp
index a62ca5f9b4d7..eac02c0102bc 100644
--- a/lib/AST/Type.cpp
+++ b/lib/AST/Type.cpp
@@ -2630,7 +2630,7 @@ StringRef FunctionType::getNameForCallConv(CallingConv CC) {
   case CC_X86ThisCall: return "thiscall";
   case CC_X86Pascal: return "pascal";
   case CC_X86VectorCall: return "vectorcall";
-  case CC_X86_64Win64: return "ms_abi";
+  case CC_Win64: return "ms_abi";
   case CC_X86_64SysV: return "sysv_abi";
   case CC_X86RegCall : return "regcall";
   case CC_AAPCS: return "aapcs";
@@ -3023,6 +3023,7 @@ bool AttributedType::isQualifier() const {
   case AttributedType::attr_sptr:
   case AttributedType::attr_uptr:
   case AttributedType::attr_objc_kindof:
+  case AttributedType::attr_ns_returns_retained:
     return false;
   }
   llvm_unreachable("bad attributed type kind");
@@ -3056,6 +3057,7 @@ bool AttributedType::isCallingConv() const {
   case attr_objc_inert_unsafe_unretained:
   case attr_noreturn:
   case attr_nonnull:
+  case attr_ns_returns_retained:
   case attr_nullable:
   case attr_null_unspecified:
   case attr_objc_kindof:
diff --git a/lib/AST/TypePrinter.cpp b/lib/AST/TypePrinter.cpp
index 0551340c37a1..15c63bf4ed98 100644
--- a/lib/AST/TypePrinter.cpp
+++ b/lib/AST/TypePrinter.cpp
@@ -104,6 +104,7 @@ namespace {
     void printAfter(QualType T, raw_ostream &OS);
     void AppendScope(DeclContext *DC, raw_ostream &OS);
     void printTag(TagDecl *T, raw_ostream &OS);
+    void printFunctionAfter(const FunctionType::ExtInfo &Info, raw_ostream &OS);
 #define ABSTRACT_TYPE(CLASS, PARENT)
 #define TYPE(CLASS, PARENT) \
     void print##CLASS##Before(const CLASS##Type *T, raw_ostream &OS); \
@@ -685,6 +686,36 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T,
   FunctionType::ExtInfo Info = T->getExtInfo();
 
+  printFunctionAfter(Info, OS);
+
+  if (unsigned quals = T->getTypeQuals()) {
+    OS << ' ';
+    AppendTypeQualList(OS, quals, Policy.Restrict);
+  }
+
+  switch (T->getRefQualifier()) {
+  case RQ_None:
+    break;
+
+  case RQ_LValue:
+    OS << " &";
+    break;
+
+  case RQ_RValue:
+    OS << " &&";
+    break;
+  }
+  T->printExceptionSpecification(OS, Policy);
+
+  if (T->hasTrailingReturn()) {
+    OS << " -> ";
+    print(T->getReturnType(), OS, StringRef());
+  } else
+    printAfter(T->getReturnType(), OS);
+}
+
+void TypePrinter::printFunctionAfter(const FunctionType::ExtInfo &Info,
+                                     raw_ostream &OS) {
   if (!InsideCCAttribute) {
     switch (Info.getCC()) {
     case CC_C:
@@ -720,7 +751,7 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T,
     case CC_IntelOclBicc:
       OS << " __attribute__((intel_ocl_bicc))";
       break;
-    case CC_X86_64Win64:
+    case CC_Win64:
       OS << " __attribute__((ms_abi))";
       break;
     case CC_X86_64SysV:
@@ -747,36 +778,13 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T,
 
   if (Info.getNoReturn())
     OS << " __attribute__((noreturn))";
+  if (Info.getProducesResult())
+    OS << " __attribute__((ns_returns_retained))";
   if (Info.getRegParm())
     OS << " __attribute__((regparm ("
        << Info.getRegParm() << ")))";
   if (Info.getNoCallerSavedRegs())
-    OS << "__attribute__((no_caller_saved_registers))";
-
-  if (unsigned quals = T->getTypeQuals()) {
-    OS << ' ';
-    AppendTypeQualList(OS, quals, Policy.Restrict);
-  }
-
-  switch (T->getRefQualifier()) {
-  case RQ_None:
-    break;
-
-  case RQ_LValue:
-    OS << " &";
-    break;
-
-  case RQ_RValue:
-    OS << " &&";
-    break;
-  }
-  T->printExceptionSpecification(OS, Policy);
-
-  if (T->hasTrailingReturn()) {
-    OS << " -> ";
-    print(T->getReturnType(), OS, StringRef());
-  } else
-    printAfter(T->getReturnType(), OS);
+    OS << " __attribute__((no_caller_saved_registers))";
 }
 
 void TypePrinter::printFunctionNoProtoBefore(const FunctionNoProtoType *T,
@@ -795,8 +803,7 @@ void TypePrinter::printFunctionNoProtoAfter(const FunctionNoProtoType *T,
   SaveAndRestore<bool> NonEmptyPH(HasEmptyPlaceHolder, false);
 
   OS << "()";
-  if (T->getNoReturnAttr())
-    OS << " __attribute__((noreturn))";
+  printFunctionAfter(T->getExtInfo(), OS);
   printAfter(T->getReturnType(), OS);
 }
 
@@ -1270,6 +1277,12 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
   if (T->getAttrKind() == AttributedType::attr_objc_inert_unsafe_unretained)
     return;
 
+  // Don't print ns_returns_retained unless it had an effect.
+  if (T->getAttrKind() == AttributedType::attr_ns_returns_retained &&
+      !T->getEquivalentType()->castAs<FunctionType>()
+                             ->getExtInfo().getProducesResult())
+    return;
+
   // Print nullability type specifiers that occur after
   if (T->getAttrKind() == AttributedType::attr_nonnull ||
       T->getAttrKind() == AttributedType::attr_nullable ||
@@ -1361,6 +1374,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
     OS << ')';
     break;
 
+  case AttributedType::attr_ns_returns_retained:
+    OS << "ns_returns_retained";
+    break;
+
   // FIXME: When Sema learns to form this AttributedType, avoid printing the
   // attribute again in printFunctionProtoAfter.
case AttributedType::attr_noreturn: OS << "noreturn"; break; diff --git a/lib/Analysis/PrintfFormatString.cpp b/lib/Analysis/PrintfFormatString.cpp index 60556697113a..50a3aa20bd19 100644 --- a/lib/Analysis/PrintfFormatString.cpp +++ b/lib/Analysis/PrintfFormatString.cpp @@ -466,8 +466,7 @@ ArgType PrintfSpecifier::getArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType(Ctx.getIntMaxType(), "intmax_t"); case LengthModifier::AsSizeT: - // FIXME: How to get the corresponding signed version of size_t? - return ArgType(); + return ArgType(Ctx.getSignedSizeType(), "ssize_t"); case LengthModifier::AsInt3264: return Ctx.getTargetInfo().getTriple().isArch64Bit() ? ArgType(Ctx.LongLongTy, "__int64") @@ -537,7 +536,7 @@ ArgType PrintfSpecifier::getArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType(); // FIXME: ssize_t + return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); case LengthModifier::AsPtrDiff: return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); case LengthModifier::AsLongDouble: diff --git a/lib/Basic/CMakeLists.txt b/lib/Basic/CMakeLists.txt index 2feb31851cfe..e971b55e8585 100644 --- a/lib/Basic/CMakeLists.txt +++ b/lib/Basic/CMakeLists.txt @@ -15,8 +15,23 @@ function(find_first_existing_file out_var) endfunction() macro(find_first_existing_vc_file out_var path) + set(git_path "${path}/.git") + + # Normally '.git' is a directory that contains a 'logs/HEAD' file that + # is updated as modifications are made to the repository. In case the + # repository is a Git submodule, '.git' is a file that contains text that + # indicates where the repository's Git directory exists. + if (EXISTS "${git_path}" AND NOT IS_DIRECTORY "${git_path}") + FILE(READ "${git_path}" file_contents) + if("${file_contents}" MATCHES "^gitdir: ([^\n]+)") + # '.git' is indeed a link to the submodule's Git directory. + # Use the path to that Git directory. + set(git_path "${path}/${CMAKE_MATCH_1}") + endif() + endif() + find_first_existing_file(${out_var} - "${path}/.git/logs/HEAD" # Git + "${git_path}/logs/HEAD" # Git or Git submodule "${path}/.svn/wc.db" # SVN 1.7 "${path}/.svn/entries" # SVN 1.6 ) diff --git a/lib/Basic/DiagnosticIDs.cpp b/lib/Basic/DiagnosticIDs.cpp index ce493c1e5cab..932b3f1934cc 100644 --- a/lib/Basic/DiagnosticIDs.cpp +++ b/lib/Basic/DiagnosticIDs.cpp @@ -510,6 +510,18 @@ StringRef DiagnosticIDs::getWarningOptionForDiag(unsigned DiagID) { return StringRef(); } +std::vector<std::string> DiagnosticIDs::getDiagnosticFlags() { + std::vector<std::string> Res; + for (size_t I = 1; DiagGroupNames[I] != '\0';) { + std::string Diag(DiagGroupNames + I + 1, DiagGroupNames[I]); + I += DiagGroupNames[I] + 1; + Res.push_back("-W" + Diag); + Res.push_back("-Wno-" + Diag); + } + + return Res; +} + /// Return \c true if any diagnostics were found in this group, even if they /// were filtered out due to having the wrong flavor.
static bool getDiagnosticsInGroup(diag::Flavor Flavor, diff --git a/lib/Basic/OpenMPKinds.cpp b/lib/Basic/OpenMPKinds.cpp index 76a0e18c2d73..050c0cc466db 100644 --- a/lib/Basic/OpenMPKinds.cpp +++ b/lib/Basic/OpenMPKinds.cpp @@ -138,6 +138,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_aligned: case OMPC_copyin: case OMPC_copyprivate: @@ -277,6 +278,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_aligned: case OMPC_copyin: case OMPC_copyprivate: @@ -705,6 +707,16 @@ bool clang::isAllowedClauseForDirective(OpenMPDirectiveKind DKind, #define OPENMP_TARGET_TEAMS_DISTRIBUTE_SIMD_CLAUSE(Name) \ case OMPC_##Name: \ return true; +#include "clang/Basic/OpenMPKinds.def" + default: + break; + } + break; + case OMPD_taskgroup: + switch (CKind) { +#define OPENMP_TASKGROUP_CLAUSE(Name) \ + case OMPC_##Name: \ + return true; #include "clang/Basic/OpenMPKinds.def" default: break; @@ -719,7 +731,6 @@ bool clang::isAllowedClauseForDirective(OpenMPDirectiveKind DKind, case OMPD_taskyield: case OMPD_barrier: case OMPD_taskwait: - case OMPD_taskgroup: case OMPD_cancellation_point: case OMPD_declare_reduction: break; @@ -840,7 +851,8 @@ bool clang::isOpenMPDistributeDirective(OpenMPDirectiveKind Kind) { bool clang::isOpenMPPrivate(OpenMPClauseKind Kind) { return Kind == OMPC_private || Kind == OMPC_firstprivate || Kind == OMPC_lastprivate || Kind == OMPC_linear || - Kind == OMPC_reduction; // TODO add next clauses like 'reduction'. + Kind == OMPC_reduction || + Kind == OMPC_task_reduction; // TODO add next clauses like 'reduction'. } bool clang::isOpenMPThreadPrivate(OpenMPClauseKind Kind) { diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp index 50b4fc34ad3a..5d75aa5a7528 100644 --- a/lib/Basic/Targets.cpp +++ b/lib/Basic/Targets.cpp @@ -571,8 +571,6 @@ class OpenBSDTargetInfo : public OSTargetInfo { public: OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : OSTargetInfo(Triple, Opts) { - this->TLSSupported = false; - switch (Triple.getArch()) { case llvm::Triple::x86: case llvm::Triple::x86_64: @@ -3401,6 +3399,7 @@ bool X86TargetInfo::initFeatureMap( setFeatureEnabledImpl(Features, "bmi", true); setFeatureEnabledImpl(Features, "f16c", true); setFeatureEnabledImpl(Features, "xsaveopt", true); + setFeatureEnabledImpl(Features, "movbe", true); LLVM_FALLTHROUGH; case CK_BTVER1: setFeatureEnabledImpl(Features, "ssse3", true); @@ -4900,7 +4899,7 @@ class X86_64TargetInfo : public X86TargetInfo { case CC_Swift: case CC_X86VectorCall: case CC_IntelOclBicc: - case CC_X86_64Win64: + case CC_Win64: case CC_PreserveMost: case CC_PreserveAll: case CC_X86RegCall: @@ -6251,7 +6250,8 @@ class AArch64TargetInfo : public TargetInfo { enum FPUModeEnum { FPUMode, - NeonMode + NeonMode = (1 << 0), + SveMode = (1 << 1) }; unsigned FPU; @@ -6290,6 +6290,9 @@ class AArch64TargetInfo : public TargetInfo { LongDoubleWidth = LongDoubleAlign = SuitableAlign = 128; LongDoubleFormat = &llvm::APFloat::IEEEquad(); + // Make __builtin_ms_va_list available. + HasBuiltinMSVaList = true; + // {} in inline assembly are neon specifiers, not assembly variant // specifiers. NoAsmVariants = true; @@ -6385,12 +6388,15 @@ class AArch64TargetInfo : public TargetInfo { Builder.defineMacro("__ARM_SIZEOF_MINIMAL_ENUM", Opts.ShortEnums ? 
"1" : "4"); - if (FPU == NeonMode) { + if (FPU & NeonMode) { Builder.defineMacro("__ARM_NEON", "1"); // 64-bit NEON supports half, single and double precision operations. Builder.defineMacro("__ARM_NEON_FP", "0xE"); } + if (FPU & SveMode) + Builder.defineMacro("__ARM_FEATURE_SVE", "1"); + if (CRC) Builder.defineMacro("__ARM_FEATURE_CRC32", "1"); @@ -6426,7 +6432,8 @@ class AArch64TargetInfo : public TargetInfo { return Feature == "aarch64" || Feature == "arm64" || Feature == "arm" || - (Feature == "neon" && FPU == NeonMode); + (Feature == "neon" && (FPU & NeonMode)) || + (Feature == "sve" && (FPU & SveMode)); } bool handleTargetFeatures(std::vector &Features, @@ -6440,7 +6447,9 @@ class AArch64TargetInfo : public TargetInfo { for (const auto &Feature : Features) { if (Feature == "+neon") - FPU = NeonMode; + FPU |= NeonMode; + if (Feature == "+sve") + FPU |= SveMode; if (Feature == "+crc") CRC = 1; if (Feature == "+crypto") @@ -6467,6 +6476,7 @@ class AArch64TargetInfo : public TargetInfo { case CC_PreserveMost: case CC_PreserveAll: case CC_OpenCLKernel: + case CC_Win64: return CCCR_OK; default: return CCCR_Warning; @@ -6644,13 +6654,26 @@ class MicrosoftARM64TargetInfo MicrosoftARM64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : WindowsTargetInfo(Triple, Opts), Triple(Triple) { + + // This is an LLP64 platform. + // int:4, long:4, long long:8, long double:8. WCharType = UnsignedShort; + IntWidth = IntAlign = 32; + LongWidth = LongAlign = 32; + DoubleAlign = LongLongAlign = 64; + LongDoubleWidth = LongDoubleAlign = 64; + LongDoubleFormat = &llvm::APFloat::IEEEdouble(); + IntMaxType = SignedLongLong; + Int64Type = SignedLongLong; SizeType = UnsignedLongLong; + PtrDiffType = SignedLongLong; + IntPtrType = SignedLongLong; + TheCXXABI.set(TargetCXXABI::Microsoft); } void setDataLayout() override { - resetDataLayout("e-m:w-i64:64-i128:128-n32:64-S128"); + resetDataLayout("e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"); } void getVisualStudioDefines(const LangOptions &Opts, @@ -7470,7 +7493,7 @@ class SystemZTargetInfo : public TargetInfo { if (HasVector) Builder.defineMacro("__VX__"); if (Opts.ZVector) - Builder.defineMacro("__VEC__", "10301"); + Builder.defineMacro("__VEC__", "10302"); } ArrayRef getTargetBuiltins() const override { return llvm::makeArrayRef(BuiltinInfo, @@ -7497,6 +7520,7 @@ class SystemZTargetInfo : public TargetInfo { .Cases("arch9", "z196", 9) .Cases("arch10", "zEC12", 10) .Cases("arch11", "z13", 11) + .Cases("arch12", "z14", 12) .Default(-1); } bool setCPU(const std::string &Name) override { @@ -7513,6 +7537,8 @@ class SystemZTargetInfo : public TargetInfo { Features["transactional-execution"] = true; if (ISARevision >= 11) Features["vector"] = true; + if (ISARevision >= 12) + Features["vector-enhancements-1"] = true; return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } @@ -7542,6 +7568,7 @@ class SystemZTargetInfo : public TargetInfo { .Case("arch9", ISARevision >= 9) .Case("arch10", ISARevision >= 10) .Case("arch11", ISARevision >= 11) + .Case("arch12", ISARevision >= 12) .Case("htm", HasTransactionalExecution) .Case("vx", HasVector) .Default(false); diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index bc902507c46e..f3527b0f39d1 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -2795,6 +2795,33 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1); return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1})); } + + case 
Builtin::BI__builtin_ms_va_start: + case Builtin::BI__builtin_ms_va_end: + return RValue::get( + EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(), + BuiltinID == Builtin::BI__builtin_ms_va_start)); + + case Builtin::BI__builtin_ms_va_copy: { + // Lower this manually. We can't reliably determine whether or not any + // given va_copy() is for a Win64 va_list from the calling convention + // alone, because it's legal to do this from a System V ABI function. + // With opaque pointer types, we won't have enough information in LLVM + // IR to determine this from the argument types, either. Best to do it + // now, while we have enough information. + Address DestAddr = EmitMSVAListRef(E->getArg(0)); + Address SrcAddr = EmitMSVAListRef(E->getArg(1)); + + llvm::Type *BPP = Int8PtrPtrTy; + + DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"), + DestAddr.getAlignment()); + SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"), + SrcAddr.getAlignment()); + + Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val"); + return RValue::get(Builder.CreateStore(ArgPtr, DestAddr)); + } } // If this is an alias for a lib function (e.g. __builtin_sin), emit @@ -7223,31 +7250,6 @@ static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E) { - if (BuiltinID == X86::BI__builtin_ms_va_start || - BuiltinID == X86::BI__builtin_ms_va_end) - return EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(), - BuiltinID == X86::BI__builtin_ms_va_start); - if (BuiltinID == X86::BI__builtin_ms_va_copy) { - // Lower this manually. We can't reliably determine whether or not any - // given va_copy() is for a Win64 va_list from the calling convention - // alone, because it's legal to do this from a System V ABI function. - // With opaque pointer types, we won't have enough information in LLVM - // IR to determine this from the argument types, either. Best to do it - // now, while we have enough information. - Address DestAddr = EmitMSVAListRef(E->getArg(0)); - Address SrcAddr = EmitMSVAListRef(E->getArg(1)); - - llvm::Type *BPP = Int8PtrPtrTy; - - DestAddr = Address(Builder.CreateBitCast(DestAddr.getPointer(), BPP, "cp"), - DestAddr.getAlignment()); - SrcAddr = Address(Builder.CreateBitCast(SrcAddr.getPointer(), BPP, "ap"), - SrcAddr.getAlignment()); - - Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val"); - return Builder.CreateStore(ArgPtr, DestAddr); - } - SmallVector Ops; // Find out if any arguments are required to be integer constant expressions. 
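The two hunks above relocate the Win64 va_list builtins from the X86-only EmitX86BuiltinExpr into the target-independent EmitBuiltinExpr, so any target that advertises HasBuiltinMSVaList (AArch64 gains it elsewhere in this patch) shares one lowering. For orientation, a minimal C sketch of the source-level pattern this serves; sum_ms is an illustrative name, not taken from the patch:

    /* A variadic function using the Win64 calling convention and its
       va_list flavor from a non-Windows target. __builtin_ms_va_list,
       __builtin_ms_va_start and __builtin_ms_va_end are existing Clang
       builtins; the generic __builtin_va_arg works on any va_list type. */
    int __attribute__((ms_abi)) sum_ms(int n, ...) {
      __builtin_ms_va_list ap;
      __builtin_ms_va_start(ap, n);
      int total = 0;
      for (int i = 0; i < n; ++i)
        total += __builtin_va_arg(ap, int);
      __builtin_ms_va_end(ap);
      return total;
    }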
@@ -8790,12 +8792,14 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {X, Undef}); } + case SystemZ::BI__builtin_s390_vfsqsb: case SystemZ::BI__builtin_s390_vfsqdb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType); return Builder.CreateCall(F, X); } + case SystemZ::BI__builtin_s390_vfmasb: case SystemZ::BI__builtin_s390_vfmadb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); @@ -8804,6 +8808,7 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType); return Builder.CreateCall(F, {X, Y, Z}); } + case SystemZ::BI__builtin_s390_vfmssb: case SystemZ::BI__builtin_s390_vfmsdb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); @@ -8813,12 +8818,35 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType); return Builder.CreateCall(F, {X, Y, Builder.CreateFSub(Zero, Z, "sub")}); } + case SystemZ::BI__builtin_s390_vfnmasb: + case SystemZ::BI__builtin_s390_vfnmadb: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *X = EmitScalarExpr(E->getArg(0)); + Value *Y = EmitScalarExpr(E->getArg(1)); + Value *Z = EmitScalarExpr(E->getArg(2)); + Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType); + Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType); + return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, Z}), "sub"); + } + case SystemZ::BI__builtin_s390_vfnmssb: + case SystemZ::BI__builtin_s390_vfnmsdb: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *X = EmitScalarExpr(E->getArg(0)); + Value *Y = EmitScalarExpr(E->getArg(1)); + Value *Z = EmitScalarExpr(E->getArg(2)); + Value *Zero = llvm::ConstantFP::getZeroValueForNegation(ResultType); + Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType); + Value *NegZ = Builder.CreateFSub(Zero, Z, "sub"); + return Builder.CreateFSub(Zero, Builder.CreateCall(F, {X, Y, NegZ})); + } + case SystemZ::BI__builtin_s390_vflpsb: case SystemZ::BI__builtin_s390_vflpdb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType); return Builder.CreateCall(F, X); } + case SystemZ::BI__builtin_s390_vflnsb: case SystemZ::BI__builtin_s390_vflndb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); @@ -8826,6 +8854,7 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType); return Builder.CreateFSub(Zero, Builder.CreateCall(F, X), "sub"); } + case SystemZ::BI__builtin_s390_vfisb: case SystemZ::BI__builtin_s390_vfidb: { llvm::Type *ResultType = ConvertType(E->getType()); Value *X = EmitScalarExpr(E->getArg(0)); @@ -8835,8 +8864,8 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, bool IsConstM5 = E->getArg(2)->isIntegerConstantExpr(M5, getContext()); assert(IsConstM4 && IsConstM5 && "Constant arg isn't actually constant?"); (void)IsConstM4; (void)IsConstM5; - // Check whether this instance of vfidb can be represented via a LLVM - // standard intrinsic. We only support some combinations of M4 and M5. + // Check whether this instance can be represented via a LLVM standard + // intrinsic. 
We only support some combinations of M4 and M5. Intrinsic::ID ID = Intrinsic::not_intrinsic; switch (M4.getZExtValue()) { default: break; @@ -8861,11 +8890,76 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(ID, ResultType); return Builder.CreateCall(F, X); } - Function *F = CGM.getIntrinsic(Intrinsic::s390_vfidb); + switch (BuiltinID) { + case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break; + case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break; + default: llvm_unreachable("Unknown BuiltinID"); + } + Function *F = CGM.getIntrinsic(ID); Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4); Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5); return Builder.CreateCall(F, {X, M4Value, M5Value}); } + case SystemZ::BI__builtin_s390_vfmaxsb: + case SystemZ::BI__builtin_s390_vfmaxdb: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *X = EmitScalarExpr(E->getArg(0)); + Value *Y = EmitScalarExpr(E->getArg(1)); + // Constant-fold the M4 mask argument. + llvm::APSInt M4; + bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext()); + assert(IsConstM4 && "Constant arg isn't actually constant?"); + (void)IsConstM4; + // Check whether this instance can be represented via an LLVM standard + // intrinsic. We only support some values of M4. + Intrinsic::ID ID = Intrinsic::not_intrinsic; + switch (M4.getZExtValue()) { + default: break; + case 4: ID = Intrinsic::maxnum; break; + } + if (ID != Intrinsic::not_intrinsic) { + Function *F = CGM.getIntrinsic(ID, ResultType); + return Builder.CreateCall(F, {X, Y}); + } + switch (BuiltinID) { + case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break; + case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break; + default: llvm_unreachable("Unknown BuiltinID"); + } + Function *F = CGM.getIntrinsic(ID); + Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4); + return Builder.CreateCall(F, {X, Y, M4Value}); + } + case SystemZ::BI__builtin_s390_vfminsb: + case SystemZ::BI__builtin_s390_vfmindb: { + llvm::Type *ResultType = ConvertType(E->getType()); + Value *X = EmitScalarExpr(E->getArg(0)); + Value *Y = EmitScalarExpr(E->getArg(1)); + // Constant-fold the M4 mask argument. + llvm::APSInt M4; + bool IsConstM4 = E->getArg(2)->isIntegerConstantExpr(M4, getContext()); + assert(IsConstM4 && "Constant arg isn't actually constant?"); + (void)IsConstM4; + // Check whether this instance can be represented via an LLVM standard + // intrinsic. We only support some values of M4. + Intrinsic::ID ID = Intrinsic::not_intrinsic; + switch (M4.getZExtValue()) { + default: break; + case 4: ID = Intrinsic::minnum; break; + } + if (ID != Intrinsic::not_intrinsic) { + Function *F = CGM.getIntrinsic(ID, ResultType); + return Builder.CreateCall(F, {X, Y}); + } + switch (BuiltinID) { + case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break; + case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break; + default: llvm_unreachable("Unknown BuiltinID"); + } + Function *F = CGM.getIntrinsic(ID); + Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4); + return Builder.CreateCall(F, {X, Y, M4Value}); + } // Vector intrinsics that output the post-instruction CC value.
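The new vfmax/vfmin cases follow the same two-step dispatch as vfisb/vfidb above: if the constant mask selects IEEE semantics (M4 == 4), emit the generic llvm.maxnum/llvm.minnum intrinsic so the optimizer can reason about it; otherwise fall back to the target-specific s390 intrinsic carrying the raw mask. A source-level sketch, assuming a z14 target (e.g. -march=arch12) where these builtins are available; v2f64 and fmax_ieee are illustrative names:

    /* Lowered to llvm.maxnum.v2f64 because the mask is 4; any other
       constant mask would go through the s390.vfmaxdb intrinsic. */
    typedef double v2f64 __attribute__((vector_size(16)));
    v2f64 fmax_ieee(v2f64 a, v2f64 b) {
      return __builtin_s390_vfmaxdb(a, b, 4);
    }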
@@ -8932,10 +9026,14 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, INTRINSIC_WITH_CC(s390_vstrczhs); INTRINSIC_WITH_CC(s390_vstrczfs); + INTRINSIC_WITH_CC(s390_vfcesbs); INTRINSIC_WITH_CC(s390_vfcedbs); + INTRINSIC_WITH_CC(s390_vfchsbs); INTRINSIC_WITH_CC(s390_vfchdbs); + INTRINSIC_WITH_CC(s390_vfchesbs); INTRINSIC_WITH_CC(s390_vfchedbs); + INTRINSIC_WITH_CC(s390_vftcisb); INTRINSIC_WITH_CC(s390_vftcidb); #undef INTRINSIC_WITH_CC diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp index cee656a62fe7..316bf44cb1c3 100644 --- a/lib/CodeGen/CGCall.cpp +++ b/lib/CodeGen/CGCall.cpp @@ -50,7 +50,7 @@ unsigned CodeGenTypes::ClangCallConvToLLVMCallConv(CallingConv CC) { case CC_X86FastCall: return llvm::CallingConv::X86_FastCall; case CC_X86RegCall: return llvm::CallingConv::X86_RegCall; case CC_X86ThisCall: return llvm::CallingConv::X86_ThisCall; - case CC_X86_64Win64: return llvm::CallingConv::X86_64_Win64; + case CC_Win64: return llvm::CallingConv::Win64; case CC_X86_64SysV: return llvm::CallingConv::X86_64_SysV; case CC_AAPCS: return llvm::CallingConv::ARM_AAPCS; case CC_AAPCS_VFP: return llvm::CallingConv::ARM_AAPCS_VFP; @@ -218,7 +218,7 @@ static CallingConv getCallingConventionForDecl(const Decl *D, bool IsWindows) { return CC_IntelOclBicc; if (D->hasAttr()) - return IsWindows ? CC_C : CC_X86_64Win64; + return IsWindows ? CC_C : CC_Win64; if (D->hasAttr()) return IsWindows ? CC_X86_64SysV : CC_C; @@ -1877,8 +1877,8 @@ void CodeGenModule::ConstructAttributeList( // the function. const auto *TD = FD->getAttr(); TargetAttr::ParsedTargetAttr ParsedAttr = TD->parse(); - if (ParsedAttr.second != "") - TargetCPU = ParsedAttr.second; + if (ParsedAttr.Architecture != "") + TargetCPU = ParsedAttr.Architecture; if (TargetCPU != "") FuncAttrs.addAttribute("target-cpu", TargetCPU); if (!Features.empty()) { diff --git a/lib/CodeGen/CGDebugInfo.cpp b/lib/CodeGen/CGDebugInfo.cpp index b00d296fe34a..c9c450c32e3b 100644 --- a/lib/CodeGen/CGDebugInfo.cpp +++ b/lib/CodeGen/CGDebugInfo.cpp @@ -956,7 +956,7 @@ static unsigned getDwarfCC(CallingConv CC) { return llvm::dwarf::DW_CC_BORLAND_pascal; // FIXME: Create new DW_CC_ codes for these calling conventions. 
- case CC_X86_64Win64: + case CC_Win64: case CC_X86_64SysV: case CC_AAPCS: case CC_AAPCS_VFP: @@ -3970,10 +3970,10 @@ void CGDebugInfo::EmitUsingDirective(const UsingDirectiveDecl &UD) { const NamespaceDecl *NSDecl = UD.getNominatedNamespace(); if (!NSDecl->isAnonymousNamespace() || CGM.getCodeGenOpts().DebugExplicitImport) { + auto Loc = UD.getLocation(); DBuilder.createImportedModule( getCurrentContextDescriptor(cast(UD.getDeclContext())), - getOrCreateNamespace(NSDecl), - getLineNumber(UD.getLocation())); + getOrCreateNamespace(NSDecl), getOrCreateFile(Loc), getLineNumber(Loc)); } } @@ -3996,10 +3996,12 @@ void CGDebugInfo::EmitUsingDecl(const UsingDecl &UD) { if (AT->getDeducedType().isNull()) return; if (llvm::DINode *Target = - getDeclarationOrDefinition(USD.getUnderlyingDecl())) + getDeclarationOrDefinition(USD.getUnderlyingDecl())) { + auto Loc = USD.getLocation(); DBuilder.createImportedDeclaration( getCurrentContextDescriptor(cast(USD.getDeclContext())), Target, - getLineNumber(USD.getLocation())); + getOrCreateFile(Loc), getLineNumber(Loc)); + } } void CGDebugInfo::EmitImportDecl(const ImportDecl &ID) { @@ -4007,10 +4009,11 @@ void CGDebugInfo::EmitImportDecl(const ImportDecl &ID) { return; if (Module *M = ID.getImportedModule()) { auto Info = ExternalASTSource::ASTSourceDescriptor(*M); + auto Loc = ID.getLocation(); DBuilder.createImportedDeclaration( getCurrentContextDescriptor(cast(ID.getDeclContext())), - getOrCreateModuleRef(Info, DebugTypeExtRefs), - getLineNumber(ID.getLocation())); + getOrCreateModuleRef(Info, DebugTypeExtRefs), getOrCreateFile(Loc), + getLineNumber(Loc)); } } @@ -4022,18 +4025,19 @@ CGDebugInfo::EmitNamespaceAlias(const NamespaceAliasDecl &NA) { if (VH) return cast(VH); llvm::DIImportedEntity *R; + auto Loc = NA.getLocation(); if (const auto *Underlying = dyn_cast(NA.getAliasedNamespace())) // This could cache & dedup here rather than relying on metadata deduping. R = DBuilder.createImportedDeclaration( getCurrentContextDescriptor(cast(NA.getDeclContext())), - EmitNamespaceAlias(*Underlying), getLineNumber(NA.getLocation()), - NA.getName()); + EmitNamespaceAlias(*Underlying), getOrCreateFile(Loc), + getLineNumber(Loc), NA.getName()); else R = DBuilder.createImportedDeclaration( getCurrentContextDescriptor(cast(NA.getDeclContext())), getOrCreateNamespace(cast(NA.getAliasedNamespace())), - getLineNumber(NA.getLocation()), NA.getName()); + getOrCreateFile(Loc), getLineNumber(Loc), NA.getName()); VH.reset(R); return R; } diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp index 9f40ee5a00a3..9572bd3543bd 100644 --- a/lib/CodeGen/CGExpr.cpp +++ b/lib/CodeGen/CGExpr.cpp @@ -73,9 +73,12 @@ Address CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align, // cast alloca to the default address space when necessary. 
if (CastToDefaultAddrSpace && getASTAllocaAddressSpace() != LangAS::Default) { auto DestAddrSpace = getContext().getTargetAddressSpace(LangAS::Default); + auto CurIP = Builder.saveIP(); + Builder.SetInsertPoint(AllocaInsertPt); V = getTargetHooks().performAddrSpaceCast( *this, V, getASTAllocaAddressSpace(), LangAS::Default, Ty->getPointerTo(DestAddrSpace), /*non-null*/ true); + Builder.restoreIP(CurIP); } return Address(V, Align); @@ -3052,7 +3055,9 @@ static llvm::Value *emitArraySubscriptGEP(CodeGenFunction &CGF, SourceLocation loc, const llvm::Twine &name = "arrayidx") { if (inbounds) { - return CGF.EmitCheckedInBoundsGEP(ptr, indices, signedIndices, loc, name); + return CGF.EmitCheckedInBoundsGEP(ptr, indices, signedIndices, + CodeGenFunction::NotSubtraction, loc, + name); } else { return CGF.Builder.CreateGEP(ptr, indices, name); } diff --git a/lib/CodeGen/CGExprScalar.cpp b/lib/CodeGen/CGExprScalar.cpp index 43c86495f3d3..1170b014ec7f 100644 --- a/lib/CodeGen/CGExprScalar.cpp +++ b/lib/CodeGen/CGExprScalar.cpp @@ -1851,7 +1851,7 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::Value *input; int amount = (isInc ? 1 : -1); - bool signedIndex = !isInc; + bool isSubtraction = !isInc; if (const AtomicType *atomicTy = type->getAs()) { type = atomicTy->getValueType(); @@ -1941,8 +1941,9 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, if (CGF.getLangOpts().isSignedOverflowDefined()) value = Builder.CreateGEP(value, numElts, "vla.inc"); else - value = CGF.EmitCheckedInBoundsGEP(value, numElts, signedIndex, - E->getExprLoc(), "vla.inc"); + value = CGF.EmitCheckedInBoundsGEP( + value, numElts, /*SignedIndices=*/false, isSubtraction, + E->getExprLoc(), "vla.inc"); // Arithmetic on function pointers (!) is just +-1. } else if (type->isFunctionType()) { @@ -1952,8 +1953,9 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, if (CGF.getLangOpts().isSignedOverflowDefined()) value = Builder.CreateGEP(value, amt, "incdec.funcptr"); else - value = CGF.EmitCheckedInBoundsGEP(value, amt, signedIndex, - E->getExprLoc(), "incdec.funcptr"); + value = CGF.EmitCheckedInBoundsGEP(value, amt, /*SignedIndices=*/false, + isSubtraction, E->getExprLoc(), + "incdec.funcptr"); value = Builder.CreateBitCast(value, input->getType()); // For everything else, we can just do a simple increment. @@ -1962,8 +1964,9 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, if (CGF.getLangOpts().isSignedOverflowDefined()) value = Builder.CreateGEP(value, amt, "incdec.ptr"); else - value = CGF.EmitCheckedInBoundsGEP(value, amt, signedIndex, - E->getExprLoc(), "incdec.ptr"); + value = CGF.EmitCheckedInBoundsGEP(value, amt, /*SignedIndices=*/false, + isSubtraction, E->getExprLoc(), + "incdec.ptr"); } // Vector increment/decrement. 
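These CGExprScalar.cpp changes thread an explicit IsSubtraction flag into EmitCheckedInBoundsGEP: with unsigned indices the old check always demanded result >= base, which misfires for pointer subtraction, where the offset is unsigned but the pointer must move downward. A small C illustration of the case the distinction exists for; rewind_by is an illustrative name, and the runtime diagnostic assumes -fsanitize=pointer-overflow:

    #include <stddef.h>

    /* The index n is unsigned, but the GEP direction is downward, so the
       sanitizer must accept result <= base here instead of result >= base.
       Wrapping past the start of the object is what gets diagnosed. */
    char *rewind_by(char *p, size_t n) {
      return p - n;
    }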
@@ -2044,7 +2047,8 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, if (CGF.getLangOpts().isSignedOverflowDefined()) value = Builder.CreateGEP(value, sizeValue, "incdec.objptr"); else - value = CGF.EmitCheckedInBoundsGEP(value, sizeValue, signedIndex, + value = CGF.EmitCheckedInBoundsGEP(value, sizeValue, + /*SignedIndices=*/false, isSubtraction, E->getExprLoc(), "incdec.objptr"); value = Builder.CreateBitCast(value, input->getType()); } @@ -2663,7 +2667,6 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF, } bool isSigned = indexOperand->getType()->isSignedIntegerOrEnumerationType(); - bool mayHaveNegativeGEPIndex = isSigned || isSubtraction; unsigned width = cast(index->getType())->getBitWidth(); auto &DL = CGF.CGM.getDataLayout(); @@ -2715,7 +2718,7 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF, } else { index = CGF.Builder.CreateNSWMul(index, numElements, "vla.index"); pointer = - CGF.EmitCheckedInBoundsGEP(pointer, index, mayHaveNegativeGEPIndex, + CGF.EmitCheckedInBoundsGEP(pointer, index, isSigned, isSubtraction, op.E->getExprLoc(), "add.ptr"); } return pointer; @@ -2733,7 +2736,7 @@ static Value *emitPointerArithmetic(CodeGenFunction &CGF, if (CGF.getLangOpts().isSignedOverflowDefined()) return CGF.Builder.CreateGEP(pointer, index, "add.ptr"); - return CGF.EmitCheckedInBoundsGEP(pointer, index, mayHaveNegativeGEPIndex, + return CGF.EmitCheckedInBoundsGEP(pointer, index, isSigned, isSubtraction, op.E->getExprLoc(), "add.ptr"); } @@ -2807,7 +2810,7 @@ static Value* tryEmitFMulAdd(const BinOpInfo &op, Value *ScalarExprEmitter::EmitAdd(const BinOpInfo &op) { if (op.LHS->getType()->isPointerTy() || op.RHS->getType()->isPointerTy()) - return emitPointerArithmetic(CGF, op, /*subtraction*/ false); + return emitPointerArithmetic(CGF, op, CodeGenFunction::NotSubtraction); if (op.Ty->isSignedIntegerOrEnumerationType()) { switch (CGF.getLangOpts().getSignedOverflowBehavior()) { @@ -2878,7 +2881,7 @@ Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) { // If the RHS is not a pointer, then we have normal pointer // arithmetic. if (!op.RHS->getType()->isPointerTy()) - return emitPointerArithmetic(CGF, op, /*subtraction*/ true); + return emitPointerArithmetic(CGF, op, CodeGenFunction::IsSubtraction); // Otherwise, this is a pointer subtraction. @@ -3853,6 +3856,7 @@ LValue CodeGenFunction::EmitCompoundAssignmentLValue( Value *CodeGenFunction::EmitCheckedInBoundsGEP(Value *Ptr, ArrayRef IdxList, bool SignedIndices, + bool IsSubtraction, SourceLocation Loc, const Twine &Name) { Value *GEPVal = Builder.CreateInBoundsGEP(Ptr, IdxList, Name); @@ -3958,15 +3962,19 @@ Value *CodeGenFunction::EmitCheckedInBoundsGEP(Value *Ptr, // pointer matches the sign of the total offset. 
llvm::Value *ValidGEP; auto *NoOffsetOverflow = Builder.CreateNot(OffsetOverflows); - auto *PosOrZeroValid = Builder.CreateICmpUGE(ComputedGEP, IntPtr); if (SignedIndices) { + auto *PosOrZeroValid = Builder.CreateICmpUGE(ComputedGEP, IntPtr); auto *PosOrZeroOffset = Builder.CreateICmpSGE(TotalOffset, Zero); llvm::Value *NegValid = Builder.CreateICmpULT(ComputedGEP, IntPtr); ValidGEP = Builder.CreateAnd( Builder.CreateSelect(PosOrZeroOffset, PosOrZeroValid, NegValid), NoOffsetOverflow); - } else { + } else if (!SignedIndices && !IsSubtraction) { + auto *PosOrZeroValid = Builder.CreateICmpUGE(ComputedGEP, IntPtr); ValidGEP = Builder.CreateAnd(PosOrZeroValid, NoOffsetOverflow); + } else { + auto *NegOrZeroValid = Builder.CreateICmpULE(ComputedGEP, IntPtr); + ValidGEP = Builder.CreateAnd(NegOrZeroValid, NoOffsetOverflow); } llvm::Constant *StaticArgs[] = {EmitCheckSourceLocation(Loc)}; diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp index a2ea0dec3e9d..d488bd4b30bf 100644 --- a/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/lib/CodeGen/CGOpenMPRuntime.cpp @@ -643,6 +643,12 @@ enum OpenMPRTLFunction { // Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 // *vec); OMPRTL__kmpc_doacross_wait, + // Call to void *__kmpc_task_reduction_init(int gtid, int num_data, void + // *data); + OMPRTL__kmpc_task_reduction_init, + // Call to void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void + // *d); + OMPRTL__kmpc_task_reduction_get_th_data, // // Offloading related calls @@ -697,6 +703,414 @@ void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const { } } +/// Check if the combiner is a call to UDR combiner and if it is so return the +/// UDR decl used for reduction. +static const OMPDeclareReductionDecl * +getReductionInit(const Expr *ReductionOp) { + if (auto *CE = dyn_cast(ReductionOp)) + if (auto *OVE = dyn_cast(CE->getCallee())) + if (auto *DRE = + dyn_cast(OVE->getSourceExpr()->IgnoreImpCasts())) + if (auto *DRD = dyn_cast(DRE->getDecl())) + return DRD; + return nullptr; +} + +static void emitInitWithReductionInitializer(CodeGenFunction &CGF, + const OMPDeclareReductionDecl *DRD, + const Expr *InitOp, + Address Private, Address Original, + QualType Ty) { + if (DRD->getInitializer()) { + std::pair Reduction = + CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD); + auto *CE = cast(InitOp); + auto *OVE = cast(CE->getCallee()); + const Expr *LHS = CE->getArg(/*Arg=*/0)->IgnoreParenImpCasts(); + const Expr *RHS = CE->getArg(/*Arg=*/1)->IgnoreParenImpCasts(); + auto *LHSDRE = cast(cast(LHS)->getSubExpr()); + auto *RHSDRE = cast(cast(RHS)->getSubExpr()); + CodeGenFunction::OMPPrivateScope PrivateScope(CGF); + PrivateScope.addPrivate(cast(LHSDRE->getDecl()), + [=]() -> Address { return Private; }); + PrivateScope.addPrivate(cast(RHSDRE->getDecl()), + [=]() -> Address { return Original; }); + (void)PrivateScope.Privatize(); + RValue Func = RValue::get(Reduction.second); + CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func); + CGF.EmitIgnoredExpr(InitOp); + } else { + llvm::Constant *Init = CGF.CGM.EmitNullConstant(Ty); + auto *GV = new llvm::GlobalVariable( + CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, Init, ".init"); + LValue LV = CGF.MakeNaturalAlignAddrLValue(GV, Ty); + RValue InitRVal; + switch (CGF.getEvaluationKind(Ty)) { + case TEK_Scalar: + InitRVal = CGF.EmitLoadOfLValue(LV, SourceLocation()); + break; + case TEK_Complex: + InitRVal = + RValue::getComplex(CGF.EmitLoadOfComplex(LV, 
SourceLocation())); + break; + case TEK_Aggregate: + InitRVal = RValue::getAggregate(LV.getAddress()); + break; + } + OpaqueValueExpr OVE(SourceLocation(), Ty, VK_RValue); + CodeGenFunction::OpaqueValueMapping OpaqueMap(CGF, &OVE, InitRVal); + CGF.EmitAnyExprToMem(&OVE, Private, Ty.getQualifiers(), + /*IsInitializer=*/false); + } +} + +/// \brief Emit initialization of arrays of complex types. +/// \param DestAddr Address of the array. +/// \param Type Type of array. +/// \param Init Initial expression of array. +/// \param SrcAddr Address of the original array. +static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr, + QualType Type, const Expr *Init, + const OMPDeclareReductionDecl *DRD, + Address SrcAddr = Address::invalid()) { + // Perform element-by-element initialization. + QualType ElementTy; + + // Drill down to the base element type on both arrays. + auto ArrayTy = Type->getAsArrayTypeUnsafe(); + auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, DestAddr); + DestAddr = + CGF.Builder.CreateElementBitCast(DestAddr, DestAddr.getElementType()); + if (DRD) + SrcAddr = + CGF.Builder.CreateElementBitCast(SrcAddr, DestAddr.getElementType()); + + llvm::Value *SrcBegin = nullptr; + if (DRD) + SrcBegin = SrcAddr.getPointer(); + auto DestBegin = DestAddr.getPointer(); + // Cast from pointer to array type to pointer to single element. + auto DestEnd = CGF.Builder.CreateGEP(DestBegin, NumElements); + // The basic structure here is a while-do loop. + auto BodyBB = CGF.createBasicBlock("omp.arrayinit.body"); + auto DoneBB = CGF.createBasicBlock("omp.arrayinit.done"); + auto IsEmpty = + CGF.Builder.CreateICmpEQ(DestBegin, DestEnd, "omp.arrayinit.isempty"); + CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); + + // Enter the loop body, making that address the current address. + auto EntryBB = CGF.Builder.GetInsertBlock(); + CGF.EmitBlock(BodyBB); + + CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy); + + llvm::PHINode *SrcElementPHI = nullptr; + Address SrcElementCurrent = Address::invalid(); + if (DRD) { + SrcElementPHI = CGF.Builder.CreatePHI(SrcBegin->getType(), 2, + "omp.arraycpy.srcElementPast"); + SrcElementPHI->addIncoming(SrcBegin, EntryBB); + SrcElementCurrent = + Address(SrcElementPHI, + SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + } + llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI( + DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); + DestElementPHI->addIncoming(DestBegin, EntryBB); + Address DestElementCurrent = + Address(DestElementPHI, + DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + + // Emit copy. + { + CodeGenFunction::RunCleanupsScope InitScope(CGF); + if (DRD && (DRD->getInitializer() || !Init)) { + emitInitWithReductionInitializer(CGF, DRD, Init, DestElementCurrent, + SrcElementCurrent, ElementTy); + } else + CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(), + /*IsInitializer=*/false); + } + + if (DRD) { + // Shift the address forward by one element. + auto SrcElementNext = CGF.Builder.CreateConstGEP1_32( + SrcElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); + SrcElementPHI->addIncoming(SrcElementNext, CGF.Builder.GetInsertBlock()); + } + + // Shift the address forward by one element. + auto DestElementNext = CGF.Builder.CreateConstGEP1_32( + DestElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); + // Check whether we've reached the end. 
+ auto Done = + CGF.Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); + CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB); + DestElementPHI->addIncoming(DestElementNext, CGF.Builder.GetInsertBlock()); + + // Done. + CGF.EmitBlock(DoneBB, /*IsFinished=*/true); +} + +LValue ReductionCodeGen::emitSharedLValue(CodeGenFunction &CGF, const Expr *E) { + if (const auto *OASE = dyn_cast(E)) + return CGF.EmitOMPArraySectionExpr(OASE); + if (const auto *ASE = dyn_cast(E)) + return CGF.EmitLValue(ASE); + auto *OrigVD = cast(cast(E)->getDecl()); + DeclRefExpr DRE(const_cast(OrigVD), + CGF.CapturedStmtInfo && + CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr, + E->getType(), VK_LValue, E->getExprLoc()); + // Store the address of the original variable associated with the LHS + // implicit variable. + return CGF.EmitLValue(&DRE); +} + +LValue ReductionCodeGen::emitSharedLValueUB(CodeGenFunction &CGF, + const Expr *E) { + if (const auto *OASE = dyn_cast(E)) + return CGF.EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false); + return LValue(); +} + +void ReductionCodeGen::emitAggregateInitialization( + CodeGenFunction &CGF, unsigned N, Address PrivateAddr, LValue SharedLVal, + const OMPDeclareReductionDecl *DRD) { + // Emit VarDecl with copy init for arrays. + // Get the address of the original variable captured in current + // captured region. + auto *PrivateVD = + cast(cast(ClausesData[N].Private)->getDecl()); + EmitOMPAggregateInit(CGF, PrivateAddr, PrivateVD->getType(), + DRD ? ClausesData[N].ReductionOp : PrivateVD->getInit(), + DRD, SharedLVal.getAddress()); +} + +ReductionCodeGen::ReductionCodeGen(ArrayRef Shareds, + ArrayRef Privates, + ArrayRef ReductionOps) { + ClausesData.reserve(Shareds.size()); + SharedAddresses.reserve(Shareds.size()); + Sizes.reserve(Shareds.size()); + BaseDecls.reserve(Shareds.size()); + auto IPriv = Privates.begin(); + auto IRed = ReductionOps.begin(); + for (const auto *Ref : Shareds) { + ClausesData.emplace_back(Ref, *IPriv, *IRed); + std::advance(IPriv, 1); + std::advance(IRed, 1); + } +} + +void ReductionCodeGen::emitSharedLValue(CodeGenFunction &CGF, unsigned N) { + assert(SharedAddresses.size() == N && + "Number of generated lvalues must be exactly N."); + SharedAddresses.emplace_back(emitSharedLValue(CGF, ClausesData[N].Ref), + emitSharedLValueUB(CGF, ClausesData[N].Ref)); +} + +void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N) { + auto *PrivateVD = + cast(cast(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + bool AsArraySection = isa(ClausesData[N].Ref); + if (!AsArraySection && !PrivateType->isVariablyModifiedType()) { + Sizes.emplace_back( + CGF.getTypeSize( + SharedAddresses[N].first.getType().getNonReferenceType()), + nullptr); + return; + } + llvm::Value *Size; + llvm::Value *SizeInChars; + llvm::Type *ElemType = + cast(SharedAddresses[N].first.getPointer()->getType()) + ->getElementType(); + auto *ElemSizeOf = llvm::ConstantExpr::getSizeOf(ElemType); + if (AsArraySection) { + Size = CGF.Builder.CreatePtrDiff(SharedAddresses[N].second.getPointer(), + SharedAddresses[N].first.getPointer()); + Size = CGF.Builder.CreateNUWAdd( + Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1)); + SizeInChars = CGF.Builder.CreateNUWMul(Size, ElemSizeOf); + } else { + SizeInChars = CGF.getTypeSize( + SharedAddresses[N].first.getType().getNonReferenceType()); + Size = CGF.Builder.CreateExactUDiv(SizeInChars, ElemSizeOf); + } + Sizes.emplace_back(SizeInChars, Size); + 
CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, + cast<OpaqueValueExpr>( + CGF.getContext().getAsVariableArrayType(PrivateType)->getSizeExpr()), + RValue::get(Size)); + CGF.EmitVariablyModifiedType(PrivateType); +} + +void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N, + llvm::Value *Size) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref); + if (!AsArraySection && !PrivateType->isVariablyModifiedType()) { + assert(!Size && !Sizes[N].second && + "Size should be nullptr for non-variably modified reduction " + "items."); + return; + } + CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, + cast<OpaqueValueExpr>( + CGF.getContext().getAsVariableArrayType(PrivateType)->getSizeExpr()), + RValue::get(Size)); + CGF.EmitVariablyModifiedType(PrivateType); +} + +void ReductionCodeGen::emitInitialization( + CodeGenFunction &CGF, unsigned N, Address PrivateAddr, LValue SharedLVal, + llvm::function_ref<bool(CodeGenFunction &)> DefaultInit) { + assert(SharedAddresses.size() > N && "No variable was generated"); + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + auto *DRD = getReductionInit(ClausesData[N].ReductionOp); + QualType PrivateType = PrivateVD->getType(); + PrivateAddr = CGF.Builder.CreateElementBitCast( + PrivateAddr, CGF.ConvertTypeForMem(PrivateType)); + QualType SharedType = SharedAddresses[N].first.getType(); + SharedLVal = CGF.MakeAddrLValue( + CGF.Builder.CreateElementBitCast(SharedLVal.getAddress(), + CGF.ConvertTypeForMem(SharedType)), + SharedType, SharedAddresses[N].first.getBaseInfo()); + if (isa<OMPArraySectionExpr>(ClausesData[N].Ref) || + CGF.getContext().getAsArrayType(PrivateVD->getType())) { + emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal, DRD); + } else if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { + emitInitWithReductionInitializer(CGF, DRD, ClausesData[N].ReductionOp, + PrivateAddr, SharedLVal.getAddress(), + SharedLVal.getType()); + } else if (!DefaultInit(CGF) && PrivateVD->hasInit() && + !CGF.isTrivialInitializer(PrivateVD->getInit())) { + CGF.EmitAnyExprToMem(PrivateVD->getInit(), PrivateAddr, + PrivateVD->getType().getQualifiers(), + /*IsInitializer=*/false); + } +} + +bool ReductionCodeGen::needCleanups(unsigned N) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + QualType::DestructionKind DTorKind = PrivateType.isDestructedType(); + return DTorKind != QualType::DK_none; +} + +void ReductionCodeGen::emitCleanups(CodeGenFunction &CGF, unsigned N, + Address PrivateAddr) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + QualType::DestructionKind DTorKind = PrivateType.isDestructedType(); + if (needCleanups(N)) { + PrivateAddr = CGF.Builder.CreateElementBitCast( + PrivateAddr, CGF.ConvertTypeForMem(PrivateType)); + CGF.pushDestroy(DTorKind, PrivateAddr, PrivateType); + } +} + +static LValue loadToBegin(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, + LValue BaseLV) { + BaseTy = BaseTy.getNonReferenceType(); + while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && + !CGF.getContext().hasSameType(BaseTy, ElTy)) { + if (auto *PtrTy = BaseTy->getAs<PointerType>()) + BaseLV = CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(), PtrTy); + else { + BaseLV = CGF.EmitLoadOfReferenceLValue(BaseLV.getAddress(), + BaseTy->castAs<ReferenceType>()); + } + BaseTy = BaseTy->getPointeeType(); + } + return CGF.MakeAddrLValue( +
CGF.Builder.CreateElementBitCast(BaseLV.getAddress(), + CGF.ConvertTypeForMem(ElTy)), + BaseLV.getType(), BaseLV.getBaseInfo()); +} + +static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, + llvm::Type *BaseLVType, CharUnits BaseLVAlignment, + llvm::Value *Addr) { + Address Tmp = Address::invalid(); + Address TopTmp = Address::invalid(); + Address MostTopTmp = Address::invalid(); + BaseTy = BaseTy.getNonReferenceType(); + while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && + !CGF.getContext().hasSameType(BaseTy, ElTy)) { + Tmp = CGF.CreateMemTemp(BaseTy); + if (TopTmp.isValid()) + CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp); + else + MostTopTmp = Tmp; + TopTmp = Tmp; + BaseTy = BaseTy->getPointeeType(); + } + llvm::Type *Ty = BaseLVType; + if (Tmp.isValid()) + Ty = Tmp.getElementType(); + Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Ty); + if (Tmp.isValid()) { + CGF.Builder.CreateStore(Addr, Tmp); + return MostTopTmp; + } + return Address(Addr, BaseLVAlignment); +} + +Address ReductionCodeGen::adjustPrivateAddress(CodeGenFunction &CGF, unsigned N, + Address PrivateAddr) { + const DeclRefExpr *DE; + const VarDecl *OrigVD = nullptr; + if (auto *OASE = dyn_cast(ClausesData[N].Ref)) { + auto *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempOASE = dyn_cast(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + DE = cast(Base); + OrigVD = cast(DE->getDecl()); + } else if (auto *ASE = dyn_cast(ClausesData[N].Ref)) { + auto *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempASE = dyn_cast(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + DE = cast(Base); + OrigVD = cast(DE->getDecl()); + } + if (OrigVD) { + BaseDecls.emplace_back(OrigVD); + auto OriginalBaseLValue = CGF.EmitLValue(DE); + LValue BaseLValue = + loadToBegin(CGF, OrigVD->getType(), SharedAddresses[N].first.getType(), + OriginalBaseLValue); + llvm::Value *Adjustment = CGF.Builder.CreatePtrDiff( + BaseLValue.getPointer(), SharedAddresses[N].first.getPointer()); + llvm::Value *Ptr = + CGF.Builder.CreateGEP(PrivateAddr.getPointer(), Adjustment); + return castToBase(CGF, OrigVD->getType(), + SharedAddresses[N].first.getType(), + OriginalBaseLValue.getPointer()->getType(), + OriginalBaseLValue.getAlignment(), Ptr); + } + BaseDecls.emplace_back( + cast(cast(ClausesData[N].Ref)->getDecl())); + return PrivateAddr; +} + +bool ReductionCodeGen::usesReductionInitializer(unsigned N) const { + auto *DRD = getReductionInit(ClausesData[N].ReductionOp); + return DRD && DRD->getInitializer(); +} + LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) { return CGF.EmitLoadOfPointerLValue( CGF.GetAddrOfLocalVar(getThreadIDVariable()), @@ -1554,6 +1968,26 @@ CGOpenMPRuntime::createRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait"); break; } + case OMPRTL__kmpc_task_reduction_init: { + // Build void *__kmpc_task_reduction_init(int gtid, int num_data, void + // *data); + llvm::Type *TypeParams[] = {CGM.IntTy, CGM.IntTy, CGM.VoidPtrTy}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = + CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_task_reduction_init"); + break; + } + case OMPRTL__kmpc_task_reduction_get_th_data: { + // Build void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void + // *d); + llvm::Type 
*TypeParams[] = {CGM.IntTy, CGM.VoidPtrTy, CGM.VoidPtrTy}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_task_reduction_get_th_data"); + break; + } case OMPRTL__tgt_target: { // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t @@ -1904,6 +2338,27 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( return nullptr; } +Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF, + QualType VarType, + StringRef Name) { + llvm::Twine VarName(Name, ".artificial."); + llvm::Type *VarLVType = CGF.ConvertTypeForMem(VarType); + llvm::Value *GAddr = getOrCreateInternalVariable(VarLVType, VarName); + llvm::Value *Args[] = { + emitUpdateLocation(CGF, SourceLocation()), + getThreadID(CGF, SourceLocation()), + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(GAddr, CGM.VoidPtrTy), + CGF.Builder.CreateIntCast(CGF.getTypeSize(VarType), CGM.SizeTy, + /*IsSigned=*/false), + getOrCreateInternalVariable(CGM.VoidPtrPtrTy, VarName + ".cache.")}; + return Address( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitRuntimeCall( + createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args), + VarLVType->getPointerTo(/*AddrSpace=*/0)), + CGM.getPointerAlign()); +} + /// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen /// function. Here is the logic: /// if (Cond) { @@ -2699,6 +3154,8 @@ enum KmpTaskTFields { KmpTaskTStride, /// (Taskloops only) Is last iteration flag. KmpTaskTLastIter, + /// (Taskloops only) Reduction data. + KmpTaskTReductions, }; } // anonymous namespace @@ -3250,6 +3707,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind, // kmp_uint64 ub; // kmp_int64 st; // kmp_int32 liter; + // void * reductions; // }; auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union); UD->startDefinition(); @@ -3273,6 +3731,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind, addFieldToRecordDecl(C, RD, KmpUInt64Ty); addFieldToRecordDecl(C, RD, KmpInt64Ty); addFieldToRecordDecl(C, RD, KmpInt32Ty); + addFieldToRecordDecl(C, RD, C.VoidPtrTy); } RD->completeDefinition(); return RD; @@ -3303,7 +3762,7 @@ createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy, /// TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, tt, /// For taskloops: /// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter, -/// tt->shareds); +/// tt->reductions, tt->shareds); /// return 0; /// } /// \endcode @@ -3389,10 +3848,14 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter); auto LILVal = CGF.EmitLValueForField(Base, *LIFI); auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal(); + auto RFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTReductions); + auto RLVal = CGF.EmitLValueForField(Base, *RFI); + auto *RParam = CGF.EmitLoadOfLValue(RLVal, Loc).getScalarVal(); CallArgs.push_back(LBParam); CallArgs.push_back(UBParam); CallArgs.push_back(StParam); CallArgs.push_back(LIParam); + CallArgs.push_back(RParam); } CallArgs.push_back(SharedsParam); @@ -4155,6 +4618,16 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, cast(cast(D.getStrideVariable())->getDecl()); CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(), 
/*IsInitializer=*/true); + // Store reductions address. + LValue RedLVal = CGF.EmitLValueForField( + Result.TDBase, + *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTReductions)); + if (Data.Reductions) + CGF.EmitStoreOfScalar(Data.Reductions, RedLVal); + else { + CGF.EmitNullInitialization(RedLVal.getAddress(), + CGF.getContext().VoidPtrTy); + } enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 }; llvm::Value *TaskArgs[] = { UpLoc, @@ -4680,6 +5153,353 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); } +/// Generates unique name for artificial threadprivate variables. +/// Format is: <prefix> "." <loc_raw_encoding> "_" <N> +static std::string generateUniqueName(StringRef Prefix, SourceLocation Loc, + unsigned N) { + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + Out << Prefix << "." << Loc.getRawEncoding() << "_" << N; + return Out.str(); +} + +/// Emits reduction initializer function: +/// \code +/// void @.red_init(void* %arg) { +/// %0 = bitcast void* %arg to <type>* +/// store <type> <init>, <type>* %0 +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N) { + auto &C = CGM.getContext(); + FunctionArgList Args; + ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&Param); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_init.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + Address PrivateAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&Param), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable.
+ if (RCG.getSizes(N).second) { + Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + Size = + CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false, + CGM.getContext().getSizeType(), SourceLocation()); + } + RCG.emitAggregateType(CGF, N, Size); + LValue SharedLVal; + // If the initializer uses the initializer from a declare reduction construct, + // emit a pointer to the address of the original reduction item (required by + // the reduction initializer). + if (RCG.usesReductionInitializer(N)) { + Address SharedAddr = + CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().VoidPtrTy, + generateUniqueName("reduction", Loc, N)); + SharedLVal = CGF.MakeAddrLValue(SharedAddr, CGM.getContext().VoidPtrTy); + } else { + SharedLVal = CGF.MakeNaturalAlignAddrLValue( + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + CGM.getContext().VoidPtrTy); + } + // Emit the initializer: + // %0 = bitcast void* %arg to <type>* + // store <type> <init>, <type>* %0 + RCG.emitInitialization(CGF, N, PrivateAddr, SharedLVal, + [](CodeGenFunction &) { return false; }); + CGF.FinishFunction(); + return Fn; +} + +/// Emits reduction combiner function: +/// \code +/// void @.red_comb(void* %arg0, void* %arg1) { +/// %lhs = bitcast void* %arg0 to <type>* +/// %rhs = bitcast void* %arg1 to <type>* +/// %2 = <ReductionOp>(<type>* %lhs, <type>* %rhs) +/// store <type> %2, <type>* %lhs +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N, + const Expr *ReductionOp, + const Expr *LHS, const Expr *RHS, + const Expr *PrivateRef) { + auto &C = CGM.getContext(); + auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(LHS)->getDecl()); + auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(RHS)->getDecl()); + FunctionArgList Args; + ImplicitParamDecl ParamInOut(C, C.VoidPtrTy, ImplicitParamDecl::Other); + ImplicitParamDecl ParamIn(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&ParamInOut); + Args.emplace_back(&ParamIn); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_comb.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable. + if (RCG.getSizes(N).second) { + Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + Size = + CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false, + CGM.getContext().getSizeType(), SourceLocation()); + } + RCG.emitAggregateType(CGF, N, Size); + // Remap lhs and rhs variables to the addresses of the function arguments. + // %lhs = bitcast void* %arg0 to <type>* + // %rhs = bitcast void* %arg1 to <type>* + CodeGenFunction::OMPPrivateScope PrivateScope(CGF); + PrivateScope.addPrivate(LHSVD, [&C, &CGF, &ParamInOut, LHSVD]() -> Address { + // Pull out the pointer to the variable.
+ Address PtrAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&ParamInOut), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + return CGF.Builder.CreateElementBitCast( + PtrAddr, CGF.ConvertTypeForMem(LHSVD->getType())); + }); + PrivateScope.addPrivate(RHSVD, [&C, &CGF, &ParamIn, RHSVD]() -> Address { + // Pull out the pointer to the variable. + Address PtrAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&ParamIn), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + return CGF.Builder.CreateElementBitCast( + PtrAddr, CGF.ConvertTypeForMem(RHSVD->getType())); + }); + PrivateScope.Privatize(); + // Emit the combiner body: + // %2 = <ReductionOp>(<type> *%lhs, <type> *%rhs) + // store <type> %2, <type>* %lhs + CGM.getOpenMPRuntime().emitSingleReductionCombiner( + CGF, ReductionOp, PrivateRef, cast<DeclRefExpr>(LHS), + cast<DeclRefExpr>(RHS)); + CGF.FinishFunction(); + return Fn; +} + +/// Emits reduction finalizer function: +/// \code +/// void @.red_fini(void* %arg) { +/// %0 = bitcast void* %arg to <type>* +/// <destructor>(<type>* %0) +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N) { + if (!RCG.needCleanups(N)) + return nullptr; + auto &C = CGM.getContext(); + FunctionArgList Args; + ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&Param); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_fini.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + Address PrivateAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&Param), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable.
+
+/// Emits reduction finalizer function:
+/// \code
+/// void @.red_fini(void* %arg) {
+///   %0 = bitcast void* %arg to <type>*
+///   <destructor>(<type>* %0)
+///   ret void
+/// }
+/// \endcode
+static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM,
+                                           SourceLocation Loc,
+                                           ReductionCodeGen &RCG, unsigned N) {
+  if (!RCG.needCleanups(N))
+    return nullptr;
+  auto &C = CGM.getContext();
+  FunctionArgList Args;
+  ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  Args.emplace_back(&Param);
+  auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
+                                    ".red_fini.", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
+  CodeGenFunction CGF(CGM);
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
+  Address PrivateAddr = CGF.EmitLoadOfPointer(
+      CGF.GetAddrOfLocalVar(&Param),
+      C.getPointerType(C.VoidPtrTy).castAs<PointerType>());
+  llvm::Value *Size = nullptr;
+  // If the size of the reduction item is non-constant, load it from global
+  // threadprivate variable.
+  if (RCG.getSizes(N).second) {
+    Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    Size =
+        CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false,
+                             CGM.getContext().getSizeType(), SourceLocation());
+  }
+  RCG.emitAggregateType(CGF, N, Size);
+  // Emit the finalizer body:
+  // <destructor>(<type>* %0)
+  RCG.emitCleanups(CGF, N, PrivateAddr);
+  CGF.FinishFunction();
+  return Fn;
+}
+
+llvm::Value *CGOpenMPRuntime::emitTaskReductionInit(
+    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> LHSExprs,
+    ArrayRef<const Expr *> RHSExprs, const OMPTaskDataTy &Data) {
+  if (!CGF.HaveInsertPoint() || Data.ReductionVars.empty())
+    return nullptr;
+
+  // Build typedef struct:
+  // kmp_task_red_input {
+  //   void *reduce_shar; // shared reduction item
+  //   size_t reduce_size; // size of data item
+  //   void *reduce_init; // data initialization routine
+  //   void *reduce_fini; // data finalization routine
+  //   void *reduce_comb; // data combiner routine
+  //   kmp_task_red_flags_t flags; // flags for additional info from compiler
+  // } kmp_task_red_input_t;
+  ASTContext &C = CGM.getContext();
+  auto *RD = C.buildImplicitRecord("kmp_task_red_input_t");
+  RD->startDefinition();
+  const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType());
+  const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy);
+  const FieldDecl *FlagsFD = addFieldToRecordDecl(
+      C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false));
+  RD->completeDefinition();
+  QualType RDType = C.getRecordType(RD);
+  unsigned Size = Data.ReductionVars.size();
+  llvm::APInt ArraySize(/*numBits=*/64, Size);
+  QualType ArrayRDType = C.getConstantArrayType(
+      RDType, ArraySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
+  // kmp_task_red_input_t .rd_input.[Size];
+  Address TaskRedInput = CGF.CreateMemTemp(ArrayRDType, ".rd_input.");
+  ReductionCodeGen RCG(Data.ReductionVars, Data.ReductionCopies,
+                       Data.ReductionOps);
+  for (unsigned Cnt = 0; Cnt < Size; ++Cnt) {
+    // kmp_task_red_input_t &ElemLVal = .rd_input.[Cnt];
+    llvm::Value *Idxs[] = {llvm::ConstantInt::get(CGM.SizeTy, /*V=*/0),
+                           llvm::ConstantInt::get(CGM.SizeTy, Cnt)};
+    llvm::Value *GEP = CGF.EmitCheckedInBoundsGEP(
+        TaskRedInput.getPointer(), Idxs,
+        /*SignedIndices=*/false, /*IsSubtraction=*/false, Loc,
+        ".rd_input.gep.");
+    LValue ElemLVal = CGF.MakeNaturalAlignAddrLValue(GEP, RDType);
+    // ElemLVal.reduce_shar = &Shareds[Cnt];
+    LValue SharedLVal = CGF.EmitLValueForField(ElemLVal, SharedFD);
+    RCG.emitSharedLValue(CGF, Cnt);
+    llvm::Value *CastedShared =
+        CGF.EmitCastToVoidPtr(RCG.getSharedLValue(Cnt).getPointer());
+    CGF.EmitStoreOfScalar(CastedShared, SharedLVal);
+    RCG.emitAggregateType(CGF, Cnt);
+    llvm::Value *SizeValInChars;
+    llvm::Value *SizeVal;
+    std::tie(SizeValInChars, SizeVal) = RCG.getSizes(Cnt);
+    // We use delayed creation/initialization for VLAs, array sections and
+    // custom reduction initializations. It is required because the runtime
+    // does not provide a way to pass the sizes of VLAs/array sections to the
+    // initializer/combiner/finalizer functions, and does not pass the pointer
+    // to the original reduction item to the initializer. Instead,
+    // threadprivate global variables are used to store these values, which
+    // are then read back in those functions.
+    bool DelayedCreation = !!SizeVal;
+    SizeValInChars = CGF.Builder.CreateIntCast(SizeValInChars, CGM.SizeTy,
+                                               /*isSigned=*/false);
+    LValue SizeLVal = CGF.EmitLValueForField(ElemLVal, SizeFD);
+    CGF.EmitStoreOfScalar(SizeValInChars, SizeLVal);
+    // ElemLVal.reduce_init = init;
+    LValue InitLVal = CGF.EmitLValueForField(ElemLVal, InitFD);
+    llvm::Value *InitAddr =
+        CGF.EmitCastToVoidPtr(emitReduceInitFunction(CGM, Loc, RCG, Cnt));
+    CGF.EmitStoreOfScalar(InitAddr, InitLVal);
+    DelayedCreation = DelayedCreation || RCG.usesReductionInitializer(Cnt);
+    // ElemLVal.reduce_fini = fini;
+    LValue FiniLVal = CGF.EmitLValueForField(ElemLVal, FiniFD);
+    llvm::Value *Fini = emitReduceFiniFunction(CGM, Loc, RCG, Cnt);
+    llvm::Value *FiniAddr = Fini
+                                ? CGF.EmitCastToVoidPtr(Fini)
+                                : llvm::ConstantPointerNull::get(CGM.VoidPtrTy);
+    CGF.EmitStoreOfScalar(FiniAddr, FiniLVal);
+    // ElemLVal.reduce_comb = comb;
+    LValue CombLVal = CGF.EmitLValueForField(ElemLVal, CombFD);
+    llvm::Value *CombAddr = CGF.EmitCastToVoidPtr(emitReduceCombFunction(
+        CGM, Loc, RCG, Cnt, Data.ReductionOps[Cnt], LHSExprs[Cnt],
+        RHSExprs[Cnt], Data.ReductionCopies[Cnt]));
+    CGF.EmitStoreOfScalar(CombAddr, CombLVal);
+    // ElemLVal.flags = DelayedCreation ? 1 : 0;
+    LValue FlagsLVal = CGF.EmitLValueForField(ElemLVal, FlagsFD);
+    if (DelayedCreation) {
+      CGF.EmitStoreOfScalar(
+          llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1, /*IsSigned=*/true),
+          FlagsLVal);
+    } else
+      CGF.EmitNullInitialization(FlagsLVal.getAddress(), FlagsLVal.getType());
+  }
+  // Build call void *__kmpc_task_reduction_init(int gtid, int num_data, void
+  // *data);
+  llvm::Value *Args[] = {
+      CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy,
+                                /*isSigned=*/true),
+      llvm::ConstantInt::get(CGM.IntTy, Size, /*isSigned=*/true),
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TaskRedInput.getPointer(),
+                                                      CGM.VoidPtrTy)};
+  return CGF.EmitRuntimeCall(
+      createRuntimeFunction(OMPRTL__kmpc_task_reduction_init), Args);
+}
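Putting the pieces together, the sketch below shows in plain C++ what the descriptor array built by emitTaskReductionInit looks like to the runtime. The struct mirrors the field comments above, but the field types and the runtime prototype are paraphrased assumptions (the authoritative typedef lives in the OpenMP runtime, not in clang), and the helper names reuse the illustrative examples sketched earlier.

#include <cstddef>

struct kmp_task_red_input_t {
  void *reduce_shar;  // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_init;  // data initialization routine
  void *reduce_fini;  // data finalization routine
  void *reduce_comb;  // data combiner routine
  unsigned flags;     // 1 requests delayed creation (VLAs, custom init)
};

extern "C" void *__kmpc_task_reduction_init(int gtid, int num, void *data);
extern "C" void red_init_example(void *);         // from the sketch above
extern "C" void red_comb_example(void *, void *); // from the sketch above

void *register_reduction(int gtid, double &shared_sum) {
  kmp_task_red_input_t rd[1] = {};
  rd[0].reduce_shar = &shared_sum;
  rd[0].reduce_size = sizeof(double);
  rd[0].reduce_init = reinterpret_cast<void *>(&red_init_example);
  rd[0].reduce_fini = nullptr; // trivial type: no finalizer needed
  rd[0].reduce_comb = reinterpret_cast<void *>(&red_comb_example);
  rd[0].flags = 0;
  // Returns the taskgroup reduction descriptor ("tg") used by later calls.
  return __kmpc_task_reduction_init(gtid, 1, rd);
}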
+
+void CGOpenMPRuntime::emitTaskReductionFixups(CodeGenFunction &CGF,
+                                              SourceLocation Loc,
+                                              ReductionCodeGen &RCG,
+                                              unsigned N) {
+  auto Sizes = RCG.getSizes(N);
+  // Emit the threadprivate global variable if the type is non-constant
+  // (Sizes.second != nullptr).
+  if (Sizes.second) {
+    llvm::Value *SizeVal = CGF.Builder.CreateIntCast(Sizes.second, CGM.SizeTy,
+                                                     /*isSigned=*/false);
+    Address SizeAddr = getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().getSizeType(),
+        generateUniqueName("reduction_size", Loc, N));
+    CGF.Builder.CreateStore(SizeVal, SizeAddr, /*IsVolatile=*/false);
+  }
+  // Store the address of the original reduction item if a custom initializer
+  // is used.
+  if (RCG.usesReductionInitializer(N)) {
+    Address SharedAddr = getAddrOfArtificialThreadPrivate(
+        CGF, CGM.getContext().VoidPtrTy,
+        generateUniqueName("reduction", Loc, N));
+    CGF.Builder.CreateStore(
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            RCG.getSharedLValue(N).getPointer(), CGM.VoidPtrTy),
+        SharedAddr, /*IsVolatile=*/false);
+  }
+}
+
+Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF,
+                                              SourceLocation Loc,
+                                              llvm::Value *ReductionsPtr,
+                                              LValue SharedLVal) {
+  // Build call void *__kmpc_task_reduction_get_th_data(int gtid, void *tg,
+  // void *d);
+  llvm::Value *Args[] = {
+      CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy,
+                                /*isSigned=*/true),
+      ReductionsPtr,
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(SharedLVal.getPointer(),
+                                                      CGM.VoidPtrTy)};
+  return Address(
+      CGF.EmitRuntimeCall(
+          createRuntimeFunction(OMPRTL__kmpc_task_reduction_get_th_data), Args),
+      SharedLVal.getAlignment());
+}
+
 void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
                                        SourceLocation Loc) {
   if (!CGF.HaveInsertPoint())
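On the consumer side, getTaskReductionItem reduces to a single runtime call per access. A minimal usage sketch, again with the prototype paraphrased from the runtime interface and illustrative names:

extern "C" void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);

void task_body(int gtid, void *tg, double &shared_sum) {
  // Map the original item to this thread's private copy, then update it;
  // the runtime combines all private copies when the taskgroup finishes.
  double *priv = static_cast<double *>(
      __kmpc_task_reduction_get_th_data(gtid, tg, &shared_sum));
  *priv += 1.0;
}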
diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h
index 6f460f121791..5dcf999bea37 100644
--- a/lib/CodeGen/CGOpenMPRuntime.h
+++ b/lib/CodeGen/CGOpenMPRuntime.h
@@ -96,15 +96,106 @@ struct OMPTaskDataTy final {
   SmallVector<const Expr *, 4> FirstprivateInits;
   SmallVector<const Expr *, 4> LastprivateVars;
   SmallVector<const Expr *, 4> LastprivateCopies;
+  SmallVector<const Expr *, 4> ReductionVars;
+  SmallVector<const Expr *, 4> ReductionCopies;
+  SmallVector<const Expr *, 4> ReductionOps;
   SmallVector<std::pair<OpenMPDependClauseKind, const Expr *>, 4> Dependences;
   llvm::PointerIntPair<llvm::Value *, 1, bool> Final;
   llvm::PointerIntPair<const Expr *, 1, bool> Schedule;
   llvm::PointerIntPair<const Expr *, 1, bool> Priority;
+  llvm::Value *Reductions = nullptr;
   unsigned NumberOfParts = 0;
   bool Tied = true;
   bool Nogroup = false;
 };
+
+/// Class intended to support codegen of all kinds of reduction clauses.
+class ReductionCodeGen {
+private:
+  /// Data required for codegen of reduction clauses.
+  struct ReductionData {
+    /// Reference to the original shared item.
+    const Expr *Ref = nullptr;
+    /// Helper expression for generating the private copy.
+    const Expr *Private = nullptr;
+    /// Helper expression for generating the reduction operation.
+    const Expr *ReductionOp = nullptr;
+    ReductionData(const Expr *Ref, const Expr *Private, const Expr *ReductionOp)
+        : Ref(Ref), Private(Private), ReductionOp(ReductionOp) {}
+  };
+  /// List of reduction-based clauses.
+  SmallVector<ReductionData, 4> ClausesData;
+
+  /// List of addresses of original shared variables/expressions.
+  SmallVector<std::pair<LValue, LValue>, 4> SharedAddresses;
+  /// Sizes of the reduction items in chars.
+  SmallVector<std::pair<llvm::Value *, llvm::Value *>, 4> Sizes;
+  /// Base declarations for the reduction items.
+  SmallVector<const VarDecl *, 4> BaseDecls;
+
+  /// Emits lvalue for the shared expression.
+  LValue emitSharedLValue(CodeGenFunction &CGF, const Expr *E);
+  /// Emits upper bound for the shared expression (if array section).
+  LValue emitSharedLValueUB(CodeGenFunction &CGF, const Expr *E);
+  /// Performs aggregate initialization.
+  /// \param N Number of the reduction item in the common list.
+  /// \param PrivateAddr Address of the corresponding private item.
+  /// \param SharedLVal Address of the original shared variable.
+  /// \param DRD Declare reduction construct used for the reduction item.
+  void emitAggregateInitialization(CodeGenFunction &CGF, unsigned N,
+                                   Address PrivateAddr, LValue SharedLVal,
+                                   const OMPDeclareReductionDecl *DRD);
+
+public:
+  ReductionCodeGen(ArrayRef<const Expr *> Shareds,
+                   ArrayRef<const Expr *> Privates,
+                   ArrayRef<const Expr *> ReductionOps);
+  /// Emits lvalue for a reduction item.
+  /// \param N Number of the reduction item.
+  void emitSharedLValue(CodeGenFunction &CGF, unsigned N);
+  /// Emits the code for the variable-modified type, if required.
+  /// \param N Number of the reduction item.
+  void emitAggregateType(CodeGenFunction &CGF, unsigned N);
+  /// Emits the code for the variable-modified type, if required.
+  /// \param N Number of the reduction item.
+  /// \param Size Size of the type in chars.
+  void emitAggregateType(CodeGenFunction &CGF, unsigned N, llvm::Value *Size);
+  /// Performs initialization of the private copy for the reduction item.
+  /// \param N Number of the reduction item.
+  /// \param PrivateAddr Address of the corresponding private item.
+  /// \param DefaultInit Default initialization sequence that should be
+  /// performed if no reduction specific initialization is found.
+  /// \param SharedLVal Address of the original shared variable.
+  void
+  emitInitialization(CodeGenFunction &CGF, unsigned N, Address PrivateAddr,
+                     LValue SharedLVal,
+                     llvm::function_ref<bool(CodeGenFunction &)> DefaultInit);
+  /// Returns true if the private copy requires cleanups.
+  bool needCleanups(unsigned N);
+  /// Emits cleanup code for the reduction item.
+  /// \param N Number of the reduction item.
+  /// \param PrivateAddr Address of the corresponding private item.
+  void emitCleanups(CodeGenFunction &CGF, unsigned N, Address PrivateAddr);
+  /// Adjusts \p PrivateAddr to be used instead of the original variable
+  /// address in normal operations.
+  /// \param N Number of the reduction item.
+  /// \param PrivateAddr Address of the corresponding private item.
+  Address adjustPrivateAddress(CodeGenFunction &CGF, unsigned N,
+                               Address PrivateAddr);
+  /// Returns LValue for the reduction item.
+  LValue getSharedLValue(unsigned N) const { return SharedAddresses[N].first; }
+  /// Returns the size of the reduction item (in chars and total number of
+  /// elements in the item); the second element of the pair is nullptr if the
+  /// size is a constant.
+  std::pair<llvm::Value *, llvm::Value *> getSizes(unsigned N) const {
+    return Sizes[N];
+  }
+  /// Returns the base declaration of the reduction item.
+  const VarDecl *getBaseDecl(unsigned N) const { return BaseDecls[N]; }
+  /// Returns true if the initialization of the reduction item uses the
+  /// initializer from the declare reduction construct.
+  bool usesReductionInitializer(unsigned N) const;
+};
+
 class CGOpenMPRuntime {
 protected:
   CodeGenModule &CGM;
@@ -121,7 +212,7 @@ class CGOpenMPRuntime {
   /// \param OutlinedFnID Outlined function ID value to be defined by this call.
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
   /// \param CodeGen Lambda codegen specific to an accelerator device.
-  /// An oulined function may not be an entry if, e.g. the if clause always
+  /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
   virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D,
                                                 StringRef ParentName,
@@ -699,7 +790,7 @@ class CGOpenMPRuntime {
   /// \param Loc Clang source location.
   /// \param ScheduleKind Schedule kind, specified by the 'schedule' clause.
   /// \param IVSize Size of the iteration variable in bits.
-  /// \param IVSigned Sign of the interation variable.
+  /// \param IVSigned Sign of the iteration variable.
   /// \param Ordered true if loop is ordered, false otherwise.
   /// \param DispatchValues struct containing llvm values for lower bound, upper
   /// bound, and chunk expression.
@@ -723,7 +814,7 @@ class CGOpenMPRuntime {
   /// \param Loc Clang source location.
   /// \param ScheduleKind Schedule kind, specified by the 'schedule' clause.
   /// \param IVSize Size of the iteration variable in bits.
-  /// \param IVSigned Sign of the interation variable.
+  /// \param IVSigned Sign of the iteration variable.
   /// \param Ordered true if loop is ordered, false otherwise.
   /// \param IL Address of the output variable in which the flag of the
   /// last iteration is returned.
@@ -732,7 +823,7 @@ class CGOpenMPRuntime {
   /// \param UB Address of the output variable in which the upper iteration
   /// number is returned.
   /// \param ST Address of the output variable in which the stride value is
-  /// returned nesessary to generated the static_chunked scheduled loop.
+  /// returned, necessary to generate the static_chunked scheduled loop.
   /// \param Chunk Value of the chunk for the static_chunked scheduled loop.
   /// For the default (nullptr) value, the chunk 1 will be used.
   ///
@@ -747,7 +838,7 @@ class CGOpenMPRuntime {
   /// \param Loc Clang source location.
   /// \param SchedKind Schedule kind, specified by the 'dist_schedule' clause.
   /// \param IVSize Size of the iteration variable in bits.
-  /// \param IVSigned Sign of the interation variable.
+  /// \param IVSigned Sign of the iteration variable.
   /// \param Ordered true if loop is ordered, false otherwise.
   /// \param IL Address of the output variable in which the flag of the
   /// last iteration is returned.
@@ -756,7 +847,7 @@ class CGOpenMPRuntime {
   /// \param UB Address of the output variable in which the upper iteration
   /// number is returned.
   /// \param ST Address of the output variable in which the stride value is
-  /// returned nesessary to generated the static_chunked scheduled loop.
+  /// returned, necessary to generate the static_chunked scheduled loop.
   /// \param Chunk Value of the chunk for the static_chunked scheduled loop.
   /// For the default (nullptr) value, the chunk 1 will be used.
   ///
@@ -773,7 +864,7 @@ class CGOpenMPRuntime {
   /// \param CGF Reference to current CodeGenFunction.
   /// \param Loc Clang source location.
   /// \param IVSize Size of the iteration variable in bits.
-  /// \param IVSigned Sign of the interation variable.
+  /// \param IVSigned Sign of the iteration variable.
   ///
   virtual void emitForOrderedIterationEnd(CodeGenFunction &CGF,
                                           SourceLocation Loc, unsigned IVSize,
@@ -792,7 +883,7 @@ class CGOpenMPRuntime {
   /// kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
   /// kmp_int[32|64] *p_stride);
   /// \param IVSize Size of the iteration variable in bits.
-  /// \param IVSigned Sign of the interation variable.
+  /// \param IVSigned Sign of the iteration variable.
   /// \param IL Address of the output variable in which the flag of the
   /// last iteration is returned.
   /// \param LB Address of the output variable in which the lower iteration
@@ -844,6 +935,14 @@ class CGOpenMPRuntime {
                                      SourceLocation Loc, bool PerformInit,
                                      CodeGenFunction *CGF = nullptr);
+  /// Creates artificial threadprivate variable with name \p Name and type \p
+  /// VarType.
+  /// \param VarType Type of the artificial threadprivate variable.
+  /// \param Name Name of the artificial threadprivate variable.
+  virtual Address getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
+                                                   QualType VarType,
+                                                   StringRef Name);
+
   /// \brief Emit flush of the variables specified in 'omp flush' directive.
   /// \param Vars List of variables to flush.
   virtual void emitFlush(CodeGenFunction &CGF, ArrayRef<const Expr *> Vars,
@@ -1002,6 +1101,51 @@ class CGOpenMPRuntime {
                              ArrayRef<const Expr *> ReductionOps,
                              ReductionOptionsTy Options);
+  /// Emit code for the initialization of a task reduction clause. The
+  /// following code should be emitted for the reduction:
+  /// \code
+  ///
+  /// _task_red_item_t red_data[n];
+  /// ...
+  /// red_data[i].shar = &origs[i];
+  /// red_data[i].size = sizeof(origs[i]);
+  /// red_data[i].f_init = (void*)RedInit<MyType>;
+  /// red_data[i].f_fini = (void*)RedDest<MyType>;
+  /// red_data[i].f_comb = (void*)RedOp<MyType>;
+  /// red_data[i].flags = <Flag>;
+  /// ...
+  /// void* tg1 = __kmpc_task_reduction_init(gtid, n, red_data);
+  /// \endcode
+  ///
+  /// \param LHSExprs List of LHS in \a Data.ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a Data.ReductionOps reduction operations.
+  /// \param Data Additional data for task generation like tiedness, final
+  /// state, list of privates, reductions etc.
+  virtual llvm::Value *emitTaskReductionInit(CodeGenFunction &CGF,
+                                             SourceLocation Loc,
+                                             ArrayRef<const Expr *> LHSExprs,
+                                             ArrayRef<const Expr *> RHSExprs,
+                                             const OMPTaskDataTy &Data);
+
+  /// Required to resolve existing problems in the runtime. Emits threadprivate
+  /// variables to store the size of the VLAs/array sections for the
+  /// initializer/combiner/finalizer functions, and emits a threadprivate
+  /// variable to store the pointer to the original reduction item for the
+  /// custom initializer defined by the declare reduction construct.
+  /// \param RCG Allows reusing existing data for the reductions.
+  /// \param N Reduction item for which fixups must be emitted.
+  virtual void emitTaskReductionFixups(CodeGenFunction &CGF, SourceLocation Loc,
+                                       ReductionCodeGen &RCG, unsigned N);
+
+  /// Gets the address of the `void *`-typed private copy of the reduction item
+  /// specified by \p SharedLVal.
+  /// \param ReductionsPtr Pointer to the reduction data returned by the
+  /// emitTaskReductionInit function.
+  /// \param SharedLVal Address of the original reduction item.
+  virtual Address getTaskReductionItem(CodeGenFunction &CGF, SourceLocation Loc,
+                                       llvm::Value *ReductionsPtr,
+                                       LValue SharedLVal);
+
   /// \brief Emit code for 'taskwait' directive.
   virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc);
@@ -1029,7 +1173,7 @@ class CGOpenMPRuntime {
   /// \param OutlinedFnID Outlined function ID value to be defined by this call.
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
   /// \param CodeGen Code generation sequence for the \a D directive.
-  /// An oulined function may not be an entry if, e.g. the if clause always
+  /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
   virtual void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
                                           StringRef ParentName,
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index 71797e2e6fbe..6135cf31d176 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -549,156 +549,6 @@ void CodeGenFunction::EmitOMPAggregateAssign(
   EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
-/// Check if the combiner is a call to UDR combiner and if it is so return the
-/// UDR decl used for reduction.
-static const OMPDeclareReductionDecl * -getReductionInit(const Expr *ReductionOp) { - if (auto *CE = dyn_cast(ReductionOp)) - if (auto *OVE = dyn_cast(CE->getCallee())) - if (auto *DRE = - dyn_cast(OVE->getSourceExpr()->IgnoreImpCasts())) - if (auto *DRD = dyn_cast(DRE->getDecl())) - return DRD; - return nullptr; -} - -static void emitInitWithReductionInitializer(CodeGenFunction &CGF, - const OMPDeclareReductionDecl *DRD, - const Expr *InitOp, - Address Private, Address Original, - QualType Ty) { - if (DRD->getInitializer()) { - std::pair Reduction = - CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD); - auto *CE = cast(InitOp); - auto *OVE = cast(CE->getCallee()); - const Expr *LHS = CE->getArg(/*Arg=*/0)->IgnoreParenImpCasts(); - const Expr *RHS = CE->getArg(/*Arg=*/1)->IgnoreParenImpCasts(); - auto *LHSDRE = cast(cast(LHS)->getSubExpr()); - auto *RHSDRE = cast(cast(RHS)->getSubExpr()); - CodeGenFunction::OMPPrivateScope PrivateScope(CGF); - PrivateScope.addPrivate(cast(LHSDRE->getDecl()), - [=]() -> Address { return Private; }); - PrivateScope.addPrivate(cast(RHSDRE->getDecl()), - [=]() -> Address { return Original; }); - (void)PrivateScope.Privatize(); - RValue Func = RValue::get(Reduction.second); - CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func); - CGF.EmitIgnoredExpr(InitOp); - } else { - llvm::Constant *Init = CGF.CGM.EmitNullConstant(Ty); - auto *GV = new llvm::GlobalVariable( - CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true, - llvm::GlobalValue::PrivateLinkage, Init, ".init"); - LValue LV = CGF.MakeNaturalAlignAddrLValue(GV, Ty); - RValue InitRVal; - switch (CGF.getEvaluationKind(Ty)) { - case TEK_Scalar: - InitRVal = CGF.EmitLoadOfLValue(LV, SourceLocation()); - break; - case TEK_Complex: - InitRVal = - RValue::getComplex(CGF.EmitLoadOfComplex(LV, SourceLocation())); - break; - case TEK_Aggregate: - InitRVal = RValue::getAggregate(LV.getAddress()); - break; - } - OpaqueValueExpr OVE(SourceLocation(), Ty, VK_RValue); - CodeGenFunction::OpaqueValueMapping OpaqueMap(CGF, &OVE, InitRVal); - CGF.EmitAnyExprToMem(&OVE, Private, Ty.getQualifiers(), - /*IsInitializer=*/false); - } -} - -/// \brief Emit initialization of arrays of complex types. -/// \param DestAddr Address of the array. -/// \param Type Type of array. -/// \param Init Initial expression of array. -/// \param SrcAddr Address of the original array. -static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr, - QualType Type, const Expr *Init, - Address SrcAddr = Address::invalid()) { - auto *DRD = getReductionInit(Init); - // Perform element-by-element initialization. - QualType ElementTy; - - // Drill down to the base element type on both arrays. - auto ArrayTy = Type->getAsArrayTypeUnsafe(); - auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, DestAddr); - DestAddr = - CGF.Builder.CreateElementBitCast(DestAddr, DestAddr.getElementType()); - if (DRD) - SrcAddr = - CGF.Builder.CreateElementBitCast(SrcAddr, DestAddr.getElementType()); - - llvm::Value *SrcBegin = nullptr; - if (DRD) - SrcBegin = SrcAddr.getPointer(); - auto DestBegin = DestAddr.getPointer(); - // Cast from pointer to array type to pointer to single element. - auto DestEnd = CGF.Builder.CreateGEP(DestBegin, NumElements); - // The basic structure here is a while-do loop. 
- auto BodyBB = CGF.createBasicBlock("omp.arrayinit.body"); - auto DoneBB = CGF.createBasicBlock("omp.arrayinit.done"); - auto IsEmpty = - CGF.Builder.CreateICmpEQ(DestBegin, DestEnd, "omp.arrayinit.isempty"); - CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); - - // Enter the loop body, making that address the current address. - auto EntryBB = CGF.Builder.GetInsertBlock(); - CGF.EmitBlock(BodyBB); - - CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy); - - llvm::PHINode *SrcElementPHI = nullptr; - Address SrcElementCurrent = Address::invalid(); - if (DRD) { - SrcElementPHI = CGF.Builder.CreatePHI(SrcBegin->getType(), 2, - "omp.arraycpy.srcElementPast"); - SrcElementPHI->addIncoming(SrcBegin, EntryBB); - SrcElementCurrent = - Address(SrcElementPHI, - SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); - } - llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI( - DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); - DestElementPHI->addIncoming(DestBegin, EntryBB); - Address DestElementCurrent = - Address(DestElementPHI, - DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); - - // Emit copy. - { - CodeGenFunction::RunCleanupsScope InitScope(CGF); - if (DRD && (DRD->getInitializer() || !Init)) { - emitInitWithReductionInitializer(CGF, DRD, Init, DestElementCurrent, - SrcElementCurrent, ElementTy); - } else - CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(), - /*IsInitializer=*/false); - } - - if (DRD) { - // Shift the address forward by one element. - auto SrcElementNext = CGF.Builder.CreateConstGEP1_32( - SrcElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); - SrcElementPHI->addIncoming(SrcElementNext, CGF.Builder.GetInsertBlock()); - } - - // Shift the address forward by one element. - auto DestElementNext = CGF.Builder.CreateConstGEP1_32( - DestElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); - // Check whether we've reached the end. - auto Done = - CGF.Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); - CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB); - DestElementPHI->addIncoming(DestElementNext, CGF.Builder.GetInsertBlock()); - - // Done. 
- CGF.EmitBlock(DoneBB, /*IsFinished=*/true); -} - void CodeGenFunction::EmitOMPCopy(QualType OriginalType, Address DestAddr, Address SrcAddr, const VarDecl *DestVD, const VarDecl *SrcVD, const Expr *Copy) { @@ -1051,254 +901,107 @@ void CodeGenFunction::EmitOMPLastprivateClauseFinal( EmitBlock(DoneBB, /*IsFinished=*/true); } -static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, - LValue BaseLV, llvm::Value *Addr) { - Address Tmp = Address::invalid(); - Address TopTmp = Address::invalid(); - Address MostTopTmp = Address::invalid(); - BaseTy = BaseTy.getNonReferenceType(); - while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && - !CGF.getContext().hasSameType(BaseTy, ElTy)) { - Tmp = CGF.CreateMemTemp(BaseTy); - if (TopTmp.isValid()) - CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp); - else - MostTopTmp = Tmp; - TopTmp = Tmp; - BaseTy = BaseTy->getPointeeType(); - } - llvm::Type *Ty = BaseLV.getPointer()->getType(); - if (Tmp.isValid()) - Ty = Tmp.getElementType(); - Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Ty); - if (Tmp.isValid()) { - CGF.Builder.CreateStore(Addr, Tmp); - return MostTopTmp; - } - return Address(Addr, BaseLV.getAlignment()); -} - -static LValue loadToBegin(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, - LValue BaseLV) { - BaseTy = BaseTy.getNonReferenceType(); - while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && - !CGF.getContext().hasSameType(BaseTy, ElTy)) { - if (auto *PtrTy = BaseTy->getAs()) - BaseLV = CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(), PtrTy); - else { - BaseLV = CGF.EmitLoadOfReferenceLValue(BaseLV.getAddress(), - BaseTy->castAs()); - } - BaseTy = BaseTy->getPointeeType(); - } - return CGF.MakeAddrLValue( - Address( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - BaseLV.getPointer(), CGF.ConvertTypeForMem(ElTy)->getPointerTo()), - BaseLV.getAlignment()), - BaseLV.getType(), BaseLV.getBaseInfo()); -} - void CodeGenFunction::EmitOMPReductionClauseInit( const OMPExecutableDirective &D, CodeGenFunction::OMPPrivateScope &PrivateScope) { if (!HaveInsertPoint()) return; + SmallVector Shareds; + SmallVector Privates; + SmallVector ReductionOps; + SmallVector LHSs; + SmallVector RHSs; for (const auto *C : D.getClausesOfKind()) { - auto ILHS = C->lhs_exprs().begin(); - auto IRHS = C->rhs_exprs().begin(); auto IPriv = C->privates().begin(); auto IRed = C->reduction_ops().begin(); - for (auto IRef : C->varlists()) { - auto *LHSVD = cast(cast(*ILHS)->getDecl()); - auto *RHSVD = cast(cast(*IRHS)->getDecl()); - auto *PrivateVD = cast(cast(*IPriv)->getDecl()); - auto *DRD = getReductionInit(*IRed); - if (auto *OASE = dyn_cast(IRef)) { - auto *Base = OASE->getBase()->IgnoreParenImpCasts(); - while (auto *TempOASE = dyn_cast(Base)) - Base = TempOASE->getBase()->IgnoreParenImpCasts(); - while (auto *TempASE = dyn_cast(Base)) - Base = TempASE->getBase()->IgnoreParenImpCasts(); - auto *DE = cast(Base); - auto *OrigVD = cast(DE->getDecl()); - auto OASELValueLB = EmitOMPArraySectionExpr(OASE); - auto OASELValueUB = - EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false); - auto OriginalBaseLValue = EmitLValue(DE); - LValue BaseLValue = - loadToBegin(*this, OrigVD->getType(), OASELValueLB.getType(), - OriginalBaseLValue); - // Store the address of the original variable associated with the LHS - // implicit variable. - PrivateScope.addPrivate(LHSVD, [OASELValueLB]() -> Address { - return OASELValueLB.getAddress(); - }); - // Emit reduction copy. 
- bool IsRegistered = PrivateScope.addPrivate( - OrigVD, [this, OrigVD, PrivateVD, BaseLValue, OASELValueLB, - OASELValueUB, OriginalBaseLValue, DRD, IRed]() -> Address { - // Emit VarDecl with copy init for arrays. - // Get the address of the original variable captured in current - // captured region. - auto *Size = Builder.CreatePtrDiff(OASELValueUB.getPointer(), - OASELValueLB.getPointer()); - Size = Builder.CreateNUWAdd( - Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1)); - CodeGenFunction::OpaqueValueMapping OpaqueMap( - *this, cast( - getContext() - .getAsVariableArrayType(PrivateVD->getType()) - ->getSizeExpr()), - RValue::get(Size)); - EmitVariablyModifiedType(PrivateVD->getType()); - auto Emission = EmitAutoVarAlloca(*PrivateVD); - auto Addr = Emission.getAllocatedAddress(); - auto *Init = PrivateVD->getInit(); - EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(), - DRD ? *IRed : Init, - OASELValueLB.getAddress()); - EmitAutoVarCleanups(Emission); - // Emit private VarDecl with reduction init. - auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(), - OASELValueLB.getPointer()); - auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset); - return castToBase(*this, OrigVD->getType(), - OASELValueLB.getType(), OriginalBaseLValue, - Ptr); - }); - assert(IsRegistered && "private var already registered as private"); - // Silence the warning about unused variable. - (void)IsRegistered; - PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address { - return GetAddrOfLocalVar(PrivateVD); - }); - } else if (auto *ASE = dyn_cast(IRef)) { - auto *Base = ASE->getBase()->IgnoreParenImpCasts(); - while (auto *TempASE = dyn_cast(Base)) - Base = TempASE->getBase()->IgnoreParenImpCasts(); - auto *DE = cast(Base); - auto *OrigVD = cast(DE->getDecl()); - auto ASELValue = EmitLValue(ASE); - auto OriginalBaseLValue = EmitLValue(DE); - LValue BaseLValue = loadToBegin( - *this, OrigVD->getType(), ASELValue.getType(), OriginalBaseLValue); - // Store the address of the original variable associated with the LHS - // implicit variable. - PrivateScope.addPrivate( - LHSVD, [ASELValue]() -> Address { return ASELValue.getAddress(); }); - // Emit reduction copy. - bool IsRegistered = PrivateScope.addPrivate( - OrigVD, [this, OrigVD, PrivateVD, BaseLValue, ASELValue, - OriginalBaseLValue, DRD, IRed]() -> Address { - // Emit private VarDecl with reduction init. - AutoVarEmission Emission = EmitAutoVarAlloca(*PrivateVD); - auto Addr = Emission.getAllocatedAddress(); - if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { - emitInitWithReductionInitializer(*this, DRD, *IRed, Addr, - ASELValue.getAddress(), - ASELValue.getType()); - } else - EmitAutoVarInit(Emission); - EmitAutoVarCleanups(Emission); - auto *Offset = Builder.CreatePtrDiff(BaseLValue.getPointer(), - ASELValue.getPointer()); - auto *Ptr = Builder.CreateGEP(Addr.getPointer(), Offset); - return castToBase(*this, OrigVD->getType(), ASELValue.getType(), - OriginalBaseLValue, Ptr); - }); - assert(IsRegistered && "private var already registered as private"); - // Silence the warning about unused variable. 
- (void)IsRegistered; - PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() -> Address { - return Builder.CreateElementBitCast( - GetAddrOfLocalVar(PrivateVD), ConvertTypeForMem(RHSVD->getType()), - "rhs.begin"); - }); - } else { - auto *OrigVD = cast(cast(IRef)->getDecl()); - QualType Type = PrivateVD->getType(); - if (getContext().getAsArrayType(Type)) { - // Store the address of the original variable associated with the LHS - // implicit variable. - DeclRefExpr DRE(const_cast(OrigVD), - CapturedStmtInfo->lookup(OrigVD) != nullptr, - IRef->getType(), VK_LValue, IRef->getExprLoc()); - Address OriginalAddr = EmitLValue(&DRE).getAddress(); - PrivateScope.addPrivate(LHSVD, [this, &OriginalAddr, - LHSVD]() -> Address { - OriginalAddr = Builder.CreateElementBitCast( - OriginalAddr, ConvertTypeForMem(LHSVD->getType()), "lhs.begin"); - return OriginalAddr; - }); - bool IsRegistered = PrivateScope.addPrivate(OrigVD, [&]() -> Address { - if (Type->isVariablyModifiedType()) { - CodeGenFunction::OpaqueValueMapping OpaqueMap( - *this, cast( - getContext() - .getAsVariableArrayType(PrivateVD->getType()) - ->getSizeExpr()), - RValue::get( - getTypeSize(OrigVD->getType().getNonReferenceType()))); - EmitVariablyModifiedType(Type); - } - auto Emission = EmitAutoVarAlloca(*PrivateVD); - auto Addr = Emission.getAllocatedAddress(); - auto *Init = PrivateVD->getInit(); - EmitOMPAggregateInit(*this, Addr, PrivateVD->getType(), - DRD ? *IRed : Init, OriginalAddr); - EmitAutoVarCleanups(Emission); - return Emission.getAllocatedAddress(); - }); - assert(IsRegistered && "private var already registered as private"); - // Silence the warning about unused variable. - (void)IsRegistered; - PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() -> Address { - return Builder.CreateElementBitCast( - GetAddrOfLocalVar(PrivateVD), - ConvertTypeForMem(RHSVD->getType()), "rhs.begin"); - }); - } else { - // Store the address of the original variable associated with the LHS - // implicit variable. - Address OriginalAddr = Address::invalid(); - PrivateScope.addPrivate(LHSVD, [this, OrigVD, IRef, - &OriginalAddr]() -> Address { - DeclRefExpr DRE(const_cast(OrigVD), - CapturedStmtInfo->lookup(OrigVD) != nullptr, - IRef->getType(), VK_LValue, IRef->getExprLoc()); - OriginalAddr = EmitLValue(&DRE).getAddress(); - return OriginalAddr; - }); - // Emit reduction copy. - bool IsRegistered = PrivateScope.addPrivate( - OrigVD, [this, PrivateVD, OriginalAddr, DRD, IRed]() -> Address { - // Emit private VarDecl with reduction init. - AutoVarEmission Emission = EmitAutoVarAlloca(*PrivateVD); - auto Addr = Emission.getAllocatedAddress(); - if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { - emitInitWithReductionInitializer(*this, DRD, *IRed, Addr, - OriginalAddr, - PrivateVD->getType()); - } else - EmitAutoVarInit(Emission); - EmitAutoVarCleanups(Emission); - return Addr; - }); - assert(IsRegistered && "private var already registered as private"); - // Silence the warning about unused variable. 
-          (void)IsRegistered;
-          PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address {
-            return GetAddrOfLocalVar(PrivateVD);
-          });
-        }
-      }
-      ++ILHS;
-      ++IRHS;
-      ++IPriv;
-      ++IRed;
+    auto ILHS = C->lhs_exprs().begin();
+    auto IRHS = C->rhs_exprs().begin();
+    for (const auto *Ref : C->varlists()) {
+      Shareds.emplace_back(Ref);
+      Privates.emplace_back(*IPriv);
+      ReductionOps.emplace_back(*IRed);
+      LHSs.emplace_back(*ILHS);
+      RHSs.emplace_back(*IRHS);
+      std::advance(IPriv, 1);
+      std::advance(IRed, 1);
+      std::advance(ILHS, 1);
+      std::advance(IRHS, 1);
     }
   }
+  ReductionCodeGen RedCG(Shareds, Privates, ReductionOps);
+  unsigned Count = 0;
+  auto ILHS = LHSs.begin();
+  auto IRHS = RHSs.begin();
+  auto IPriv = Privates.begin();
+  for (const auto *IRef : Shareds) {
+    auto *PrivateVD = cast<VarDecl>(cast<DeclRefExpr>(*IPriv)->getDecl());
+    // Emit private VarDecl with reduction init.
+    RedCG.emitSharedLValue(*this, Count);
+    RedCG.emitAggregateType(*this, Count);
+    auto Emission = EmitAutoVarAlloca(*PrivateVD);
+    RedCG.emitInitialization(*this, Count, Emission.getAllocatedAddress(),
+                             RedCG.getSharedLValue(Count),
+                             [&Emission](CodeGenFunction &CGF) {
+                               CGF.EmitAutoVarInit(Emission);
+                               return true;
+                             });
+    EmitAutoVarCleanups(Emission);
+    Address BaseAddr = RedCG.adjustPrivateAddress(
+        *this, Count, Emission.getAllocatedAddress());
+    bool IsRegistered = PrivateScope.addPrivate(
+        RedCG.getBaseDecl(Count), [BaseAddr]() -> Address { return BaseAddr; });
+    assert(IsRegistered && "private var already registered as private");
+    // Silence the warning about unused variable.
+    (void)IsRegistered;
+
+    auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+    auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+    if (isa<OMPArraySectionExpr>(IRef)) {
+      // Store the address of the original variable associated with the LHS
+      // implicit variable.
+      PrivateScope.addPrivate(LHSVD, [&RedCG, Count]() -> Address {
+        return RedCG.getSharedLValue(Count).getAddress();
+      });
+      PrivateScope.addPrivate(RHSVD, [this, PrivateVD]() -> Address {
+        return GetAddrOfLocalVar(PrivateVD);
+      });
+    } else if (isa<ArraySubscriptExpr>(IRef)) {
+      // Store the address of the original variable associated with the LHS
+      // implicit variable.
+      PrivateScope.addPrivate(LHSVD, [&RedCG, Count]() -> Address {
+        return RedCG.getSharedLValue(Count).getAddress();
+      });
+      PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() -> Address {
+        return Builder.CreateElementBitCast(GetAddrOfLocalVar(PrivateVD),
+                                            ConvertTypeForMem(RHSVD->getType()),
+                                            "rhs.begin");
+      });
+    } else {
+      QualType Type = PrivateVD->getType();
+      bool IsArray = getContext().getAsArrayType(Type) != nullptr;
+      Address OriginalAddr = RedCG.getSharedLValue(Count).getAddress();
+      // Store the address of the original variable associated with the LHS
+      // implicit variable.
+      if (IsArray) {
+        OriginalAddr = Builder.CreateElementBitCast(
+            OriginalAddr, ConvertTypeForMem(LHSVD->getType()), "lhs.begin");
+      }
+      PrivateScope.addPrivate(
+          LHSVD, [OriginalAddr]() -> Address { return OriginalAddr; });
+      PrivateScope.addPrivate(
+          RHSVD, [this, PrivateVD, RHSVD, IsArray]() -> Address {
+            return IsArray
+                       ? Builder.CreateElementBitCast(
+                             GetAddrOfLocalVar(PrivateVD),
+                             ConvertTypeForMem(RHSVD->getType()), "rhs.begin")
+                       : GetAddrOfLocalVar(PrivateVD);
+          });
+    }
+    ++ILHS;
+    ++IRHS;
+    ++IPriv;
+    ++Count;
+  }
 }
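At the source level, the sequence the loop above arranges for a scalar reduction(+ : sum) item corresponds roughly to the following hand-written shape; an analogue for exposition, not compiler output.

void reduction_privatization_shape(double &sum /* original shared variable */) {
  // EmitAutoVarAlloca + RedCG.emitInitialization: private copy, identity init.
  double sum_priv = 0.0;
  // Region body: references to 'sum' are remapped to 'sum_priv' through
  // PrivateScope (RedCG.getBaseDecl / adjustPrivateAddress).
  for (int i = 0; i < 100; ++i)
    sum_priv += 1.0;
  // EmitOMPReductionClauseFinal: the lhs/rhs remapping feeds the combiner
  // expression, conceptually 'sum = sum + sum_priv'.
  sum = sum + sum_priv;
}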
 
 void CodeGenFunction::EmitOMPReductionClauseFinal(
@@ -2994,11 +2697,32 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
       ++ID;
     }
   }
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+    auto IPriv = C->privates().begin();
+    auto IRed = C->reduction_ops().begin();
+    auto ILHS = C->lhs_exprs().begin();
+    auto IRHS = C->rhs_exprs().begin();
+    for (const auto *Ref : C->varlists()) {
+      Data.ReductionVars.emplace_back(Ref);
+      Data.ReductionCopies.emplace_back(*IPriv);
+      Data.ReductionOps.emplace_back(*IRed);
+      LHSs.emplace_back(*ILHS);
+      RHSs.emplace_back(*IRHS);
+      std::advance(IPriv, 1);
+      std::advance(IRed, 1);
+      std::advance(ILHS, 1);
+      std::advance(IRHS, 1);
+    }
+  }
+  Data.Reductions = CGM.getOpenMPRuntime().emitTaskReductionInit(
+      *this, S.getLocStart(), LHSs, RHSs, Data);
   // Build list of dependences.
   for (const auto *C : S.getClausesOfKind<OMPDependClause>())
     for (auto *IRef : C->varlists())
       Data.Dependences.push_back(std::make_pair(C->getDependencyKind(), IRef));
-  auto &&CodeGen = [&Data, CS, &BodyGen, &LastprivateDstsOrigs](
+  auto &&CodeGen = [&Data, &S, CS, &BodyGen, &LastprivateDstsOrigs](
                        CodeGenFunction &CGF, PrePostActionTy &Action) {
     // Set proper addresses for generated private copies.
     OMPPrivateScope Scope(CGF);
@@ -3053,6 +2777,34 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(const OMPExecutableDirective &S,
         Scope.addPrivate(Pair.first, [Replacement]() { return Replacement; });
       }
     }
+    if (Data.Reductions) {
+      OMPLexicalScope LexScope(CGF, S, /*AsInlined=*/true);
+      ReductionCodeGen RedCG(Data.ReductionVars, Data.ReductionCopies,
+                             Data.ReductionOps);
+      llvm::Value *ReductionsPtr = CGF.Builder.CreateLoad(
+          CGF.GetAddrOfLocalVar(CS->getCapturedDecl()->getParam(9)));
+      for (unsigned Cnt = 0, E = Data.ReductionVars.size(); Cnt < E; ++Cnt) {
+        RedCG.emitSharedLValue(CGF, Cnt);
+        RedCG.emitAggregateType(CGF, Cnt);
+        Address Replacement = CGF.CGM.getOpenMPRuntime().getTaskReductionItem(
+            CGF, S.getLocStart(), ReductionsPtr, RedCG.getSharedLValue(Cnt));
+        Replacement =
+            Address(CGF.EmitScalarConversion(
+                        Replacement.getPointer(), CGF.getContext().VoidPtrTy,
+                        CGF.getContext().getPointerType(
+                            Data.ReductionCopies[Cnt]->getType()),
+                        SourceLocation()),
+                    Replacement.getAlignment());
+        Replacement = RedCG.adjustPrivateAddress(CGF, Cnt, Replacement);
+        Scope.addPrivate(RedCG.getBaseDecl(Cnt),
+                         [Replacement]() { return Replacement; });
+        // FIXME: This must be removed once the runtime library is fixed.
+        // Emit required threadprivate variables for
+        // initializer/combiner/finalizer.
+        CGF.CGM.getOpenMPRuntime().emitTaskReductionFixups(CGF, S.getLocStart(),
+                                                           RedCG, Cnt);
+      }
+    }
     (void)Scope.Privatize();
     Action.Enter(CGF);
@@ -3714,6 +3466,7 @@ static void EmitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind,
   case OMPC_firstprivate:
   case OMPC_lastprivate:
   case OMPC_reduction:
+  case OMPC_task_reduction:
   case OMPC_safelen:
   case OMPC_simdlen:
   case OMPC_collapse:
diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h
index 5933e029be8d..753dd92f3071 100644
--- a/lib/CodeGen/CodeGenFunction.h
+++ b/lib/CodeGen/CodeGenFunction.h
@@ -3589,12 +3589,19 @@ class CodeGenFunction : public CodeGenTypeCache {
   /// nonnull, if \p LHS is marked _Nonnull.
   void EmitNullabilityCheck(LValue LHS, llvm::Value *RHS, SourceLocation Loc);
+  /// An enumeration which makes it easier to specify whether or not an
+  /// operation is a subtraction.
+  enum { NotSubtraction = false, IsSubtraction = true };
+
   /// Same as IRBuilder::CreateInBoundsGEP, but additionally emits a check to
   /// detect undefined behavior when the pointer overflow sanitizer is enabled.
   /// \p SignedIndices indicates whether any of the GEP indices are signed.
+  /// \p IsSubtraction indicates whether the expression used to form the GEP
+  /// is a subtraction.
   llvm::Value *EmitCheckedInBoundsGEP(llvm::Value *Ptr,
                                       ArrayRef<llvm::Value *> IdxList,
                                       bool SignedIndices,
+                                      bool IsSubtraction,
                                       SourceLocation Loc,
                                       const Twine &Name = "");
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 4b15b8ac4c71..5561d4520cc8 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -4499,18 +4499,19 @@ void CodeGenModule::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
     // Make a copy of the features as passed on the command line into the
     // beginning of the additional features from the function to override.
-    ParsedAttr.first.insert(ParsedAttr.first.begin(),
+    ParsedAttr.Features.insert(ParsedAttr.Features.begin(),
                             Target.getTargetOpts().FeaturesAsWritten.begin(),
                             Target.getTargetOpts().FeaturesAsWritten.end());
-    if (ParsedAttr.second != "")
-      TargetCPU = ParsedAttr.second;
+    if (ParsedAttr.Architecture != "")
+      TargetCPU = ParsedAttr.Architecture;
     // Now populate the feature map, first with the TargetCPU which is either
     // the default or a new one from the target attribute string. Then we'll use
     // the passed in features (FeaturesAsWritten) along with the new ones from
     // the attribute.
-    Target.initFeatureMap(FeatureMap, getDiags(), TargetCPU, ParsedAttr.first);
+    Target.initFeatureMap(FeatureMap, getDiags(), TargetCPU,
+                          ParsedAttr.Features);
   } else {
     Target.initFeatureMap(FeatureMap, getDiags(), TargetCPU,
                           Target.getTargetOpts().Features);
diff --git a/lib/CodeGen/MacroPPCallbacks.cpp b/lib/CodeGen/MacroPPCallbacks.cpp
index 6a31dfe53d64..a6f21d8ddcfb 100644
--- a/lib/CodeGen/MacroPPCallbacks.cpp
+++ b/lib/CodeGen/MacroPPCallbacks.cpp
@@ -26,8 +26,8 @@ void MacroPPCallbacks::writeMacroDefinition(const IdentifierInfo &II,
   if (MI.isFunctionLike()) {
     Name << '(';
-    if (!MI.arg_empty()) {
-      MacroInfo::arg_iterator AI = MI.arg_begin(), E = MI.arg_end();
+    if (!MI.param_empty()) {
+      MacroInfo::param_iterator AI = MI.param_begin(), E = MI.param_end();
       for (; AI + 1 != E; ++AI) {
         Name << (*AI)->getName();
         Name << ',';
diff --git a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
index 37ecc05aa1ee..d0760b9cc2a6 100644
--- a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
+++ b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp
@@ -152,6 +152,9 @@ class PCHContainerGenerator : public ASTConsumer {
     CodeGenOpts.CodeModel = "default";
     CodeGenOpts.ThreadModel = "single";
     CodeGenOpts.DebugTypeExtRefs = true;
+    // When building a module, MainFileName is the name of the modulemap file.
+    CodeGenOpts.MainFileName =
+        LangOpts.CurrentModule.empty() ?
                                       MainFileName : LangOpts.CurrentModule;
     CodeGenOpts.setDebugInfo(codegenoptions::FullDebugInfo);
     CodeGenOpts.setDebuggerTuning(CI.getCodeGenOpts().getDebuggerTuning());
   }
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index eeebd60a2d20..c17828974e92 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -4785,7 +4785,8 @@ class AArch64ABIInfo : public SwiftABIInfo {
 public:
   enum ABIKind {
     AAPCS = 0,
-    DarwinPCS
+    DarwinPCS,
+    Win64
   };
 private:
@@ -4823,10 +4824,14 @@ class AArch64ABIInfo : public SwiftABIInfo {
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                     QualType Ty) const override {
-    return isDarwinPCS() ? EmitDarwinVAArg(VAListAddr, Ty, CGF)
-                         : EmitAAPCSVAArg(VAListAddr, Ty, CGF);
+    return Kind == Win64 ? EmitMSVAArg(CGF, VAListAddr, Ty)
+           : isDarwinPCS() ? EmitDarwinVAArg(VAListAddr, Ty, CGF)
+                           : EmitAAPCSVAArg(VAListAddr, Ty, CGF);
   }
+  Address EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                      QualType Ty) const override;
+
   bool shouldPassIndirectlyForSwift(CharUnits totalSize,
                                     ArrayRef<llvm::Type *> scalars,
                                     bool asReturnValue) const override {
@@ -5332,6 +5337,14 @@ Address AArch64ABIInfo::EmitDarwinVAArg(Address VAListAddr, QualType Ty,
                           TyInfo, SlotSize, /*AllowHigherAlign*/ true);
 }
+Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
+                                    QualType Ty) const {
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*indirect*/ false,
+                          CGF.getContext().getTypeInfoInChars(Ty),
+                          CharUnits::fromQuantity(8),
+                          /*allowHigherAlign*/ false);
+}
+
 //===----------------------------------------------------------------------===//
 // ARM ABI Implementation
 //===----------------------------------------------------------------------===//
@@ -8494,6 +8507,8 @@ const TargetCodeGenInfo &CodeGenModule::getTargetCodeGenInfo() {
     AArch64ABIInfo::ABIKind Kind = AArch64ABIInfo::AAPCS;
     if (getTarget().getABI() == "darwinpcs")
       Kind = AArch64ABIInfo::DarwinPCS;
+    else if (Triple.isOSWindows())
+      Kind = AArch64ABIInfo::Win64;
     return SetCGInfo(new AArch64TargetCodeGenInfo(Types, Kind));
   }
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
index 42478013ccec..1d35d6e78cca 100644
--- a/lib/Driver/Driver.cpp
+++ b/lib/Driver/Driver.cpp
@@ -1275,6 +1275,13 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     // we were requested to print out all option names that start with "-foo".
     // For example, "--autocomplete=-fsyn" is expanded to "-fsyntax-only".
     SuggestedCompletions = Opts->findByPrefix(PassedFlags, DisableFlags);
+
+    // We have to query the -W flags manually as they're not in the OptTable.
+    // TODO: Find a good way to add them to OptTable instead and then remove
+    // this code.
+    for (StringRef S : DiagnosticIDs::getDiagnosticFlags())
+      if (S.startswith(PassedFlags))
+        SuggestedCompletions.push_back(S);
   } else {
     // If the flag is in the form of "--autocomplete=foo,bar", we were
     // requested to print out all option values for "-foo" that start with
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index e759e3ec619a..b82cc2d4fa5d 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -2070,10 +2070,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (D.isUsingLTO()) {
     Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ);
-    // The Darwin linker currently uses the legacy LTO API, which does not
-    // support LTO unit features (CFI, whole program vtable opt) under
-    // ThinLTO.
- if (!getToolChain().getTriple().isOSDarwin() || + // The Darwin and PS4 linkers currently use the legacy LTO API, which + // does not support LTO unit features (CFI, whole program vtable opt) + // under ThinLTO. + if (!(getToolChain().getTriple().isOSDarwin() || + getToolChain().getTriple().isPS4()) || D.getLTOMode() == LTOK_Full) CmdArgs.push_back("-flto-unit"); } @@ -3200,9 +3201,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_femit_all_decls); Args.AddLastArg(CmdArgs, options::OPT_fheinous_gnu_extensions); Args.AddLastArg(CmdArgs, options::OPT_fno_operator_names); - // Emulated TLS is enabled by default on Android, and can be enabled manually - // with -femulated-tls. - bool EmulatedTLSDefault = Triple.isAndroid() || Triple.isWindowsCygwinEnvironment(); + // Emulated TLS is enabled by default on Android and OpenBSD, and can be enabled + // manually with -femulated-tls. + bool EmulatedTLSDefault = Triple.isAndroid() || Triple.isOSOpenBSD() || + Triple.isWindowsCygwinEnvironment(); if (Args.hasFlag(options::OPT_femulated_tls, options::OPT_fno_emulated_tls, EmulatedTLSDefault)) CmdArgs.push_back("-femulated-tls"); diff --git a/lib/Driver/ToolChains/Darwin.cpp b/lib/Driver/ToolChains/Darwin.cpp index ba1a5ee95594..0d63858f2cd4 100644 --- a/lib/Driver/ToolChains/Darwin.cpp +++ b/lib/Driver/ToolChains/Darwin.cpp @@ -1262,28 +1262,58 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const { } } - // If no OSX or iOS target has been specified, try to guess platform - // from arch name and compute the version from the triple. + // If no OS targets have been specified, try to guess platform from -target + // or arch name and compute the version from the triple. if (OSXTarget.empty() && iOSTarget.empty() && TvOSTarget.empty() && WatchOSTarget.empty()) { - StringRef MachOArchName = getMachOArchName(Args); - unsigned Major, Minor, Micro; - if (MachOArchName == "armv7" || MachOArchName == "armv7s" || - MachOArchName == "arm64") { - getTriple().getiOSVersion(Major, Minor, Micro); - llvm::raw_string_ostream(iOSTarget) << Major << '.' << Minor << '.' - << Micro; - } else if (MachOArchName == "armv7k") { - getTriple().getWatchOSVersion(Major, Minor, Micro); - llvm::raw_string_ostream(WatchOSTarget) << Major << '.' << Minor << '.' - << Micro; - } else if (MachOArchName != "armv6m" && MachOArchName != "armv7m" && - MachOArchName != "armv7em") { - if (!getTriple().getMacOSXVersion(Major, Minor, Micro)) { - getDriver().Diag(diag::err_drv_invalid_darwin_version) - << getTriple().getOSName(); + llvm::Triple::OSType OSTy = llvm::Triple::UnknownOS; + + // Set the OSTy based on -target if -arch isn't present. 
+ if (Args.hasArg(options::OPT_target) && !Args.hasArg(options::OPT_arch)) { + OSTy = getTriple().getOS(); + } else { + StringRef MachOArchName = getMachOArchName(Args); + if (MachOArchName == "armv7" || MachOArchName == "armv7s" || + MachOArchName == "arm64") + OSTy = llvm::Triple::IOS; + else if (MachOArchName == "armv7k") + OSTy = llvm::Triple::WatchOS; + else if (MachOArchName != "armv6m" && MachOArchName != "armv7m" && + MachOArchName != "armv7em") + OSTy = llvm::Triple::MacOSX; + } + + + if (OSTy != llvm::Triple::UnknownOS) { + unsigned Major, Minor, Micro; + std::string *OSTarget; + + switch (OSTy) { + case llvm::Triple::Darwin: + case llvm::Triple::MacOSX: + if (!getTriple().getMacOSXVersion(Major, Minor, Micro)) + getDriver().Diag(diag::err_drv_invalid_darwin_version) + << getTriple().getOSName(); + OSTarget = &OSXTarget; + break; + case llvm::Triple::IOS: + getTriple().getiOSVersion(Major, Minor, Micro); + OSTarget = &iOSTarget; + break; + case llvm::Triple::TvOS: + getTriple().getOSVersion(Major, Minor, Micro); + OSTarget = &TvOSTarget; + break; + case llvm::Triple::WatchOS: + getTriple().getWatchOSVersion(Major, Minor, Micro); + OSTarget = &WatchOSTarget; + break; + default: + llvm_unreachable("Unexpected OS type"); + break; } - llvm::raw_string_ostream(OSXTarget) << Major << '.' << Minor << '.' + + llvm::raw_string_ostream(*OSTarget) << Major << '.' << Minor << '.' << Micro; } } diff --git a/lib/Driver/ToolChains/Fuchsia.cpp b/lib/Driver/ToolChains/Fuchsia.cpp index d8b8fe8f0bfe..78053aafd16e 100644 --- a/lib/Driver/ToolChains/Fuchsia.cpp +++ b/lib/Driver/ToolChains/Fuchsia.cpp @@ -46,6 +46,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (llvm::sys::path::stem(Exec).equals_lower("lld")) { CmdArgs.push_back("-flavor"); CmdArgs.push_back("gnu"); + + CmdArgs.push_back("-z"); + CmdArgs.push_back("rodynamic"); } if (!D.SysRoot.empty()) diff --git a/lib/Driver/ToolChains/Gnu.cpp b/lib/Driver/ToolChains/Gnu.cpp index ad5f7df50d2e..bc26ee1de46d 100644 --- a/lib/Driver/ToolChains/Gnu.cpp +++ b/lib/Driver/ToolChains/Gnu.cpp @@ -2471,7 +2471,8 @@ void Generic_ELF::addClangTargetOptions(const ArgList &DriverArgs, (!V.isOlderThan(4, 7, 0) || getTriple().isAndroid())) || getTriple().getOS() == llvm::Triple::NaCl || (getTriple().getVendor() == llvm::Triple::MipsTechnologies && - !getTriple().hasEnvironment()); + !getTriple().hasEnvironment()) || + getTriple().getOS() == llvm::Triple::Solaris; if (DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, UseInitArrayDefault)) diff --git a/lib/Driver/ToolChains/Solaris.cpp b/lib/Driver/ToolChains/Solaris.cpp index 78797c49d7b6..de98d11b2dc7 100644 --- a/lib/Driver/ToolChains/Solaris.cpp +++ b/lib/Driver/ToolChains/Solaris.cpp @@ -126,7 +126,7 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, Solaris::Solaris(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : Generic_GCC(D, Triple, Args) { + : Generic_ELF(D, Triple, Args) { GCCInstallation.init(Triple, Args); diff --git a/lib/Driver/ToolChains/Solaris.h b/lib/Driver/ToolChains/Solaris.h index edb44373b31d..787917afab6e 100644 --- a/lib/Driver/ToolChains/Solaris.h +++ b/lib/Driver/ToolChains/Solaris.h @@ -50,7 +50,7 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY Solaris : public Generic_GCC { +class LLVM_LIBRARY_VISIBILITY Solaris : public Generic_ELF { public: Solaris(const Driver &D, const llvm::Triple &Triple, const 
llvm::opt::ArgList &Args); diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp index c6e90a9175e1..46ea06b880ed 100644 --- a/lib/Format/TokenAnnotator.cpp +++ b/lib/Format/TokenAnnotator.cpp @@ -1383,7 +1383,8 @@ class AnnotatingParser { if (PrevToken->isOneOf(tok::l_paren, tok::l_square, tok::l_brace, tok::comma, tok::semi, tok::kw_return, tok::colon, - tok::equal, tok::kw_delete, tok::kw_sizeof) || + tok::equal, tok::kw_delete, tok::kw_sizeof, + tok::kw_throw) || PrevToken->isOneOf(TT_BinaryOperator, TT_ConditionalExpr, TT_UnaryOperator, TT_CastRParen)) return TT_UnaryOperator; diff --git a/lib/Format/UnwrappedLineParser.cpp b/lib/Format/UnwrappedLineParser.cpp index 4b57919d1929..faac5a371c26 100644 --- a/lib/Format/UnwrappedLineParser.cpp +++ b/lib/Format/UnwrappedLineParser.cpp @@ -747,7 +747,7 @@ static bool mustBeJSIdent(const AdditionalKeywords &Keywords, Keywords.kw_let, Keywords.kw_var, tok::kw_const, Keywords.kw_abstract, Keywords.kw_extends, Keywords.kw_implements, Keywords.kw_instanceof, Keywords.kw_interface, - Keywords.kw_throws)); + Keywords.kw_throws, Keywords.kw_from)); } static bool mustBeJSIdentOrValue(const AdditionalKeywords &Keywords, diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index 00f6b9b46f03..b2c14554a4b5 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -1683,6 +1683,7 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK, Opts.CPlusPlus11 = Std.isCPlusPlus11(); Opts.CPlusPlus14 = Std.isCPlusPlus14(); Opts.CPlusPlus1z = Std.isCPlusPlus1z(); + Opts.CPlusPlus2a = Std.isCPlusPlus2a(); Opts.Digraphs = Std.hasDigraphs(); Opts.GNUMode = Std.isGNUMode(); Opts.GNUInline = !Opts.C99 && !Opts.CPlusPlus; diff --git a/lib/Frontend/InitPreprocessor.cpp b/lib/Frontend/InitPreprocessor.cpp index 71420df00025..64128dfdf534 100644 --- a/lib/Frontend/InitPreprocessor.cpp +++ b/lib/Frontend/InitPreprocessor.cpp @@ -374,10 +374,13 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, else if (!LangOpts.GNUMode && LangOpts.Digraphs) Builder.defineMacro("__STDC_VERSION__", "199409L"); } else { + // FIXME: Use correct value for C++20. + if (LangOpts.CPlusPlus2a) + Builder.defineMacro("__cplusplus", "201707L"); // C++17 [cpp.predefined]p1: // The name __cplusplus is defined to the value 201703L when compiling a // C++ translation unit. 
- if (LangOpts.CPlusPlus1z) + else if (LangOpts.CPlusPlus1z) Builder.defineMacro("__cplusplus", "201703L"); // C++1y [cpp.predefined]p1: // The name __cplusplus is defined to the value 201402L when compiling a diff --git a/lib/Frontend/PrintPreprocessedOutput.cpp b/lib/Frontend/PrintPreprocessedOutput.cpp index 5336de1f7468..914039ad5bb1 100644 --- a/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/lib/Frontend/PrintPreprocessedOutput.cpp @@ -38,8 +38,8 @@ static void PrintMacroDefinition(const IdentifierInfo &II, const MacroInfo &MI, if (MI.isFunctionLike()) { OS << '('; - if (!MI.arg_empty()) { - MacroInfo::arg_iterator AI = MI.arg_begin(), E = MI.arg_end(); + if (!MI.param_empty()) { + MacroInfo::param_iterator AI = MI.param_begin(), E = MI.param_end(); for (; AI+1 != E; ++AI) { OS << (*AI)->getName(); OS << ','; diff --git a/lib/Frontend/Rewrite/FrontendActions.cpp b/lib/Frontend/Rewrite/FrontendActions.cpp index e93f737c47fd..5efa6aeaf760 100644 --- a/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/lib/Frontend/Rewrite/FrontendActions.cpp @@ -10,6 +10,7 @@ #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/AST/ASTConsumer.h" #include "clang/Basic/CharInfo.h" +#include "clang/Config/config.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Frontend/FrontendDiagnostic.h" diff --git a/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/lib/Frontend/Rewrite/RewriteModernObjC.cpp index 38be684cec86..21686b8c78ea 100644 --- a/lib/Frontend/Rewrite/RewriteModernObjC.cpp +++ b/lib/Frontend/Rewrite/RewriteModernObjC.cpp @@ -21,6 +21,7 @@ #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Config/config.h" #include "clang/Lex/Lexer.h" #include "clang/Rewrite/Core/Rewriter.h" #include "llvm/ADT/DenseSet.h" diff --git a/lib/Frontend/Rewrite/RewriteObjC.cpp b/lib/Frontend/Rewrite/RewriteObjC.cpp index 5a1e001d65b8..e0d813df70f8 100644 --- a/lib/Frontend/Rewrite/RewriteObjC.cpp +++ b/lib/Frontend/Rewrite/RewriteObjC.cpp @@ -20,6 +20,7 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/SourceManager.h" +#include "clang/Config/config.h" #include "clang/Lex/Lexer.h" #include "clang/Rewrite/Core/Rewriter.h" #include "llvm/ADT/DenseSet.h" diff --git a/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/lib/FrontendTool/ExecuteCompilerInvocation.cpp index a7c140188b35..166631558806 100644 --- a/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -15,6 +15,7 @@ #include "clang/FrontendTool/Utils.h" #include "clang/ARCMigrate/ARCMTActions.h" #include "clang/CodeGen/CodeGenAction.h" +#include "clang/Config/config.h" #include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" diff --git a/lib/Headers/vecintrin.h b/lib/Headers/vecintrin.h index ca7acb4731f9..f7061e88949f 100644 --- a/lib/Headers/vecintrin.h +++ b/lib/Headers/vecintrin.h @@ -116,6 +116,13 @@ vec_extract(vector unsigned long long __vec, int __index) { return __vec[__index & 1]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai float +vec_extract(vector float __vec, int __index) { + return __vec[__index & 3]; +} +#endif + static inline __ATTRS_o_ai double vec_extract(vector double __vec, int __index) { return __vec[__index & 1]; @@ -129,6 +136,7 @@ vec_insert(signed char __scalar, vector signed char __vec, int __index) { return __vec; } +// 
This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_insert(unsigned char __scalar, vector bool char __vec, int __index) { vector unsigned char __newvec = (vector unsigned char)__vec; @@ -148,6 +156,7 @@ vec_insert(signed short __scalar, vector signed short __vec, int __index) { return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_insert(unsigned short __scalar, vector bool short __vec, int __index) { vector unsigned short __newvec = (vector unsigned short)__vec; @@ -167,6 +176,7 @@ vec_insert(signed int __scalar, vector signed int __vec, int __index) { return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_insert(unsigned int __scalar, vector bool int __vec, int __index) { vector unsigned int __newvec = (vector unsigned int)__vec; @@ -187,6 +197,7 @@ vec_insert(signed long long __scalar, vector signed long long __vec, return __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_insert(unsigned long long __scalar, vector bool long long __vec, int __index) { @@ -202,6 +213,14 @@ vec_insert(unsigned long long __scalar, vector unsigned long long __vec, return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_insert(float __scalar, vector float __vec, int __index) { + __vec[__index & 3] = __scalar; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_insert(double __scalar, vector double __vec, int __index) { __vec[__index & 1] = __scalar; return __vec; @@ -282,6 +301,16 @@ vec_promote(unsigned long long __scalar, int __index) { return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_promote(float __scalar, int __index) { + const vector float __zero = (vector float)0; + vector float __vec = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1); + __vec[__index & 3] = __scalar; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_promote(double __scalar, int __index) { const vector double __zero = (vector double)0; @@ -348,6 +377,15 @@ vec_insert_and_zero(const unsigned long long *__ptr) { return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_insert_and_zero(const float *__ptr) { + vector float __vec = (vector float)0; + __vec[0] = *__ptr; + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_insert_and_zero(const double *__ptr) { vector double __vec = (vector double)0; @@ -441,6 +479,15 @@ vec_perm(vector bool long long __a, vector bool long long __b, (vector unsigned char)__a, (vector unsigned char)__b, __c); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_perm(vector float __a, vector float __b, + vector unsigned char __c) { + return (vector float)__builtin_s390_vperm( + (vector unsigned char)__a, (vector unsigned char)__b, __c); +} +#endif + static inline __ATTRS_o_ai vector double vec_perm(vector double __a, vector double __b, vector unsigned char __c) { @@ -450,18 +497,22 @@ vec_perm(vector double __a, vector double __b, /*-- vec_permi --------------------------------------------------------------*/ +// This prototype is deprecated. extern __ATTRS_o vector signed long long vec_permi(vector signed long long __a, vector signed long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. 
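// (Editorial sketch, not part of the header: the new float overloads mirror
// the double ones, but vector float has four lanes, so indices wrap modulo
// 4 via the "__index & 3" mask. Assumes -mzvector and __ARCH__ >= 12, i.e.
// z14; vec_splats(float) is added later in this patch:)
//
//   vector float __v = vec_splats(0.0f);
//   __v = vec_insert(3.5f, __v, 2);    // writes lane 2 (__index & 3)
//   float __f = vec_extract(__v, 2);   // reads back 3.5f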
extern __ATTRS_o vector unsigned long long vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector bool long long vec_permi(vector bool long long __a, vector bool long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector double vec_permi(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 3); @@ -471,6 +522,15 @@ vec_permi(vector double __a, vector double __b, int __c) (vector unsigned long long)(Y), \ (((Z) & 2) << 1) | ((Z) & 1))) +/*-- vec_bperm_u128 ---------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector unsigned long long +vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) { + return __builtin_s390_vbperm(__a, __b); +} +#endif + /*-- vec_sel ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -614,6 +674,22 @@ vec_sel(vector unsigned long long __a, vector unsigned long long __b, (~(vector unsigned long long)__c & __a)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_sel(vector float __a, vector float __b, vector unsigned int __c) { + return (vector float)((__c & (vector unsigned int)__b) | + (~__c & (vector unsigned int)__a)); +} + +static inline __ATTRS_o_ai vector float +vec_sel(vector float __a, vector float __b, vector bool int __c) { + vector unsigned int __ac = (vector unsigned int)__a; + vector unsigned int __bc = (vector unsigned int)__b; + vector unsigned int __cc = (vector unsigned int)__c; + return (vector float)((__cc & __bc) | (~__cc & __ac)); +} +#endif + static inline __ATTRS_o_ai vector double vec_sel(vector double __a, vector double __b, vector unsigned long long __c) { return (vector double)((__c & (vector unsigned long long)__b) | @@ -687,6 +763,17 @@ vec_gather_element(vector unsigned long long __vec, return __vec; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_gather_element(vector float __vec, vector unsigned int __offset, + const float *__ptr, int __index) + __constant_range(__index, 0, 3) { + __vec[__index] = *(const float *)( + (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]); + return __vec; +} +#endif + static inline __ATTRS_o_ai vector double vec_gather_element(vector double __vec, vector unsigned long long __offset, const double *__ptr, int __index) @@ -749,6 +836,16 @@ vec_scatter_element(vector unsigned long long __vec, __vec[__index]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_scatter_element(vector float __vec, vector unsigned int __offset, + float *__ptr, int __index) + __constant_range(__index, 0, 3) { + *(float *)((__INTPTR_TYPE__)__ptr + __offset[__index]) = + __vec[__index]; +} +#endif + static inline __ATTRS_o_ai void vec_scatter_element(vector double __vec, vector unsigned long long __offset, double *__ptr, int __index) @@ -757,48 +854,111 @@ vec_scatter_element(vector double __vec, vector unsigned long long __offset, __vec[__index]; } +/*-- vec_xl -----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed char +vec_xl(long __offset, const signed char *__ptr) { + return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_xl(long __offset, const unsigned char *__ptr) { + return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + 
__offset); +} + +static inline __ATTRS_o_ai vector signed short +vec_xl(long __offset, const signed short *__ptr) { + return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_xl(long __offset, const unsigned short *__ptr) { + return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed int +vec_xl(long __offset, const signed int *__ptr) { + return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_xl(long __offset, const unsigned int *__ptr) { + return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector signed long long +vec_xl(long __offset, const signed long long *__ptr) { + return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_xl(long __offset, const unsigned long long *__ptr) { + return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset); +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_xl(long __offset, const float *__ptr) { + return *(const vector float *)((__INTPTR_TYPE__)__ptr + __offset); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_xl(long __offset, const double *__ptr) { + return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset); +} + /*-- vec_xld2 ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_xld2(long __offset, const signed char *__ptr) { return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_xld2(long __offset, const unsigned char *__ptr) { return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_xld2(long __offset, const signed short *__ptr) { return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_xld2(long __offset, const unsigned short *__ptr) { return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_xld2(long __offset, const signed int *__ptr) { return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_xld2(long __offset, const unsigned int *__ptr) { return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_xld2(long __offset, const signed long long *__ptr) { return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_xld2(long __offset, const unsigned long long *__ptr) { return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector double vec_xld2(long __offset, const double *__ptr) { return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset); @@ -806,74 +966,145 @@ vec_xld2(long __offset, const double *__ptr) { /*-- vec_xlw4 ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_xlw4(long __offset, const signed char *__ptr) { return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_xlw4(long __offset, const unsigned char *__ptr) { return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_xlw4(long __offset, const signed short *__ptr) { return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_xlw4(long __offset, const unsigned short *__ptr) { return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_xlw4(long __offset, const signed int *__ptr) { return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_xlw4(long __offset, const unsigned int *__ptr) { return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset); } +/*-- vec_xst ----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai void +vec_xst(vector signed char __vec, long __offset, signed char *__ptr) { + *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned char __vec, long __offset, unsigned char *__ptr) { + *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed short __vec, long __offset, signed short *__ptr) { + *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned short __vec, long __offset, unsigned short *__ptr) { + *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed int __vec, long __offset, signed int *__ptr) { + *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned int __vec, long __offset, unsigned int *__ptr) { + *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector signed long long __vec, long __offset, + signed long long *__ptr) { + *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + +static inline __ATTRS_o_ai void +vec_xst(vector unsigned long long __vec, long __offset, + unsigned long long *__ptr) { + *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) = + __vec; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_xst(vector float __vec, long __offset, float *__ptr) { + *(vector float *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} +#endif + +static inline __ATTRS_o_ai void +vec_xst(vector double __vec, long __offset, double *__ptr) { + *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; +} + /*-- vec_xstd2 
--------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) { *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) { *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) { *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) { *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) { *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) { *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector signed long long __vec, long __offset, signed long long *__ptr) { *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector unsigned long long __vec, long __offset, unsigned long long *__ptr) { @@ -881,6 +1112,7 @@ vec_xstd2(vector unsigned long long __vec, long __offset, __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstd2(vector double __vec, long __offset, double *__ptr) { *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; @@ -888,31 +1120,37 @@ vec_xstd2(vector double __vec, long __offset, double *__ptr) { /*-- vec_xstw4 --------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) { *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) { *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) { *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) { *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) { *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai void vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) { *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec; @@ -952,6 +1190,12 @@ extern __ATTRS_o vector unsigned long long vec_load_bndry(const unsigned long long *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); +#if __ARCH__ >= 12 +extern __ATTRS_o vector float +vec_load_bndry(const float *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); +#endif + extern __ATTRS_o vector double vec_load_bndry(const double *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); @@ -1007,11 +1251,27 @@ vec_load_len(const unsigned long long *__ptr, unsigned int __len) { return (vector unsigned long long)__builtin_s390_vll(__len, __ptr); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_load_len(const float *__ptr, unsigned int __len) { + return (vector float)__builtin_s390_vll(__len, __ptr); +} +#endif + static inline __ATTRS_o_ai vector double vec_load_len(const double *__ptr, unsigned int __len) { return (vector double)__builtin_s390_vll(__len, __ptr); } +/*-- vec_load_len_r ---------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector unsigned char +vec_load_len_r(const unsigned char *__ptr, unsigned int __len) { + return (vector unsigned char)__builtin_s390_vlrl(__len, __ptr); +} +#endif + /*-- vec_store_len ----------------------------------------------------------*/ static inline __ATTRS_o_ai void @@ -1062,12 +1322,30 @@ vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr, __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai void +vec_store_len(vector float __vec, float *__ptr, + unsigned int __len) { + __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); +} +#endif + static inline __ATTRS_o_ai void vec_store_len(vector double __vec, double *__ptr, unsigned int __len) { __builtin_s390_vstl((vector signed char)__vec, __len, __ptr); } +/*-- vec_store_len_r --------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai void +vec_store_len_r(vector unsigned char __vec, unsigned char *__ptr, + unsigned int __len) { + __builtin_s390_vstrl((vector signed char)__vec, __len, __ptr); +} +#endif + /*-- vec_load_pair ----------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed long long @@ -1232,6 +1510,14 @@ vec_splat(vector unsigned long long __vec, int __index) return (vector unsigned long long)__vec[__index]; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_splat(vector float __vec, int __index) + __constant_range(__index, 0, 3) { + return (vector float)__vec[__index]; +} +#endif + static inline __ATTRS_o_ai vector double vec_splat(vector double __vec, int __index) __constant_range(__index, 0, 1) { @@ -1332,6 +1618,13 @@ vec_splats(unsigned long long __scalar) { return (vector unsigned long long)__scalar; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_splats(float __scalar) { + return (vector float)__scalar; +} +#endif + static inline __ATTRS_o_ai vector double vec_splats(double __scalar) { return (vector double)__scalar; @@ -1425,6 +1718,13 @@ vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) { return (vector unsigned long long)(__a[0], __b[0]); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_mergeh(vector float 
__a, vector float __b) { + return (vector float)(__a[0], __b[0], __a[1], __b[1]); +} +#endif + static inline __ATTRS_o_ai vector double vec_mergeh(vector double __a, vector double __b) { return (vector double)(__a[0], __b[0]); @@ -1501,6 +1801,13 @@ vec_mergel(vector unsigned long long __a, vector unsigned long long __b) { return (vector unsigned long long)(__a[1], __b[1]); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_mergel(vector float __a, vector float __b) { + return (vector float)(__a[2], __b[2], __a[3], __b[3]); +} +#endif + static inline __ATTRS_o_ai vector double vec_mergel(vector double __a, vector double __b) { return (vector double)(__a[1], __b[1]); @@ -1866,6 +2173,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a == __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpeq(vector float __a, vector float __b) { + return (vector bool int)(__a == __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpeq(vector double __a, vector double __b) { return (vector bool long long)(__a == __b); @@ -1913,6 +2227,13 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a >= __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpge(vector float __a, vector float __b) { + return (vector bool int)(__a >= __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpge(vector double __a, vector double __b) { return (vector bool long long)(__a >= __b); @@ -1960,6 +2281,13 @@ vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a > __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmpgt(vector float __a, vector float __b) { + return (vector bool int)(__a > __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmpgt(vector double __a, vector double __b) { return (vector bool long long)(__a > __b); @@ -2007,6 +2335,13 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a <= __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmple(vector float __a, vector float __b) { + return (vector bool int)(__a <= __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmple(vector double __a, vector double __b) { return (vector bool long long)(__a <= __b); @@ -2054,6 +2389,13 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) { return (vector bool long long)(__a < __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector bool int +vec_cmplt(vector float __a, vector float __b) { + return (vector bool int)(__a < __b); +} +#endif + static inline __ATTRS_o_ai vector bool long long vec_cmplt(vector double __a, vector double __b) { return (vector bool long long)(__a < __b); @@ -2068,6 +2410,7 @@ vec_all_eq(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed char __a, vector bool char __b) { int __cc; @@ -2075,6 +2418,7 @@ vec_all_eq(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. 
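// (Editorial sketch: the element-wise vec_cmp* overloads added above return
// a per-lane mask, while the vec_all_*/vec_any_* predicates below inspect
// the condition code set by the compare instruction: cc 0 = all lanes
// matched, cc 3 = none, cc 1/2 = mixed. Assumes __ARCH__ >= 12:)
//
//   vector float __a = vec_splats(1.0f), __b = vec_splats(1.0f);
//   vector bool int __m = vec_cmpeq(__a, __b);  // all-ones mask
//   int __all = vec_all_eq(__a, __b);           // 1 iff cc == 0
//   int __any = vec_any_eq(__a, __b);           // 1 iff cc <= 1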
static inline __ATTRS_o_ai int vec_all_eq(vector bool char __a, vector signed char __b) { int __cc; @@ -2090,6 +2434,7 @@ vec_all_eq(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2098,6 +2443,7 @@ vec_all_eq(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2121,6 +2467,7 @@ vec_all_eq(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed short __a, vector bool short __b) { int __cc; @@ -2128,6 +2475,7 @@ vec_all_eq(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool short __a, vector signed short __b) { int __cc; @@ -2143,6 +2491,7 @@ vec_all_eq(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2151,6 +2500,7 @@ vec_all_eq(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2174,6 +2524,7 @@ vec_all_eq(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed int __a, vector bool int __b) { int __cc; @@ -2181,6 +2532,7 @@ vec_all_eq(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool int __a, vector signed int __b) { int __cc; @@ -2196,6 +2548,7 @@ vec_all_eq(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2204,6 +2557,7 @@ vec_all_eq(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2227,6 +2581,7 @@ vec_all_eq(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2234,6 +2589,7 @@ vec_all_eq(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2249,6 +2605,7 @@ vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_eq(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2257,6 +2614,7 @@ vec_all_eq(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_eq(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2273,6 +2631,15 @@ vec_all_eq(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_eq(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_eq(vector double __a, vector double __b) { int __cc; @@ -2289,6 +2656,7 @@ vec_all_ne(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed char __a, vector bool char __b) { int __cc; @@ -2296,6 +2664,7 @@ vec_all_ne(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool char __a, vector signed char __b) { int __cc; @@ -2311,6 +2680,7 @@ vec_all_ne(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2319,6 +2689,7 @@ vec_all_ne(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2342,6 +2713,7 @@ vec_all_ne(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed short __a, vector bool short __b) { int __cc; @@ -2349,6 +2721,7 @@ vec_all_ne(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool short __a, vector signed short __b) { int __cc; @@ -2364,6 +2737,7 @@ vec_all_ne(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2372,6 +2746,7 @@ vec_all_ne(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2395,6 +2770,7 @@ vec_all_ne(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector signed int __a, vector bool int __b) { int __cc; @@ -2402,6 +2778,7 @@ vec_all_ne(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool int __a, vector signed int __b) { int __cc; @@ -2417,6 +2794,7 @@ vec_all_ne(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2425,6 +2803,7 @@ vec_all_ne(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2448,6 +2827,7 @@ vec_all_ne(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_ne(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2455,6 +2835,7 @@ vec_all_ne(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2470,6 +2851,7 @@ vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2478,6 +2860,7 @@ vec_all_ne(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ne(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2494,6 +2877,15 @@ vec_all_ne(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ne(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + static inline __ATTRS_o_ai int vec_all_ne(vector double __a, vector double __b) { int __cc; @@ -2510,6 +2902,7 @@ vec_all_ge(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed char __a, vector bool char __b) { int __cc; @@ -2517,6 +2910,7 @@ vec_all_ge(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector signed char __b) { int __cc; @@ -2531,6 +2925,7 @@ vec_all_ge(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2538,6 +2933,7 @@ vec_all_ge(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2545,6 +2941,7 @@ vec_all_ge(vector bool char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool char __a, vector bool char __b) { int __cc; @@ -2560,6 +2957,7 @@ vec_all_ge(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed short __a, vector bool short __b) { int __cc; @@ -2567,6 +2965,7 @@ vec_all_ge(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector signed short __b) { int __cc; @@ -2581,6 +2980,7 @@ vec_all_ge(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2588,6 +2988,7 @@ vec_all_ge(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2595,6 +2996,7 @@ vec_all_ge(vector bool short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool short __a, vector bool short __b) { int __cc; @@ -2610,6 +3012,7 @@ vec_all_ge(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed int __a, vector bool int __b) { int __cc; @@ -2617,6 +3020,7 @@ vec_all_ge(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector signed int __b) { int __cc; @@ -2631,6 +3035,7 @@ vec_all_ge(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2638,6 +3043,7 @@ vec_all_ge(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2645,6 +3051,7 @@ vec_all_ge(vector bool int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool int __a, vector bool int __b) { int __cc; @@ -2660,6 +3067,7 @@ vec_all_ge(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2667,6 +3075,7 @@ vec_all_ge(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2681,6 +3090,7 @@ vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2688,6 +3098,7 @@ vec_all_ge(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2695,6 +3106,7 @@ vec_all_ge(vector bool long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_ge(vector bool long long __a, vector bool long long __b) { int __cc; @@ -2703,6 +3115,15 @@ vec_all_ge(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_ge(vector double __a, vector double __b) { int __cc; @@ -2719,6 +3140,7 @@ vec_all_gt(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_gt(vector signed char __a, vector bool char __b) { int __cc; @@ -2726,6 +3148,7 @@ vec_all_gt(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector signed char __b) { int __cc; @@ -2740,6 +3163,7 @@ vec_all_gt(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2747,6 +3171,7 @@ vec_all_gt(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2754,6 +3179,7 @@ vec_all_gt(vector bool char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool char __a, vector bool char __b) { int __cc; @@ -2769,6 +3195,7 @@ vec_all_gt(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed short __a, vector bool short __b) { int __cc; @@ -2776,6 +3203,7 @@ vec_all_gt(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector signed short __b) { int __cc; @@ -2790,6 +3218,7 @@ vec_all_gt(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -2797,6 +3226,7 @@ vec_all_gt(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -2804,6 +3234,7 @@ vec_all_gt(vector bool short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool short __a, vector bool short __b) { int __cc; @@ -2819,6 +3250,7 @@ vec_all_gt(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed int __a, vector bool int __b) { int __cc; @@ -2826,6 +3258,7 @@ vec_all_gt(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector signed int __b) { int __cc; @@ -2840,6 +3273,7 @@ vec_all_gt(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -2847,6 +3281,7 @@ vec_all_gt(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -2854,6 +3289,7 @@ vec_all_gt(vector bool int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_gt(vector bool int __a, vector bool int __b) { int __cc; @@ -2869,6 +3305,7 @@ vec_all_gt(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -2876,6 +3313,7 @@ vec_all_gt(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -2890,6 +3328,7 @@ vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -2897,6 +3336,7 @@ vec_all_gt(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -2904,6 +3344,7 @@ vec_all_gt(vector bool long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_gt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -2912,6 +3353,15 @@ vec_all_gt(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_gt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_gt(vector double __a, vector double __b) { int __cc; @@ -2928,6 +3378,7 @@ vec_all_le(vector signed char __a, vector signed char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed char __a, vector bool char __b) { int __cc; @@ -2935,6 +3386,7 @@ vec_all_le(vector signed char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector signed char __b) { int __cc; @@ -2949,6 +3401,7 @@ vec_all_le(vector unsigned char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned char __a, vector bool char __b) { int __cc; @@ -2956,6 +3409,7 @@ vec_all_le(vector unsigned char __a, vector bool char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector unsigned char __b) { int __cc; @@ -2963,6 +3417,7 @@ vec_all_le(vector bool char __a, vector unsigned char __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool char __a, vector bool char __b) { int __cc; @@ -2978,6 +3433,7 @@ vec_all_le(vector signed short __a, vector signed short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed short __a, vector bool short __b) { int __cc; @@ -2985,6 +3441,7 @@ vec_all_le(vector signed short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector signed short __b) { int __cc; @@ -2999,6 +3456,7 @@ vec_all_le(vector unsigned short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3006,6 +3464,7 @@ vec_all_le(vector unsigned short __a, vector bool short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3013,6 +3472,7 @@ vec_all_le(vector bool short __a, vector unsigned short __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool short __a, vector bool short __b) { int __cc; @@ -3028,6 +3488,7 @@ vec_all_le(vector signed int __a, vector signed int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed int __a, vector bool int __b) { int __cc; @@ -3035,6 +3496,7 @@ vec_all_le(vector signed int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector signed int __b) { int __cc; @@ -3049,6 +3511,7 @@ vec_all_le(vector unsigned int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3056,6 +3519,7 @@ vec_all_le(vector unsigned int __a, vector bool int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3063,6 +3527,7 @@ vec_all_le(vector bool int __a, vector unsigned int __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool int __a, vector bool int __b) { int __cc; @@ -3078,6 +3543,7 @@ vec_all_le(vector signed long long __a, vector signed long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3085,6 +3551,7 @@ vec_all_le(vector signed long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3099,6 +3566,7 @@ vec_all_le(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3106,6 +3574,7 @@ vec_all_le(vector unsigned long long __a, vector bool long long __b) { return __cc == 3; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3113,6 +3582,7 @@ vec_all_le(vector bool long long __a, vector unsigned long long __b) { return __cc == 3; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_all_le(vector bool long long __a, vector bool long long __b) { int __cc; @@ -3121,6 +3591,15 @@ vec_all_le(vector bool long long __a, vector bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_le(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_le(vector double __a, vector double __b) { int __cc; @@ -3137,6 +3616,7 @@ vec_all_lt(vector signed char __a, vector signed char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed char __a, vector bool char __b) { int __cc; @@ -3144,6 +3624,7 @@ vec_all_lt(vector signed char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector signed char __b) { int __cc; @@ -3158,6 +3639,7 @@ vec_all_lt(vector unsigned char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3165,6 +3647,7 @@ vec_all_lt(vector unsigned char __a, vector bool char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3172,6 +3655,7 @@ vec_all_lt(vector bool char __a, vector unsigned char __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool char __a, vector bool char __b) { int __cc; @@ -3187,6 +3671,7 @@ vec_all_lt(vector signed short __a, vector signed short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed short __a, vector bool short __b) { int __cc; @@ -3194,6 +3679,7 @@ vec_all_lt(vector signed short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector signed short __b) { int __cc; @@ -3208,6 +3694,7 @@ vec_all_lt(vector unsigned short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3215,6 +3702,7 @@ vec_all_lt(vector unsigned short __a, vector bool short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3222,6 +3710,7 @@ vec_all_lt(vector bool short __a, vector unsigned short __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool short __a, vector bool short __b) { int __cc; @@ -3237,6 +3726,7 @@ vec_all_lt(vector signed int __a, vector signed int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed int __a, vector bool int __b) { int __cc; @@ -3244,6 +3734,7 @@ vec_all_lt(vector signed int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector signed int __b) { int __cc; @@ -3258,6 +3749,7 @@ vec_all_lt(vector unsigned int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. 
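// (Editorial note: there is no separate low-or-equal compare builtin; the
// float vec_all_le above swaps its operands into vfchesbs(__b, __a), and
// the vec_all_lt variants do the same with vfchsbs. Sketch:)
//
//   vector float __a = vec_splats(1.0f), __b = vec_splats(2.0f);
//   int __le = vec_all_le(__a, __b);  // 1: vfchesbs(__b, __a) sets cc 0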
static inline __ATTRS_o_ai int vec_all_lt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3265,6 +3757,7 @@ vec_all_lt(vector unsigned int __a, vector bool int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3272,6 +3765,7 @@ vec_all_lt(vector bool int __a, vector unsigned int __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool int __a, vector bool int __b) { int __cc; @@ -3287,6 +3781,7 @@ vec_all_lt(vector signed long long __a, vector signed long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3294,6 +3789,7 @@ vec_all_lt(vector signed long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3308,6 +3804,7 @@ vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3315,6 +3812,7 @@ vec_all_lt(vector unsigned long long __a, vector bool long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3322,6 +3820,7 @@ vec_all_lt(vector bool long long __a, vector unsigned long long __b) { return __cc == 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_all_lt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -3330,6 +3829,15 @@ vec_all_lt(vector bool long long __a, vector bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_lt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc == 0; +} +#endif + static inline __ATTRS_o_ai int vec_all_lt(vector double __a, vector double __b) { int __cc; @@ -3339,7 +3847,16 @@ vec_all_lt(vector double __a, vector double __b) { /*-- vec_all_nge ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nge(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__a, __b, &__cc); @@ -3348,7 +3865,16 @@ vec_all_nge(vector double __a, vector double __b) { /*-- vec_all_ngt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_ngt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_ngt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__a, __b, &__cc); @@ -3357,7 +3883,16 @@ vec_all_ngt(vector double __a, vector double __b) { /*-- vec_all_nle ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nle(vector float __a, vector float 
__b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nle(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__b, __a, &__cc); @@ -3366,7 +3901,16 @@ vec_all_nle(vector double __a, vector double __b) { /*-- vec_all_nlt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nlt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nlt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__b, __a, &__cc); @@ -3375,7 +3919,16 @@ vec_all_nlt(vector double __a, vector double __b) { /*-- vec_all_nan ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_nan(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc == 0; +} +#endif + +static inline __ATTRS_o_ai int vec_all_nan(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -3384,7 +3937,16 @@ vec_all_nan(vector double __a) { /*-- vec_all_numeric --------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_all_numeric(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc == 3; +} +#endif + +static inline __ATTRS_o_ai int vec_all_numeric(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -3400,6 +3962,7 @@ vec_any_eq(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed char __a, vector bool char __b) { int __cc; @@ -3407,6 +3970,7 @@ vec_any_eq(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool char __a, vector signed char __b) { int __cc; @@ -3422,6 +3986,7 @@ vec_any_eq(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3430,6 +3995,7 @@ vec_any_eq(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3453,6 +4019,7 @@ vec_any_eq(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed short __a, vector bool short __b) { int __cc; @@ -3460,6 +4027,7 @@ vec_any_eq(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool short __a, vector signed short __b) { int __cc; @@ -3475,6 +4043,7 @@ vec_any_eq(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3483,6 +4052,7 @@ vec_any_eq(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. 
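// (Editorial sketch: vec_all_nan/vec_all_numeric above are two readings of
// the same test-data-class instruction with mask 15, selecting all NaN
// classes: cc 0 means every lane is a NaN, cc 3 means no lane is. Assumes
// __ARCH__ >= 12 for the float variants:)
//
//   vector float __x = vec_splats(__builtin_nanf(""));
//   int __nan = vec_all_nan(__x);      // 1: every lane is NaN
//   int __num = vec_all_numeric(__x);  // 0: the lanes are NaN, not numeric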
static inline __ATTRS_o_ai int vec_any_eq(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3506,6 +4076,7 @@ vec_any_eq(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed int __a, vector bool int __b) { int __cc; @@ -3513,6 +4084,7 @@ vec_any_eq(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool int __a, vector signed int __b) { int __cc; @@ -3528,6 +4100,7 @@ vec_any_eq(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3536,6 +4109,7 @@ vec_any_eq(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3559,6 +4133,7 @@ vec_any_eq(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3566,6 +4141,7 @@ vec_any_eq(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3581,6 +4157,7 @@ vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3589,6 +4166,7 @@ vec_any_eq(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_eq(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3605,6 +4183,15 @@ vec_any_eq(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_eq(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_eq(vector double __a, vector double __b) { int __cc; @@ -3621,6 +4208,7 @@ vec_any_ne(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed char __a, vector bool char __b) { int __cc; @@ -3628,6 +4216,7 @@ vec_any_ne(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool char __a, vector signed char __b) { int __cc; @@ -3643,6 +4232,7 @@ vec_any_ne(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3651,6 +4241,7 @@ vec_any_ne(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_ne(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3674,6 +4265,7 @@ vec_any_ne(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed short __a, vector bool short __b) { int __cc; @@ -3681,6 +4273,7 @@ vec_any_ne(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool short __a, vector signed short __b) { int __cc; @@ -3696,6 +4289,7 @@ vec_any_ne(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3704,6 +4298,7 @@ vec_any_ne(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3727,6 +4322,7 @@ vec_any_ne(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed int __a, vector bool int __b) { int __cc; @@ -3734,6 +4330,7 @@ vec_any_ne(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool int __a, vector signed int __b) { int __cc; @@ -3749,6 +4346,7 @@ vec_any_ne(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3757,6 +4355,7 @@ vec_any_ne(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3780,6 +4379,7 @@ vec_any_ne(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3787,6 +4387,7 @@ vec_any_ne(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector bool long long __a, vector signed long long __b) { int __cc; @@ -3802,6 +4403,7 @@ vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ne(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -3810,6 +4412,7 @@ vec_any_ne(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_ne(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -3826,6 +4429,15 @@ vec_any_ne(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ne(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfcesbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + static inline __ATTRS_o_ai int vec_any_ne(vector double __a, vector double __b) { int __cc; @@ -3842,6 +4454,7 @@ vec_any_ge(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed char __a, vector bool char __b) { int __cc; @@ -3849,6 +4462,7 @@ vec_any_ge(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector signed char __b) { int __cc; @@ -3863,6 +4477,7 @@ vec_any_ge(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned char __a, vector bool char __b) { int __cc; @@ -3870,6 +4485,7 @@ vec_any_ge(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector unsigned char __b) { int __cc; @@ -3877,6 +4493,7 @@ vec_any_ge(vector bool char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool char __a, vector bool char __b) { int __cc; @@ -3892,6 +4509,7 @@ vec_any_ge(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed short __a, vector bool short __b) { int __cc; @@ -3899,6 +4517,7 @@ vec_any_ge(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector signed short __b) { int __cc; @@ -3913,6 +4532,7 @@ vec_any_ge(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned short __a, vector bool short __b) { int __cc; @@ -3920,6 +4540,7 @@ vec_any_ge(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector unsigned short __b) { int __cc; @@ -3927,6 +4548,7 @@ vec_any_ge(vector bool short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool short __a, vector bool short __b) { int __cc; @@ -3942,6 +4564,7 @@ vec_any_ge(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed int __a, vector bool int __b) { int __cc; @@ -3949,6 +4572,7 @@ vec_any_ge(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector signed int __b) { int __cc; @@ -3963,6 +4587,7 @@ vec_any_ge(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_ge(vector unsigned int __a, vector bool int __b) { int __cc; @@ -3970,6 +4595,7 @@ vec_any_ge(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector unsigned int __b) { int __cc; @@ -3977,6 +4603,7 @@ vec_any_ge(vector bool int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool int __a, vector bool int __b) { int __cc; @@ -3992,6 +4619,7 @@ vec_any_ge(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector signed long long __a, vector bool long long __b) { int __cc; @@ -3999,6 +4627,7 @@ vec_any_ge(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4013,6 +4642,7 @@ vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4020,6 +4650,7 @@ vec_any_ge(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4027,6 +4658,7 @@ vec_any_ge(vector bool long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_ge(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4035,6 +4667,15 @@ vec_any_ge(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_ge(vector double __a, vector double __b) { int __cc; @@ -4051,6 +4692,7 @@ vec_any_gt(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed char __a, vector bool char __b) { int __cc; @@ -4058,6 +4700,7 @@ vec_any_gt(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector signed char __b) { int __cc; @@ -4072,6 +4715,7 @@ vec_any_gt(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4079,6 +4723,7 @@ vec_any_gt(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4086,6 +4731,7 @@ vec_any_gt(vector bool char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_gt(vector bool char __a, vector bool char __b) { int __cc; @@ -4101,6 +4747,7 @@ vec_any_gt(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed short __a, vector bool short __b) { int __cc; @@ -4108,6 +4755,7 @@ vec_any_gt(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector signed short __b) { int __cc; @@ -4122,6 +4770,7 @@ vec_any_gt(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4129,6 +4778,7 @@ vec_any_gt(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4136,6 +4786,7 @@ vec_any_gt(vector bool short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool short __a, vector bool short __b) { int __cc; @@ -4151,6 +4802,7 @@ vec_any_gt(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed int __a, vector bool int __b) { int __cc; @@ -4158,6 +4810,7 @@ vec_any_gt(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector signed int __b) { int __cc; @@ -4172,6 +4825,7 @@ vec_any_gt(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4179,6 +4833,7 @@ vec_any_gt(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4186,6 +4841,7 @@ vec_any_gt(vector bool int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool int __a, vector bool int __b) { int __cc; @@ -4201,6 +4857,7 @@ vec_any_gt(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4208,6 +4865,7 @@ vec_any_gt(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4222,6 +4880,7 @@ vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4229,6 +4888,7 @@ vec_any_gt(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4236,6 +4896,7 @@ vec_any_gt(vector bool long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_gt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4244,6 +4905,15 @@ vec_any_gt(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_gt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_gt(vector double __a, vector double __b) { int __cc; @@ -4260,6 +4930,7 @@ vec_any_le(vector signed char __a, vector signed char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed char __a, vector bool char __b) { int __cc; @@ -4267,6 +4938,7 @@ vec_any_le(vector signed char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector signed char __b) { int __cc; @@ -4281,6 +4953,7 @@ vec_any_le(vector unsigned char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4288,6 +4961,7 @@ vec_any_le(vector unsigned char __a, vector bool char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4295,6 +4969,7 @@ vec_any_le(vector bool char __a, vector unsigned char __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool char __a, vector bool char __b) { int __cc; @@ -4310,6 +4985,7 @@ vec_any_le(vector signed short __a, vector signed short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed short __a, vector bool short __b) { int __cc; @@ -4317,6 +4993,7 @@ vec_any_le(vector signed short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector signed short __b) { int __cc; @@ -4331,6 +5008,7 @@ vec_any_le(vector unsigned short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4338,6 +5016,7 @@ vec_any_le(vector unsigned short __a, vector bool short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4345,6 +5024,7 @@ vec_any_le(vector bool short __a, vector unsigned short __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool short __a, vector bool short __b) { int __cc; @@ -4360,6 +5040,7 @@ vec_any_le(vector signed int __a, vector signed int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed int __a, vector bool int __b) { int __cc; @@ -4367,6 +5048,7 @@ vec_any_le(vector signed int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector signed int __b) { int __cc; @@ -4381,6 +5063,7 @@ vec_any_le(vector unsigned int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4388,6 +5071,7 @@ vec_any_le(vector unsigned int __a, vector bool int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4395,6 +5079,7 @@ vec_any_le(vector bool int __a, vector unsigned int __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool int __a, vector bool int __b) { int __cc; @@ -4410,6 +5095,7 @@ vec_any_le(vector signed long long __a, vector signed long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4417,6 +5103,7 @@ vec_any_le(vector signed long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4431,6 +5118,7 @@ vec_any_le(vector unsigned long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4438,6 +5126,7 @@ vec_any_le(vector unsigned long long __a, vector bool long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4445,6 +5134,7 @@ vec_any_le(vector bool long long __a, vector unsigned long long __b) { return __cc != 0; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_le(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4453,6 +5143,15 @@ vec_any_le(vector bool long long __a, vector bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_le(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_le(vector double __a, vector double __b) { int __cc; @@ -4469,6 +5168,7 @@ vec_any_lt(vector signed char __a, vector signed char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed char __a, vector bool char __b) { int __cc; @@ -4476,6 +5176,7 @@ vec_any_lt(vector signed char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector signed char __b) { int __cc; @@ -4490,6 +5191,7 @@ vec_any_lt(vector unsigned char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned char __a, vector bool char __b) { int __cc; @@ -4497,6 +5199,7 @@ vec_any_lt(vector unsigned char __a, vector bool char __b) { return __cc <= 1; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector unsigned char __b) { int __cc; @@ -4504,6 +5207,7 @@ vec_any_lt(vector bool char __a, vector unsigned char __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool char __a, vector bool char __b) { int __cc; @@ -4519,6 +5223,7 @@ vec_any_lt(vector signed short __a, vector signed short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed short __a, vector bool short __b) { int __cc; @@ -4526,6 +5231,7 @@ vec_any_lt(vector signed short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector signed short __b) { int __cc; @@ -4540,6 +5246,7 @@ vec_any_lt(vector unsigned short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned short __a, vector bool short __b) { int __cc; @@ -4547,6 +5254,7 @@ vec_any_lt(vector unsigned short __a, vector bool short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector unsigned short __b) { int __cc; @@ -4554,6 +5262,7 @@ vec_any_lt(vector bool short __a, vector unsigned short __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool short __a, vector bool short __b) { int __cc; @@ -4569,6 +5278,7 @@ vec_any_lt(vector signed int __a, vector signed int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed int __a, vector bool int __b) { int __cc; @@ -4576,6 +5286,7 @@ vec_any_lt(vector signed int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector signed int __b) { int __cc; @@ -4590,6 +5301,7 @@ vec_any_lt(vector unsigned int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector unsigned int __a, vector bool int __b) { int __cc; @@ -4597,6 +5309,7 @@ vec_any_lt(vector unsigned int __a, vector bool int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector unsigned int __b) { int __cc; @@ -4604,6 +5317,7 @@ vec_any_lt(vector bool int __a, vector unsigned int __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool int __a, vector bool int __b) { int __cc; @@ -4619,6 +5333,7 @@ vec_any_lt(vector signed long long __a, vector signed long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector signed long long __a, vector bool long long __b) { int __cc; @@ -4626,6 +5341,7 @@ vec_any_lt(vector signed long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector signed long long __b) { int __cc; @@ -4640,6 +5356,7 @@ vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai int vec_any_lt(vector unsigned long long __a, vector bool long long __b) { int __cc; @@ -4647,6 +5364,7 @@ vec_any_lt(vector unsigned long long __a, vector bool long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector unsigned long long __b) { int __cc; @@ -4654,6 +5372,7 @@ vec_any_lt(vector bool long long __a, vector unsigned long long __b) { return __cc <= 1; } +// This prototype is deprecated. static inline __ATTRS_o_ai int vec_any_lt(vector bool long long __a, vector bool long long __b) { int __cc; @@ -4662,6 +5381,15 @@ vec_any_lt(vector bool long long __a, vector bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_lt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc <= 1; +} +#endif + static inline __ATTRS_o_ai int vec_any_lt(vector double __a, vector double __b) { int __cc; @@ -4671,7 +5399,16 @@ vec_any_lt(vector double __a, vector double __b) { /*-- vec_any_nge ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nge(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nge(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__a, __b, &__cc); @@ -4680,7 +5417,16 @@ vec_any_nge(vector double __a, vector double __b) { /*-- vec_any_ngt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_ngt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__a, __b, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_ngt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__a, __b, &__cc); @@ -4689,7 +5435,16 @@ vec_any_ngt(vector double __a, vector double __b) { /*-- vec_any_nle ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nle(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchesbs(__b, __a, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nle(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchedbs(__b, __a, &__cc); @@ -4698,7 +5453,16 @@ vec_any_nle(vector double __a, vector double __b) { /*-- vec_any_nlt ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nlt(vector float __a, vector float __b) { + int __cc; + __builtin_s390_vfchsbs(__b, __a, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nlt(vector double __a, vector double __b) { int __cc; __builtin_s390_vfchdbs(__b, __a, &__cc); @@ -4707,7 +5471,16 @@ vec_any_nlt(vector double __a, vector double __b) { /*-- vec_any_nan ------------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_nan(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc != 3; +} +#endif + +static inline __ATTRS_o_ai int vec_any_nan(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -4716,7 
+5489,16 @@ vec_any_nan(vector double __a) { /*-- vec_any_numeric --------------------------------------------------------*/ -static inline __ATTRS_ai int +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai int +vec_any_numeric(vector float __a) { + int __cc; + __builtin_s390_vftcisb(__a, 15, &__cc); + return __cc != 0; +} +#endif + +static inline __ATTRS_o_ai int vec_any_numeric(vector double __a) { int __cc; __builtin_s390_vftcidb(__a, 15, &__cc); @@ -4735,11 +5517,13 @@ vec_andc(vector signed char __a, vector signed char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_andc(vector bool char __a, vector signed char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_andc(vector signed char __a, vector bool char __b) { return __a & ~__b; @@ -4750,11 +5534,13 @@ vec_andc(vector unsigned char __a, vector unsigned char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_andc(vector bool char __a, vector unsigned char __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_andc(vector unsigned char __a, vector bool char __b) { return __a & ~__b; @@ -4770,11 +5556,13 @@ vec_andc(vector signed short __a, vector signed short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_andc(vector bool short __a, vector signed short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_andc(vector signed short __a, vector bool short __b) { return __a & ~__b; @@ -4785,11 +5573,13 @@ vec_andc(vector unsigned short __a, vector unsigned short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_andc(vector bool short __a, vector unsigned short __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_andc(vector unsigned short __a, vector bool short __b) { return __a & ~__b; @@ -4805,11 +5595,13 @@ vec_andc(vector signed int __a, vector signed int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_andc(vector bool int __a, vector signed int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_andc(vector signed int __a, vector bool int __b) { return __a & ~__b; @@ -4820,11 +5612,13 @@ vec_andc(vector unsigned int __a, vector unsigned int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_andc(vector bool int __a, vector unsigned int __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_andc(vector unsigned int __a, vector bool int __b) { return __a & ~__b; @@ -4840,11 +5634,13 @@ vec_andc(vector signed long long __a, vector signed long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_andc(vector bool long long __a, vector signed long long __b) { return __a & ~__b; } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed long long vec_andc(vector signed long long __a, vector bool long long __b) { return __a & ~__b; @@ -4855,28 +5651,40 @@ vec_andc(vector unsigned long long __a, vector unsigned long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_andc(vector bool long long __a, vector unsigned long long __b) { return __a & ~__b; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_andc(vector unsigned long long __a, vector bool long long __b) { return __a & ~__b; } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_andc(vector float __a, vector float __b) { + return (vector float)((vector unsigned int)__a & + ~(vector unsigned int)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_andc(vector double __a, vector double __b) { return (vector double)((vector unsigned long long)__a & ~(vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_andc(vector bool long long __a, vector double __b) { return (vector double)((vector unsigned long long)__a & ~(vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_andc(vector double __a, vector bool long long __b) { return (vector double)((vector unsigned long long)__a & @@ -4895,11 +5703,13 @@ vec_nor(vector signed char __a, vector signed char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_nor(vector bool char __a, vector signed char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_nor(vector signed char __a, vector bool char __b) { return ~(__a | __b); @@ -4910,11 +5720,13 @@ vec_nor(vector unsigned char __a, vector unsigned char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_nor(vector bool char __a, vector unsigned char __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_nor(vector unsigned char __a, vector bool char __b) { return ~(__a | __b); @@ -4930,11 +5742,13 @@ vec_nor(vector signed short __a, vector signed short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_nor(vector bool short __a, vector signed short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_nor(vector signed short __a, vector bool short __b) { return ~(__a | __b); @@ -4945,11 +5759,13 @@ vec_nor(vector unsigned short __a, vector unsigned short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_nor(vector bool short __a, vector unsigned short __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_nor(vector unsigned short __a, vector bool short __b) { return ~(__a | __b); @@ -4965,11 +5781,13 @@ vec_nor(vector signed int __a, vector signed int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_nor(vector bool int __a, vector signed int __b) { return ~(__a | __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed int vec_nor(vector signed int __a, vector bool int __b) { return ~(__a | __b); @@ -4980,11 +5798,13 @@ vec_nor(vector unsigned int __a, vector unsigned int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_nor(vector bool int __a, vector unsigned int __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_nor(vector unsigned int __a, vector bool int __b) { return ~(__a | __b); @@ -5000,11 +5820,13 @@ vec_nor(vector signed long long __a, vector signed long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_nor(vector bool long long __a, vector signed long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_nor(vector signed long long __a, vector bool long long __b) { return ~(__a | __b); @@ -5015,34 +5837,274 @@ vec_nor(vector unsigned long long __a, vector unsigned long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_nor(vector bool long long __a, vector unsigned long long __b) { return ~(__a | __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_nor(vector unsigned long long __a, vector bool long long __b) { return ~(__a | __b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nor(vector float __a, vector float __b) { + return (vector float)~((vector unsigned int)__a | + (vector unsigned int)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_nor(vector double __a, vector double __b) { return (vector double)~((vector unsigned long long)__a | (vector unsigned long long)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_nor(vector bool long long __a, vector double __b) { return (vector double)~((vector unsigned long long)__a | (vector unsigned long long)__b); } +// This prototype is deprecated. 
 static inline __ATTRS_o_ai vector double
 vec_nor(vector double __a, vector bool long long __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+/*-- vec_orc ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_orc(vector bool char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_orc(vector signed char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_orc(vector bool short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_orc(vector signed short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_orc(vector bool int __a, vector bool int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_orc(vector signed int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector float
+vec_orc(vector float __a, vector float __b) {
+  return (vector float)((vector unsigned int)__a |
+                        ~(vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_orc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a |
+                         ~(vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_nand ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_nand(vector bool char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nand(vector signed char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nand(vector bool short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nand(vector signed short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nand(vector bool int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nand(vector signed int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a &
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a &
+                          (vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_eqv ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_eqv(vector bool char __a, vector bool char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_eqv(vector signed char __a, vector signed char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_eqv(vector bool short __a, vector bool short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_eqv(vector signed short __a, vector signed short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_eqv(vector bool int __a, vector bool int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_eqv(vector signed int __a, vector signed int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_eqv(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a ^
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_eqv(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a ^
+                          (vector unsigned long long)__b);
+}
+#endif
+
 /*-- vec_cntlz --------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector unsigned char
@@ -5323,30 +6385,35 @@ vec_sll(vector signed char __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sll(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
static inline __ATTRS_o_ai vector signed char vec_sll(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sll(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsl( @@ -5358,11 +6425,13 @@ vec_sll(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsl(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sll(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsl(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sll(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsl(__a, (vector unsigned char)__b); @@ -5374,30 +6443,35 @@ vec_sll(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sll(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sll(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sll(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsl( @@ -5410,12 +6484,14 @@ vec_sll(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sll(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sll(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsl( @@ -5428,30 +6504,35 @@ vec_sll(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed int vec_sll(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sll(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sll(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsl( @@ -5464,12 +6545,14 @@ vec_sll(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sll(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sll(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsl( @@ -5482,30 +6565,35 @@ vec_sll(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sll(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sll(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sll(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsl( @@ -5518,12 +6606,14 @@ vec_sll(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sll(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector unsigned long long vec_sll(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsl( @@ -5626,6 +6716,20 @@ vec_slb(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_slb(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vslb( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_slb(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vslb( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_slb(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vslb( @@ -5644,6 +6748,10 @@ extern __ATTRS_o vector signed char vec_sld(vector signed char __a, vector signed char __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool char +vec_sld(vector bool char __a, vector bool char __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned char vec_sld(vector unsigned char __a, vector unsigned char __b, int __c) __constant_range(__c, 0, 15); @@ -5652,6 +6760,10 @@ extern __ATTRS_o vector signed short vec_sld(vector signed short __a, vector signed short __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool short +vec_sld(vector bool short __a, vector bool short __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned short vec_sld(vector unsigned short __a, vector unsigned short __b, int __c) __constant_range(__c, 0, 15); @@ -5660,6 +6772,10 @@ extern __ATTRS_o vector signed int vec_sld(vector signed int __a, vector signed int __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool int +vec_sld(vector bool int __a, vector bool int __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned int vec_sld(vector unsigned int __a, vector unsigned int __b, int __c) __constant_range(__c, 0, 15); @@ -5668,10 +6784,20 @@ extern __ATTRS_o vector signed long long vec_sld(vector signed long long __a, vector signed long long __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o vector bool long long +vec_sld(vector bool long long __a, vector bool long long __b, int __c) + __constant_range(__c, 0, 15); + extern __ATTRS_o vector unsigned long long vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 15); +#if __ARCH__ >= 12 +extern __ATTRS_o vector float +vec_sld(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 15); +#endif + extern __ATTRS_o vector double vec_sld(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 15); @@ -5714,6 +6840,7 @@ extern __ATTRS_o vector unsigned long long vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c) __constant_range(__c, 0, 3); +// This prototype is deprecated. extern __ATTRS_o vector double vec_sldw(vector double __a, vector double __b, int __c) __constant_range(__c, 0, 3); @@ -5730,30 +6857,35 @@ vec_sral(vector signed char __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed char vec_sral(vector signed char __a, vector unsigned short __b) { return (vector signed char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_sral(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_sral(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsra( @@ -5765,11 +6897,13 @@ vec_sral(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsra(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sral(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsra(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_sral(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsra(__a, (vector unsigned char)__b); @@ -5781,30 +6915,35 @@ vec_sral(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sral(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_sral(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_sral(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsra( @@ -5817,12 +6956,14 @@ vec_sral(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_sral(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector unsigned short vec_sral(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsra( @@ -5835,30 +6976,35 @@ vec_sral(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sral(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_sral(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_sral(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsra( @@ -5871,12 +7017,14 @@ vec_sral(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sral(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_sral(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsra( @@ -5889,30 +7037,35 @@ vec_sral(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sral(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_sral(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsra( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_sral(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsra( @@ -5925,12 +7078,14 @@ vec_sral(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector unsigned long long vec_sral(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsra( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_sral(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsra( @@ -6033,6 +7188,20 @@ vec_srab(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_srab(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vsrab( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_srab(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vsrab( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_srab(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vsrab( @@ -6053,30 +7222,35 @@ vec_srl(vector signed char __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_srl(vector signed char __a, vector unsigned short __b) { return (vector signed char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_srl(vector signed char __a, vector unsigned int __b) { return (vector signed char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned char __b) { return (vector bool char)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned short __b) { return (vector bool char)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool char vec_srl(vector bool char __a, vector unsigned int __b) { return (vector bool char)__builtin_s390_vsrl( @@ -6088,11 +7262,13 @@ vec_srl(vector unsigned char __a, vector unsigned char __b) { return __builtin_s390_vsrl(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_srl(vector unsigned char __a, vector unsigned short __b) { return __builtin_s390_vsrl(__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_srl(vector unsigned char __a, vector unsigned int __b) { return __builtin_s390_vsrl(__a, (vector unsigned char)__b); @@ -6104,30 +7280,35 @@ vec_srl(vector signed short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_srl(vector signed short __a, vector unsigned short __b) { return (vector signed short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed short vec_srl(vector signed short __a, vector unsigned int __b) { return (vector signed short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned char __b) { return (vector bool short)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned short __b) { return (vector bool short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool short vec_srl(vector bool short __a, vector unsigned int __b) { return (vector bool short)__builtin_s390_vsrl( @@ -6140,12 +7321,14 @@ vec_srl(vector unsigned short __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_srl(vector unsigned short __a, vector unsigned short __b) { return (vector unsigned short)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_srl(vector unsigned short __a, vector unsigned int __b) { return (vector unsigned short)__builtin_s390_vsrl( @@ -6158,30 +7341,35 @@ vec_srl(vector signed int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_srl(vector signed int __a, vector unsigned short __b) { return (vector signed int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_srl(vector signed int __a, vector unsigned int __b) { return (vector signed int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned char __b) { return (vector bool int)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned short __b) { return (vector bool int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool int vec_srl(vector bool int __a, vector unsigned int __b) { return (vector bool int)__builtin_s390_vsrl( @@ -6194,12 +7382,14 @@ vec_srl(vector unsigned int __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_srl(vector unsigned int __a, vector unsigned short __b) { return (vector unsigned int)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_srl(vector unsigned int __a, vector unsigned int __b) { return (vector unsigned int)__builtin_s390_vsrl( @@ -6212,30 +7402,35 @@ vec_srl(vector signed long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed long long vec_srl(vector signed long long __a, vector unsigned short __b) { return (vector signed long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_srl(vector signed long long __a, vector unsigned int __b) { return (vector signed long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned char __b) { return (vector bool long long)__builtin_s390_vsrl( (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned short __b) { return (vector bool long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector bool long long vec_srl(vector bool long long __a, vector unsigned int __b) { return (vector bool long long)__builtin_s390_vsrl( @@ -6248,12 +7443,14 @@ vec_srl(vector unsigned long long __a, vector unsigned char __b) { (vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_srl(vector unsigned long long __a, vector unsigned short __b) { return (vector unsigned long long)__builtin_s390_vsrl( (vector unsigned char)__a, (vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_srl(vector unsigned long long __a, vector unsigned int __b) { return (vector unsigned long long)__builtin_s390_vsrl( @@ -6356,6 +7553,20 @@ vec_srb(vector unsigned long long __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_srb(vector float __a, vector signed int __b) { + return (vector float)__builtin_s390_vsrlb( + (vector unsigned char)__a, (vector unsigned char)__b); +} + +static inline __ATTRS_o_ai vector float +vec_srb(vector float __a, vector unsigned int __b) { + return (vector float)__builtin_s390_vsrlb( + (vector unsigned char)__a, (vector unsigned char)__b); +} +#endif + static inline __ATTRS_o_ai vector double vec_srb(vector double __a, vector signed long long __b) { return (vector double)__builtin_s390_vsrlb( @@ -6390,6 +7601,13 @@ vec_abs(vector signed long long __a) { return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_abs(vector float __a) { + return __builtin_s390_vflpsb(__a); +} +#endif + static inline __ATTRS_o_ai vector double vec_abs(vector double __a) { return __builtin_s390_vflpdb(__a); @@ -6397,7 +7615,14 @@ vec_abs(vector double __a) { /*-- vec_nabs ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_nabs(vector float __a) { + return __builtin_s390_vflnsb(__a); +} +#endif + +static inline __ATTRS_o_ai vector double vec_nabs(vector double __a) { return __builtin_s390_vflndb(__a); } @@ -6409,12 +7634,14 @@ vec_max(vector signed char __a, vector signed char __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed char vec_max(vector signed char __a, vector bool char __b) { vector signed char __bc = (vector signed char)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_max(vector bool char __a, vector signed char __b) { vector signed char __ac = (vector signed char)__a; @@ -6426,12 +7653,14 @@ vec_max(vector unsigned char __a, vector unsigned char __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_max(vector unsigned char __a, vector bool char __b) { vector unsigned char __bc = (vector unsigned char)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_max(vector bool char __a, vector unsigned char __b) { vector unsigned char __ac = (vector unsigned char)__a; @@ -6443,12 +7672,14 @@ vec_max(vector signed short __a, vector signed short __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_max(vector signed short __a, vector bool short __b) { vector signed short __bc = (vector signed short)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_max(vector bool short __a, vector signed short __b) { vector signed short __ac = (vector signed short)__a; @@ -6460,12 +7691,14 @@ vec_max(vector unsigned short __a, vector unsigned short __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_max(vector unsigned short __a, vector bool short __b) { vector unsigned short __bc = (vector unsigned short)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_max(vector bool short __a, vector unsigned short __b) { vector unsigned short __ac = (vector unsigned short)__a; @@ -6477,12 +7710,14 @@ vec_max(vector signed int __a, vector signed int __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_max(vector signed int __a, vector bool int __b) { vector signed int __bc = (vector signed int)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_max(vector bool int __a, vector signed int __b) { vector signed int __ac = (vector signed int)__a; @@ -6494,12 +7729,14 @@ vec_max(vector unsigned int __a, vector unsigned int __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_max(vector unsigned int __a, vector bool int __b) { vector unsigned int __bc = (vector unsigned int)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_max(vector bool int __a, vector unsigned int __b) { vector unsigned int __ac = (vector unsigned int)__a; @@ -6511,12 +7748,14 @@ vec_max(vector signed long long __a, vector signed long long __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector signed long long vec_max(vector signed long long __a, vector bool long long __b) { vector signed long long __bc = (vector signed long long)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_max(vector bool long long __a, vector signed long long __b) { vector signed long long __ac = (vector signed long long)__a; @@ -6528,21 +7767,34 @@ vec_max(vector unsigned long long __a, vector unsigned long long __b) { return vec_sel(__b, __a, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_max(vector unsigned long long __a, vector bool long long __b) { vector unsigned long long __bc = (vector unsigned long long)__b; return vec_sel(__bc, __a, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_max(vector bool long long __a, vector unsigned long long __b) { vector unsigned long long __ac = (vector unsigned long long)__a; return vec_sel(__b, __ac, vec_cmpgt(__ac, __b)); } +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_max(vector float __a, vector float __b) { + return __builtin_s390_vfmaxsb(__a, __b, 0); +} +#endif + static inline __ATTRS_o_ai vector double vec_max(vector double __a, vector double __b) { +#if __ARCH__ >= 12 + return __builtin_s390_vfmaxdb(__a, __b, 0); +#else return vec_sel(__b, __a, vec_cmpgt(__a, __b)); +#endif } /*-- vec_min ----------------------------------------------------------------*/ @@ -6552,12 +7804,14 @@ vec_min(vector signed char __a, vector signed char __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_min(vector signed char __a, vector bool char __b) { vector signed char __bc = (vector signed char)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed char vec_min(vector bool char __a, vector signed char __b) { vector signed char __ac = (vector signed char)__a; @@ -6569,12 +7823,14 @@ vec_min(vector unsigned char __a, vector unsigned char __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_min(vector unsigned char __a, vector bool char __b) { vector unsigned char __bc = (vector unsigned char)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned char vec_min(vector bool char __a, vector unsigned char __b) { vector unsigned char __ac = (vector unsigned char)__a; @@ -6586,12 +7842,14 @@ vec_min(vector signed short __a, vector signed short __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_min(vector signed short __a, vector bool short __b) { vector signed short __bc = (vector signed short)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed short vec_min(vector bool short __a, vector signed short __b) { vector signed short __ac = (vector signed short)__a; @@ -6603,12 +7861,14 @@ vec_min(vector unsigned short __a, vector unsigned short __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector unsigned short vec_min(vector unsigned short __a, vector bool short __b) { vector unsigned short __bc = (vector unsigned short)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned short vec_min(vector bool short __a, vector unsigned short __b) { vector unsigned short __ac = (vector unsigned short)__a; @@ -6620,12 +7880,14 @@ vec_min(vector signed int __a, vector signed int __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_min(vector signed int __a, vector bool int __b) { vector signed int __bc = (vector signed int)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed int vec_min(vector bool int __a, vector signed int __b) { vector signed int __ac = (vector signed int)__a; @@ -6637,12 +7899,14 @@ vec_min(vector unsigned int __a, vector unsigned int __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_min(vector unsigned int __a, vector bool int __b) { vector unsigned int __bc = (vector unsigned int)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned int vec_min(vector bool int __a, vector unsigned int __b) { vector unsigned int __ac = (vector unsigned int)__a; @@ -6654,12 +7918,14 @@ vec_min(vector signed long long __a, vector signed long long __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_min(vector signed long long __a, vector bool long long __b) { vector signed long long __bc = (vector signed long long)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_min(vector bool long long __a, vector signed long long __b) { vector signed long long __ac = (vector signed long long)__a; @@ -6671,21 +7937,34 @@ vec_min(vector unsigned long long __a, vector unsigned long long __b) { return vec_sel(__a, __b, vec_cmpgt(__a, __b)); } +// This prototype is deprecated. static inline __ATTRS_o_ai vector unsigned long long vec_min(vector unsigned long long __a, vector bool long long __b) { vector unsigned long long __bc = (vector unsigned long long)__b; return vec_sel(__a, __bc, vec_cmpgt(__a, __bc)); } +// This prototype is deprecated. 
 static inline __ATTRS_o_ai vector unsigned long long
 vec_min(vector bool long long __a, vector unsigned long long __b) {
   vector unsigned long long __ac = (vector unsigned long long)__a;
   return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_min(vector float __a, vector float __b) {
+  return __builtin_s390_vfminsb(__a, __b, 0);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_min(vector double __a, vector double __b) {
+#if __ARCH__ >= 12
+  return __builtin_s390_vfmindb(__a, __b, 0);
+#else
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+#endif
 }
 
 /*-- vec_add_u128 -----------------------------------------------------------*/
@@ -7126,6 +8405,13 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_s390_vmlof(__a, __b);
 }
 
+/*-- vec_msum_u128 ----------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+#define vec_msum_u128(X, Y, Z, W) \
+  ((vector unsigned char)__builtin_s390_vmslg((X), (Y), (Z), (W)))
+#endif
+
 /*-- vec_sub_u128 -----------------------------------------------------------*/
 
 static inline __ATTRS_ai vector unsigned char
@@ -7263,6 +8549,14 @@ vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
                             (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_test_mask(vector float __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_test_mask(vector double __a, vector unsigned long long __b) {
   return __builtin_s390_vtm((vector unsigned char)__a,
@@ -7271,27 +8565,77 @@ vec_test_mask(vector double __a, vector unsigned long long __b) {
 
 /*-- vec_madd ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_madd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmasb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_madd(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmadb(__a, __b, __c);
 }
 
 /*-- vec_msub ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmssb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_msub(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmsdb(__a, __b, __c);
 }
 
+/*-- vec_nmadd --------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmasb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmadb(__a, __b, __c);
+}
+#endif
+
+/*-- vec_nmsub --------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmsub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmssb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmsdb(__a, __b, __c);
+}
+#endif
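A minimal usage sketch for the z14 additions above, assuming clang targets s390x with -march=z14 -mzvector (so that __ARCH__ >= 12 and the vector keyword are available); the helper names fma_f32, clamp_f64 and msum are illustrative only, not part of the header:

#include <vecintrin.h>

/* Single-rounding fused multiply-add on float lanes (VFMASB on z14). */
static inline vector float fma_f32(vector float a, vector float b,
                                   vector float c) {
  return vec_madd(a, b, c);
}

/* Elementwise clamp; per the hunks above, vec_min/vec_max on doubles
   lower to VFMINDB/VFMAXDB on z14 and to a compare-and-select
   sequence on older architectures. */
static inline vector double clamp_f64(vector double x, vector double lo,
                                      vector double hi) {
  return vec_min(vec_max(x, lo), hi);
}

/* 128-bit multiply-and-sum (VMSL); the last operand must be a
   compile-time constant in the range 0..15. */
static inline vector unsigned char
msum(vector unsigned long long a, vector unsigned long long b,
     vector unsigned char c) {
  return vec_msum_u128(a, b, c, 0);
}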
+ /*-- vec_sqrt ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_sqrt(vector float __a) { + return __builtin_s390_vfsqsb(__a); +} +#endif + +static inline __ATTRS_o_ai vector double vec_sqrt(vector double __a) { return __builtin_s390_vfsqdb(__a); } /*-- vec_ld2f ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai vector double vec_ld2f(const float *__ptr) { typedef float __v2f32 __attribute__((__vector_size__(8))); @@ -7300,6 +8644,7 @@ vec_ld2f(const float *__ptr) { /*-- vec_st2f ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai void vec_st2f(vector double __a, float *__ptr) { typedef float __v2f32 __attribute__((__vector_size__(8))); @@ -7308,6 +8653,7 @@ vec_st2f(vector double __a, float *__ptr) { /*-- vec_ctd ----------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_ctd(vector signed long long __a, int __b) __constant_range(__b, 0, 31) { @@ -7316,6 +8662,7 @@ vec_ctd(vector signed long long __a, int __b) return __conv; } +// This prototype is deprecated. static inline __ATTRS_o_ai vector double vec_ctd(vector unsigned long long __a, int __b) __constant_range(__b, 0, 31) { @@ -7326,6 +8673,7 @@ vec_ctd(vector unsigned long long __a, int __b) /*-- vec_ctsl ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai vector signed long long vec_ctsl(vector double __a, int __b) __constant_range(__b, 0, 31) { @@ -7335,6 +8683,7 @@ vec_ctsl(vector double __a, int __b) /*-- vec_ctul ---------------------------------------------------------------*/ +// This prototype is deprecated. 
static inline __ATTRS_o_ai vector unsigned long long vec_ctul(vector double __a, int __b) __constant_range(__b, 0, 31) { @@ -7342,16 +8691,79 @@ vec_ctul(vector double __a, int __b) return __builtin_convertvector(__a, vector unsigned long long); } +/*-- vec_doublee ------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector double +vec_doublee(vector float __a) { + typedef float __v2f32 __attribute__((__vector_size__(8))); + __v2f32 __pack = __builtin_shufflevector(__a, __a, 0, 2); + return __builtin_convertvector(__pack, vector double); +} +#endif + +/*-- vec_floate -------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai vector float +vec_floate(vector double __a) { + typedef float __v2f32 __attribute__((__vector_size__(8))); + __v2f32 __pack = __builtin_convertvector(__a, __v2f32); + return __builtin_shufflevector(__pack, __pack, 0, -1, 1, -1); +} +#endif + +/*-- vec_double -------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector double +vec_double(vector signed long long __a) { + return __builtin_convertvector(__a, vector double); +} + +static inline __ATTRS_o_ai vector double +vec_double(vector unsigned long long __a) { + return __builtin_convertvector(__a, vector double); +} + +/*-- vec_signed -------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed long long +vec_signed(vector double __a) { + return __builtin_convertvector(__a, vector signed long long); +} + +/*-- vec_unsigned -----------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector unsigned long long +vec_unsigned(vector double __a) { + return __builtin_convertvector(__a, vector unsigned long long); +} + /*-- vec_roundp -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundp(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 6); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundp(vector double __a) { return __builtin_s390_vfidb(__a, 4, 6); } /*-- vec_ceil ---------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_ceil(vector float __a) { + // On this platform, vec_ceil never triggers the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 4, 6); +} +#endif + +static inline __ATTRS_o_ai vector double vec_ceil(vector double __a) { // On this platform, vec_ceil never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 6); @@ -7359,14 +8771,29 @@ vec_ceil(vector double __a) { /*-- vec_roundm -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundm(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 7); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundm(vector double __a) { return __builtin_s390_vfidb(__a, 4, 7); } /*-- vec_floor --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_floor(vector float __a) { + // On this platform, vec_floor never triggers the IEEE-inexact exception. 
+ return __builtin_s390_vfisb(__a, 4, 7); +} +#endif + +static inline __ATTRS_o_ai vector double vec_floor(vector double __a) { // On this platform, vec_floor never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 7); @@ -7374,14 +8801,29 @@ vec_floor(vector double __a) { /*-- vec_roundz -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundz(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 5); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundz(vector double __a) { return __builtin_s390_vfidb(__a, 4, 5); } /*-- vec_trunc --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_trunc(vector float __a) { + // On this platform, vec_trunc never triggers the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 4, 5); +} +#endif + +static inline __ATTRS_o_ai vector double vec_trunc(vector double __a) { // On this platform, vec_trunc never triggers the IEEE-inexact exception. return __builtin_s390_vfidb(__a, 4, 5); @@ -7389,22 +8831,104 @@ vec_trunc(vector double __a) { /*-- vec_roundc -------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_roundc(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 0); +} +#endif + +static inline __ATTRS_o_ai vector double vec_roundc(vector double __a) { return __builtin_s390_vfidb(__a, 4, 0); } +/*-- vec_rint ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_rint(vector float __a) { + // vec_rint may trigger the IEEE-inexact exception. + return __builtin_s390_vfisb(__a, 0, 0); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_rint(vector double __a) { + // vec_rint may trigger the IEEE-inexact exception. 
+ return __builtin_s390_vfidb(__a, 0, 0); +} + /*-- vec_round --------------------------------------------------------------*/ -static inline __ATTRS_ai vector double +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_round(vector float __a) { + return __builtin_s390_vfisb(__a, 4, 4); +} +#endif + +static inline __ATTRS_o_ai vector double vec_round(vector double __a) { return __builtin_s390_vfidb(__a, 4, 4); } /*-- vec_fp_test_data_class -------------------------------------------------*/ +#if __ARCH__ >= 12 +extern __ATTRS_o vector bool int +vec_fp_test_data_class(vector float __a, int __b, int *__c) + __constant_range(__b, 0, 4095); + +extern __ATTRS_o vector bool long long +vec_fp_test_data_class(vector double __a, int __b, int *__c) + __constant_range(__b, 0, 4095); + +#define vec_fp_test_data_class(X, Y, Z) \ + ((__typeof__((vec_fp_test_data_class)((X), (Y), (Z)))) \ + __extension__ ({ \ + vector unsigned char __res; \ + vector unsigned char __x = (vector unsigned char)(X); \ + int *__z = (Z); \ + switch (sizeof ((X)[0])) { \ + case 4: __res = (vector unsigned char) \ + __builtin_s390_vftcisb((vector float)__x, (Y), __z); \ + break; \ + default: __res = (vector unsigned char) \ + __builtin_s390_vftcidb((vector double)__x, (Y), __z); \ + break; \ + } __res; })) +#else #define vec_fp_test_data_class(X, Y, Z) \ ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z))) +#endif + +#define __VEC_CLASS_FP_ZERO_P (1 << 11) +#define __VEC_CLASS_FP_ZERO_N (1 << 10) +#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | __VEC_CLASS_FP_ZERO_N) +#define __VEC_CLASS_FP_NORMAL_P (1 << 9) +#define __VEC_CLASS_FP_NORMAL_N (1 << 8) +#define __VEC_CLASS_FP_NORMAL (__VEC_CLASS_FP_NORMAL_P | \ + __VEC_CLASS_FP_NORMAL_N) +#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 7) +#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 6) +#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \ + __VEC_CLASS_FP_SUBNORMAL_N) +#define __VEC_CLASS_FP_INFINITY_P (1 << 5) +#define __VEC_CLASS_FP_INFINITY_N (1 << 4) +#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \ + __VEC_CLASS_FP_INFINITY_N) +#define __VEC_CLASS_FP_QNAN_P (1 << 3) +#define __VEC_CLASS_FP_QNAN_N (1 << 2) +#define __VEC_CLASS_FP_QNAN (__VEC_CLASS_FP_QNAN_P | __VEC_CLASS_FP_QNAN_N) +#define __VEC_CLASS_FP_SNAN_P (1 << 1) +#define __VEC_CLASS_FP_SNAN_N (1 << 0) +#define __VEC_CLASS_FP_SNAN (__VEC_CLASS_FP_SNAN_P | __VEC_CLASS_FP_SNAN_N) +#define __VEC_CLASS_FP_NAN (__VEC_CLASS_FP_QNAN | __VEC_CLASS_FP_SNAN) +#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \ + __VEC_CLASS_FP_SUBNORMAL | \ + __VEC_CLASS_FP_ZERO | \ + __VEC_CLASS_FP_INFINITY) /*-- vec_cp_until_zero ------------------------------------------------------*/ diff --git a/lib/Index/IndexingAction.cpp b/lib/Index/IndexingAction.cpp index cac24d4b9c4c..84d31200bab4 100644 --- a/lib/Index/IndexingAction.cpp +++ b/lib/Index/IndexingAction.cpp @@ -177,6 +177,18 @@ void index::indexASTUnit(ASTUnit &Unit, DataConsumer->finish(); } +void index::indexTopLevelDecls(ASTContext &Ctx, ArrayRef Decls, + std::shared_ptr DataConsumer, + IndexingOptions Opts) { + IndexingContext IndexCtx(Opts, *DataConsumer); + IndexCtx.setASTContext(Ctx); + + DataConsumer->initialize(Ctx); + for (const Decl *D : Decls) + IndexCtx.indexTopLevelDecl(D); + DataConsumer->finish(); +} + void index::indexModuleFile(serialization::ModuleFile &Mod, ASTReader &Reader, std::shared_ptr DataConsumer, diff --git a/lib/Index/IndexingContext.cpp b/lib/Index/IndexingContext.cpp index c4aa51d62f02..addee691e804 100644 
--- a/lib/Index/IndexingContext.cpp +++ b/lib/Index/IndexingContext.cpp @@ -260,8 +260,10 @@ static const Decl *adjustParent(const Decl *Parent) { static const Decl *getCanonicalDecl(const Decl *D) { D = D->getCanonicalDecl(); if (auto TD = dyn_cast(D)) { - D = TD->getTemplatedDecl(); - assert(D->isCanonicalDecl()); + if (auto TTD = TD->getTemplatedDecl()) { + D = TTD; + assert(D->isCanonicalDecl()); + } } return D; diff --git a/lib/Lex/MacroArgs.cpp b/lib/Lex/MacroArgs.cpp index a201d1659073..f791d8d4bacc 100644 --- a/lib/Lex/MacroArgs.cpp +++ b/lib/Lex/MacroArgs.cpp @@ -52,14 +52,14 @@ MacroArgs *MacroArgs::create(const MacroInfo *MI, UnexpArgTokens.size() * sizeof(Token)); // Construct the MacroArgs object. new (Result) - MacroArgs(UnexpArgTokens.size(), VarargsElided, MI->getNumArgs()); + MacroArgs(UnexpArgTokens.size(), VarargsElided, MI->getNumParams()); } else { Result = *ResultEnt; // Unlink this node from the preprocessors singly linked list. *ResultEnt = Result->ArgCache; Result->NumUnexpArgTokens = UnexpArgTokens.size(); Result->VarargsElided = VarargsElided; - Result->NumMacroArgs = MI->getNumArgs(); + Result->NumMacroArgs = MI->getNumParams(); } // Copy the actual unexpanded tokens to immediately after the result ptr. @@ -148,11 +148,11 @@ bool MacroArgs::ArgNeedsPreexpansion(const Token *ArgTok, const std::vector & MacroArgs::getPreExpArgument(unsigned Arg, const MacroInfo *MI, Preprocessor &PP) { - assert(Arg < MI->getNumArgs() && "Invalid argument number!"); + assert(Arg < MI->getNumParams() && "Invalid argument number!"); // If we have already computed this, return it. - if (PreExpArgTokens.size() < MI->getNumArgs()) - PreExpArgTokens.resize(MI->getNumArgs()); + if (PreExpArgTokens.size() < MI->getNumParams()) + PreExpArgTokens.resize(MI->getNumParams()); std::vector &Result = PreExpArgTokens[Arg]; if (!Result.empty()) return Result; diff --git a/lib/Lex/MacroInfo.cpp b/lib/Lex/MacroInfo.cpp index 1e5deeb1919b..6dc7841bc160 100644 --- a/lib/Lex/MacroInfo.cpp +++ b/lib/Lex/MacroInfo.cpp @@ -17,8 +17,8 @@ using namespace clang; MacroInfo::MacroInfo(SourceLocation DefLoc) : Location(DefLoc), - ArgumentList(nullptr), - NumArguments(0), + ParameterList(nullptr), + NumParameters(0), IsDefinitionLengthCached(false), IsFunctionLike(false), IsC99Varargs(false), @@ -74,7 +74,7 @@ bool MacroInfo::isIdenticalTo(const MacroInfo &Other, Preprocessor &PP, // Check # tokens in replacement, number of args, and various flags all match. if (ReplacementTokens.size() != Other.ReplacementTokens.size() || - getNumArgs() != Other.getNumArgs() || + getNumParams() != Other.getNumParams() || isFunctionLike() != Other.isFunctionLike() || isC99Varargs() != Other.isC99Varargs() || isGNUVarargs() != Other.isGNUVarargs()) @@ -82,7 +82,8 @@ bool MacroInfo::isIdenticalTo(const MacroInfo &Other, Preprocessor &PP, if (Lexically) { // Check arguments. - for (arg_iterator I = arg_begin(), OI = Other.arg_begin(), E = arg_end(); + for (param_iterator I = param_begin(), OI = Other.param_begin(), + E = param_end(); I != E; ++I, ++OI) if (*I != *OI) return false; } @@ -109,10 +110,10 @@ bool MacroInfo::isIdenticalTo(const MacroInfo &Other, Preprocessor &PP, return false; // With syntactic equivalence the parameter names can be different as long // as they are used in the same place. 
- int AArgNum = getArgumentNum(A.getIdentifierInfo()); + int AArgNum = getParameterNum(A.getIdentifierInfo()); if (AArgNum == -1) return false; - if (AArgNum != Other.getArgumentNum(B.getIdentifierInfo())) + if (AArgNum != Other.getParameterNum(B.getIdentifierInfo())) return false; continue; } @@ -141,12 +142,12 @@ LLVM_DUMP_METHOD void MacroInfo::dump() const { Out << "\n #define "; if (IsFunctionLike) { Out << "("; - for (unsigned I = 0; I != NumArguments; ++I) { + for (unsigned I = 0; I != NumParameters; ++I) { if (I) Out << ", "; - Out << ArgumentList[I]->getName(); + Out << ParameterList[I]->getName(); } if (IsC99Varargs || IsGNUVarargs) { - if (NumArguments && IsC99Varargs) Out << ", "; + if (NumParameters && IsC99Varargs) Out << ", "; Out << "..."; } Out << ")"; diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp index 8c79e50176e1..b2450f516ba2 100644 --- a/lib/Lex/PPDirectives.cpp +++ b/lib/Lex/PPDirectives.cpp @@ -220,26 +220,18 @@ bool Preprocessor::CheckMacroName(Token &MacroNameTok, MacroUse isDefineUndef, return Diag(MacroNameTok, diag::err_pp_missing_macro_name); IdentifierInfo *II = MacroNameTok.getIdentifierInfo(); - if (!II) { - bool Invalid = false; - std::string Spelling = getSpelling(MacroNameTok, &Invalid); - if (Invalid) - return Diag(MacroNameTok, diag::err_pp_macro_not_identifier); - II = getIdentifierInfo(Spelling); - - if (!II->isCPlusPlusOperatorKeyword()) - return Diag(MacroNameTok, diag::err_pp_macro_not_identifier); + if (!II) + return Diag(MacroNameTok, diag::err_pp_macro_not_identifier); + if (II->isCPlusPlusOperatorKeyword()) { // C++ 2.5p2: Alternative tokens behave the same as its primary token // except for their spellings. Diag(MacroNameTok, getLangOpts().MicrosoftExt ? diag::ext_pp_operator_used_as_macro_name : diag::err_pp_operator_used_as_macro_name) << II << MacroNameTok.getKind(); - // Allow #defining |and| and friends for Microsoft compatibility or // recovery when legacy C headers are included in C++. - MacroNameTok.setIdentifierInfo(II); } if ((isDefineUndef != MU_Other) && II->getPPKeywordID() == tok::pp_defined) { @@ -2143,11 +2135,11 @@ void Preprocessor::HandleIncludeMacrosDirective(SourceLocation HashLoc, // Preprocessor Macro Directive Handling. //===----------------------------------------------------------------------===// -/// ReadMacroDefinitionArgList - The ( starting an argument list of a macro +/// ReadMacroParameterList - The ( starting an argument list of a macro /// definition has just been read. Lex the rest of the arguments and the /// closing ), updating MI with what we learn. Return true if an error occurs /// parsing the arg list. -bool Preprocessor::ReadMacroDefinitionArgList(MacroInfo *MI, Token &Tok) { +bool Preprocessor::ReadMacroParameterList(MacroInfo *MI, Token &Tok) { SmallVector Arguments; while (true) { @@ -2181,7 +2173,7 @@ bool Preprocessor::ReadMacroDefinitionArgList(MacroInfo *MI, Token &Tok) { // Add the __VA_ARGS__ identifier as an argument. 
Arguments.push_back(Ident__VA_ARGS__); MI->setIsC99Varargs(); - MI->setArgumentList(Arguments, BP); + MI->setParameterList(Arguments, BP); return false; case tok::eod: // #define X( Diag(Tok, diag::err_pp_missing_rparen_in_macro_def); @@ -2215,7 +2207,7 @@ bool Preprocessor::ReadMacroDefinitionArgList(MacroInfo *MI, Token &Tok) { Diag(Tok, diag::err_pp_expected_comma_in_arg_list); return true; case tok::r_paren: // #define X(A) - MI->setArgumentList(Arguments, BP); + MI->setParameterList(Arguments, BP); return false; case tok::comma: // #define X(A, break; @@ -2231,7 +2223,7 @@ bool Preprocessor::ReadMacroDefinitionArgList(MacroInfo *MI, Token &Tok) { } MI->setIsGNUVarargs(); - MI->setArgumentList(Arguments, BP); + MI->setParameterList(Arguments, BP); return false; } } @@ -2280,28 +2272,20 @@ static bool isConfigurationPattern(Token &MacroName, MacroInfo *MI, MI->getNumTokens() == 0; } -/// HandleDefineDirective - Implements \#define. This consumes the entire macro -/// line then lets the caller lex the next real token. -void Preprocessor::HandleDefineDirective(Token &DefineTok, - bool ImmediatelyAfterHeaderGuard) { - ++NumDefined; +// ReadOptionalMacroParameterListAndBody - This consumes all (i.e. the +// entire line) of the macro's tokens and adds them to MacroInfo, and while +// doing so performs certain validity checks including (but not limited to): +// - # (stringization) is followed by a macro parameter +// +// Returns a nullptr if an invalid sequence of tokens is encountered or returns +// a pointer to a MacroInfo object. - Token MacroNameTok; - bool MacroShadowsKeyword; - ReadMacroName(MacroNameTok, MU_Define, &MacroShadowsKeyword); - - // Error reading macro name? If so, diagnostic already issued. - if (MacroNameTok.is(tok::eod)) - return; +MacroInfo *Preprocessor::ReadOptionalMacroParameterListAndBody( + const Token &MacroNameTok, const bool ImmediatelyAfterHeaderGuard) { Token LastTok = MacroNameTok; - - // If we are supposed to keep comments in #defines, reenable comment saving - // mode. - if (CurLexer) CurLexer->SetCommentRetentionState(KeepMacroComments); - // Create the new macro. - MacroInfo *MI = AllocateMacroInfo(MacroNameTok.getLocation()); + MacroInfo *const MI = AllocateMacroInfo(MacroNameTok.getLocation()); Token Tok; LexUnexpandedToken(Tok); @@ -2323,11 +2307,11 @@ void Preprocessor::HandleDefineDirective(Token &DefineTok, } else if (Tok.is(tok::l_paren)) { // This is a function-like macro definition. Read the argument list. MI->setIsFunctionLike(); - if (ReadMacroDefinitionArgList(MI, LastTok)) { + if (ReadMacroParameterList(MI, LastTok)) { // Throw away the rest of the line. if (CurPPLexer->ParsingPreprocessorDirective) DiscardUntilEndOfDirective(); - return; + return nullptr; } // If this is a definition of a variadic C99 function-like macro, not using @@ -2434,7 +2418,7 @@ void Preprocessor::HandleDefineDirective(Token &DefineTok, // Check for a valid macro arg identifier. if (Tok.getIdentifierInfo() == nullptr || - MI->getArgumentNum(Tok.getIdentifierInfo()) == -1) { + MI->getParameterNum(Tok.getIdentifierInfo()) == -1) { // If this is assembler-with-cpp mode, we accept random gibberish after // the '#' because '#' is often a comment character. However, change @@ -2450,7 +2434,7 @@ void Preprocessor::HandleDefineDirective(Token &DefineTok, // Disable __VA_ARGS__ again. 
Ident__VA_ARGS__->setIsPoisoned(true); - return; + return nullptr; } } @@ -2463,15 +2447,39 @@ void Preprocessor::HandleDefineDirective(Token &DefineTok, LexUnexpandedToken(Tok); } } + MI->setDefinitionEndLoc(LastTok.getLocation()); + // Disable __VA_ARGS__ again. + Ident__VA_ARGS__->setIsPoisoned(true); + + return MI; +} +/// HandleDefineDirective - Implements \#define. This consumes the entire macro +/// line then lets the caller lex the next real token. +void Preprocessor::HandleDefineDirective( + Token &DefineTok, const bool ImmediatelyAfterHeaderGuard) { + ++NumDefined; + + Token MacroNameTok; + bool MacroShadowsKeyword; + ReadMacroName(MacroNameTok, MU_Define, &MacroShadowsKeyword); + + // Error reading macro name? If so, diagnostic already issued. + if (MacroNameTok.is(tok::eod)) + return; + + // If we are supposed to keep comments in #defines, reenable comment saving + // mode. + if (CurLexer) CurLexer->SetCommentRetentionState(KeepMacroComments); + + MacroInfo *const MI = ReadOptionalMacroParameterListAndBody( + MacroNameTok, ImmediatelyAfterHeaderGuard); + + if (!MI) return; if (MacroShadowsKeyword && !isConfigurationPattern(MacroNameTok, MI, getLangOpts())) { Diag(MacroNameTok, diag::warn_pp_macro_hides_keyword); - } - - // Disable __VA_ARGS__ again. - Ident__VA_ARGS__->setIsPoisoned(true); - + } // Check that there is no paste (##) operator at the beginning or end of the // replacement list. unsigned NumTokens = MI->getNumTokens(); @@ -2486,7 +2494,7 @@ void Preprocessor::HandleDefineDirective(Token &DefineTok, } } - MI->setDefinitionEndLoc(LastTok.getLocation()); + // Finally, if this identifier already had a macro defined for it, verify that // the macro bodies are identical, and issue diagnostics if they are not. diff --git a/lib/Lex/PPExpressions.cpp b/lib/Lex/PPExpressions.cpp index 12f5084298df..d8431827e9cd 100644 --- a/lib/Lex/PPExpressions.cpp +++ b/lib/Lex/PPExpressions.cpp @@ -237,35 +237,32 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, PP.setCodeCompletionReached(); PP.LexNonComment(PeekTok); } - - // If this token's spelling is a pp-identifier, check to see if it is - // 'defined' or if it is a macro. Note that we check here because many - // keywords are pp-identifiers, so we can't check the kind. - if (IdentifierInfo *II = PeekTok.getIdentifierInfo()) { - // Handle "defined X" and "defined(X)". - if (II->isStr("defined")) - return EvaluateDefined(Result, PeekTok, DT, ValueLive, PP); - - // If this identifier isn't 'defined' or one of the special - // preprocessor keywords and it wasn't macro expanded, it turns - // into a simple 0, unless it is the C++ keyword "true", in which case it - // turns into "1". - if (ValueLive && - II->getTokenID() != tok::kw_true && - II->getTokenID() != tok::kw_false) - PP.Diag(PeekTok, diag::warn_pp_undef_identifier) << II; - Result.Val = II->getTokenID() == tok::kw_true; - Result.Val.setIsUnsigned(false); // "0" is signed intmax_t 0. - Result.setIdentifier(II); - Result.setRange(PeekTok.getLocation()); - DT.IncludedUndefinedIds = (II->getTokenID() != tok::kw_true && - II->getTokenID() != tok::kw_false); - PP.LexNonComment(PeekTok); - return false; - } switch (PeekTok.getKind()) { - default: // Non-value token. + default: + // If this token's spelling is a pp-identifier, check to see if it is + // 'defined' or if it is a macro. Note that we check here because many + // keywords are pp-identifiers, so we can't check the kind. 
+ if (IdentifierInfo *II = PeekTok.getIdentifierInfo()) { + // Handle "defined X" and "defined(X)". + if (II->isStr("defined")) + return EvaluateDefined(Result, PeekTok, DT, ValueLive, PP); + + if (!II->isCPlusPlusOperatorKeyword()) { + // If this identifier isn't 'defined' or one of the special + // preprocessor keywords and it wasn't macro expanded, it turns + // into a simple 0 + if (ValueLive) + PP.Diag(PeekTok, diag::warn_pp_undef_identifier) << II; + Result.Val = 0; + Result.Val.setIsUnsigned(false); // "0" is signed intmax_t 0. + Result.setIdentifier(II); + Result.setRange(PeekTok.getLocation()); + DT.IncludedUndefinedIds = true; + PP.LexNonComment(PeekTok); + return false; + } + } PP.Diag(PeekTok, diag::err_pp_expr_bad_token_start_expr); return true; case tok::eod: @@ -481,6 +478,14 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT, DT.State = DefinedTracker::DefinedMacro; return false; } + case tok::kw_true: + case tok::kw_false: + Result.Val = PeekTok.getKind() == tok::kw_true; + Result.Val.setIsUnsigned(false); // "0" is signed intmax_t 0. + Result.setIdentifier(PeekTok.getIdentifierInfo()); + Result.setRange(PeekTok.getLocation()); + PP.LexNonComment(PeekTok); + return false; // FIXME: Handle #assert } diff --git a/lib/Lex/PPMacroExpansion.cpp b/lib/Lex/PPMacroExpansion.cpp index 8af9a50cc204..3f8ede23da56 100644 --- a/lib/Lex/PPMacroExpansion.cpp +++ b/lib/Lex/PPMacroExpansion.cpp @@ -412,7 +412,7 @@ static bool isTrivialSingleTokenExpansion(const MacroInfo *MI, // If this is a function-like macro invocation, it's safe to trivially expand // as long as the identifier is not a macro argument. - return std::find(MI->arg_begin(), MI->arg_end(), II) == MI->arg_end(); + return std::find(MI->param_begin(), MI->param_end(), II) == MI->param_end(); } /// isNextPPTokenLParen - Determine whether the next preprocessor token to be @@ -492,7 +492,7 @@ bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier, // Preprocessor directives used inside macro arguments are not portable, and // this enables the warning. InMacroArgs = true; - Args = ReadFunctionLikeMacroArgs(Identifier, MI, ExpansionEnd); + Args = ReadMacroCallArgumentList(Identifier, MI, ExpansionEnd); // Finished parsing args. InMacroArgs = false; @@ -745,11 +745,11 @@ static bool GenerateNewArgTokens(Preprocessor &PP, /// token is the '(' of the macro, this method is invoked to read all of the /// actual arguments specified for the macro invocation. This returns null on /// error. -MacroArgs *Preprocessor::ReadFunctionLikeMacroArgs(Token &MacroName, +MacroArgs *Preprocessor::ReadMacroCallArgumentList(Token &MacroName, MacroInfo *MI, SourceLocation &MacroEnd) { // The number of fixed arguments to parse. - unsigned NumFixedArgsLeft = MI->getNumArgs(); + unsigned NumFixedArgsLeft = MI->getNumParams(); bool isVariadic = MI->isVariadic(); // Outer loop, while there are more arguments, keep reading them. @@ -889,7 +889,7 @@ MacroArgs *Preprocessor::ReadFunctionLikeMacroArgs(Token &MacroName, // Okay, we either found the r_paren. Check to see if we parsed too few // arguments. - unsigned MinArgsExpected = MI->getNumArgs(); + unsigned MinArgsExpected = MI->getNumParams(); // If this is not a variadic macro, and too many args were specified, emit // an error. 
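From the user's side, the PPExpressions changes above affect #if evaluation as sketched below; the macro names are hypothetical and the behavior shown merely restates what the hunks implement:

#if defined FOO || defined(BAR)  /* both forms go through EvaluateDefined */
#endif

#if SOME_UNDEFINED_NAME          /* a plain identifier evaluates to 0,
                                    and -Wundef fires when the
                                    subexpression is live */
#endif

#if true && !false               /* in C++, kw_true/kw_false now have
                                    dedicated cases yielding 1 and 0 */
#endif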
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp index 63f39524d12a..d1dc8e1c0010 100644 --- a/lib/Lex/Preprocessor.cpp +++ b/lib/Lex/Preprocessor.cpp @@ -712,14 +712,6 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) { II.setIsFutureCompatKeyword(false); } - // C++ 2.11p2: If this is an alternative representation of a C++ operator, - // then we act as if it is the actual operator and not the textual - // representation of it. - if (II.isCPlusPlusOperatorKeyword() && - !(getLangOpts().MSVCCompat && - getSourceManager().isInSystemHeader(Identifier.getLocation()))) - Identifier.setIdentifierInfo(nullptr); - // If this is an extension token, diagnose its use. // We avoid diagnosing tokens that originate from macro definitions. // FIXME: This warning is disabled in cases where it shouldn't be, diff --git a/lib/Lex/TokenLexer.cpp b/lib/Lex/TokenLexer.cpp index 049e046cece1..c2e49ba919a9 100644 --- a/lib/Lex/TokenLexer.cpp +++ b/lib/Lex/TokenLexer.cpp @@ -67,7 +67,7 @@ void TokenLexer::Init(Token &Tok, SourceLocation ELEnd, MacroInfo *MI, // If this is a function-like macro, expand the arguments and change // Tokens to point to the expanded tokens. - if (Macro->isFunctionLike() && Macro->getNumArgs()) + if (Macro->isFunctionLike() && Macro->getNumParams()) ExpandFunctionArguments(); // Mark the macro as currently disabled, so that it is not recursively @@ -122,7 +122,7 @@ bool TokenLexer::MaybeRemoveCommaBeforeVaArgs( SmallVectorImpl &ResultToks, bool HasPasteOperator, MacroInfo *Macro, unsigned MacroArgNo, Preprocessor &PP) { // Is the macro argument __VA_ARGS__? - if (!Macro->isVariadic() || MacroArgNo != Macro->getNumArgs()-1) + if (!Macro->isVariadic() || MacroArgNo != Macro->getNumParams()-1) return false; // In Microsoft-compatibility mode, a comma is removed in the expansion @@ -137,7 +137,7 @@ bool TokenLexer::MaybeRemoveCommaBeforeVaArgs( // with GNU extensions, it is removed regardless of named arguments. // Microsoft also appears to support this extension, unofficially. if (PP.getLangOpts().C99 && !PP.getLangOpts().GNUMode - && Macro->getNumArgs() < 2) + && Macro->getNumParams() < 2) return false; // Is a comma available to be removed? @@ -193,7 +193,7 @@ void TokenLexer::ExpandFunctionArguments() { NextTokGetsSpace = true; if (CurTok.isOneOf(tok::hash, tok::hashat)) { - int ArgNo = Macro->getArgumentNum(Tokens[i+1].getIdentifierInfo()); + int ArgNo = Macro->getParameterNum(Tokens[i+1].getIdentifierInfo()); assert(ArgNo != -1 && "Token following # is not an argument?"); SourceLocation ExpansionLocStart = @@ -237,7 +237,7 @@ void TokenLexer::ExpandFunctionArguments() { // Otherwise, if this is not an argument token, just add the token to the // output buffer. IdentifierInfo *II = CurTok.getIdentifierInfo(); - int ArgNo = II ? Macro->getArgumentNum(II) : -1; + int ArgNo = II ? Macro->getParameterNum(II) : -1; if (ArgNo == -1) { // This isn't an argument, just add it. ResultToks.push_back(CurTok); @@ -330,7 +330,7 @@ void TokenLexer::ExpandFunctionArguments() { // expansion. if (NonEmptyPasteBefore && ResultToks.size() >= 2 && ResultToks[ResultToks.size()-2].is(tok::comma) && - (unsigned)ArgNo == Macro->getNumArgs()-1 && + (unsigned)ArgNo == Macro->getNumParams()-1 && Macro->isVariadic()) { VaArgsPseudoPaste = true; // Remove the paste operator, report use of the extension. 
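The TokenLexer changes above are a mechanical getNumArgs-to-getNumParams rename; the code they touch in MaybeRemoveCommaBeforeVaArgs implements the usual comma-elision extension for variadic macros, illustrated by this sketch (the LOG macro is hypothetical):

#include <cstdio>

/* With the GNU '##' extension, the comma before __VA_ARGS__ is removed
   when no variadic arguments are supplied. */
#define LOG(fmt, ...) std::printf(fmt, ##__VA_ARGS__)

int main() {
  LOG("no varargs\n");   /* expands to std::printf("no varargs\n") */
  LOG("%d\n", 42);       /* expands to std::printf("%d\n", 42)     */
  return 0;
}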
diff --git a/lib/Parse/ParseObjc.cpp b/lib/Parse/ParseObjc.cpp index f7410b8a092a..01b1bf48e473 100644 --- a/lib/Parse/ParseObjc.cpp +++ b/lib/Parse/ParseObjc.cpp @@ -1007,6 +1007,10 @@ IdentifierInfo *Parser::ParseObjCSelectorPiece(SourceLocation &SelectorLoc) { switch (Tok.getKind()) { default: return nullptr; + case tok::colon: + // Empty selector piece uses the location of the ':'. + SelectorLoc = Tok.getLocation(); + return nullptr; case tok::ampamp: case tok::ampequal: case tok::amp: diff --git a/lib/Parse/ParseOpenMP.cpp b/lib/Parse/ParseOpenMP.cpp index 2e5e36242ed5..d9a088595ab7 100644 --- a/lib/Parse/ParseOpenMP.cpp +++ b/lib/Parse/ParseOpenMP.cpp @@ -1102,7 +1102,7 @@ bool Parser::ParseOpenMPSimpleVarList( /// simdlen-clause | threads-clause | simd-clause | num_teams-clause | /// thread_limit-clause | priority-clause | grainsize-clause | /// nogroup-clause | num_tasks-clause | hint-clause | to-clause | -/// from-clause | is_device_ptr-clause +/// from-clause | is_device_ptr-clause | task_reduction-clause /// OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, bool FirstClause) { @@ -1220,6 +1220,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: @@ -1585,7 +1586,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, BalancedDelimiterTracker LinearT(*this, tok::l_paren, tok::annot_pragma_openmp_end); // Handle reduction-identifier for reduction clause. - if (Kind == OMPC_reduction) { + if (Kind == OMPC_reduction || Kind == OMPC_task_reduction) { ColonProtectionRAIIObject ColonRAII(*this); if (getLangOpts().CPlusPlus) ParseOptionalCXXScopeSpecifier(Data.ReductionIdScopeSpec, @@ -1733,13 +1734,13 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, Diag(Tok, diag::warn_pragma_expected_colon) << "map type"; } - bool IsComma = - (Kind != OMPC_reduction && Kind != OMPC_depend && Kind != OMPC_map) || - (Kind == OMPC_reduction && !InvalidReductionId) || - (Kind == OMPC_map && Data.MapType != OMPC_MAP_unknown && - (!MapTypeModifierSpecified || - Data.MapTypeModifier == OMPC_MAP_always)) || - (Kind == OMPC_depend && Data.DepKind != OMPC_DEPEND_unknown); + bool IsComma = (Kind != OMPC_reduction && Kind != OMPC_task_reduction && + Kind != OMPC_depend && Kind != OMPC_map) || + (Kind == OMPC_reduction && !InvalidReductionId) || + (Kind == OMPC_map && Data.MapType != OMPC_MAP_unknown && + (!MapTypeModifierSpecified || + Data.MapTypeModifier == OMPC_MAP_always)) || + (Kind == OMPC_depend && Data.DepKind != OMPC_DEPEND_unknown); const bool MayHaveTail = (Kind == OMPC_linear || Kind == OMPC_aligned); while (IsComma || (Tok.isNot(tok::r_paren) && Tok.isNot(tok::colon) && Tok.isNot(tok::annot_pragma_openmp_end))) { @@ -1795,7 +1796,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, } /// \brief Parsing of OpenMP clause 'private', 'firstprivate', 'lastprivate', -/// 'shared', 'copyin', 'copyprivate', 'flush' or 'reduction'. +/// 'shared', 'copyin', 'copyprivate', 'flush', 'reduction' or 'task_reduction'. 
/// /// private-clause: /// 'private' '(' list ')' @@ -1811,6 +1812,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, /// 'aligned' '(' list [ ':' alignment ] ')' /// reduction-clause: /// 'reduction' '(' reduction-identifier ':' list ')' +/// task_reduction-clause: +/// 'task_reduction' '(' reduction-identifier ':' list ')' /// copyprivate-clause: /// 'copyprivate' '(' list ')' /// flush-clause: diff --git a/lib/Sema/DeclSpec.cpp b/lib/Sema/DeclSpec.cpp index a55cdcccee5d..e4e84fcec954 100644 --- a/lib/Sema/DeclSpec.cpp +++ b/lib/Sema/DeclSpec.cpp @@ -1082,8 +1082,10 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { !S.getLangOpts().ZVector) S.Diag(TSTLoc, diag::err_invalid_vector_double_decl_spec); } else if (TypeSpecType == TST_float) { - // vector float is unsupported for ZVector. - if (S.getLangOpts().ZVector) + // vector float is unsupported for ZVector unless we have the + // vector-enhancements facility 1 (ISA revision 12). + if (S.getLangOpts().ZVector && + !S.Context.getTargetInfo().hasFeature("arch12")) S.Diag(TSTLoc, diag::err_invalid_vector_float_decl_spec); } else if (TypeSpecWidth == TSW_long) { // vector long is unsupported for ZVector and deprecated for AltiVec. diff --git a/lib/Sema/Sema.cpp b/lib/Sema/Sema.cpp index dc9f977d41ac..6f0db6ce1c6a 100644 --- a/lib/Sema/Sema.cpp +++ b/lib/Sema/Sema.cpp @@ -850,7 +850,8 @@ void Sema::ActOnEndOfTranslationUnit() { emitAndClearUnusedLocalTypedefWarnings(); // Modules don't need any of the checking below. - TUScope = nullptr; + if (!PP.isIncrementalProcessingEnabled()) + TUScope = nullptr; return; } diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 8446601334ee..b2223b755061 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -760,6 +760,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if (CheckObjCString(TheCall->getArg(0))) return ExprError(); break; + case Builtin::BI__builtin_ms_va_start: case Builtin::BI__builtin_stdarg_start: case Builtin::BI__builtin_va_start: if (SemaBuiltinVAStart(BuiltinID, TheCall)) @@ -1739,9 +1740,11 @@ bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, case SystemZ::BI__builtin_s390_vfaezbs: case SystemZ::BI__builtin_s390_vfaezhs: case SystemZ::BI__builtin_s390_vfaezfs: i = 2; l = 0; u = 15; break; + case SystemZ::BI__builtin_s390_vfisb: case SystemZ::BI__builtin_s390_vfidb: return SemaBuiltinConstantArgRange(TheCall, 1, 0, 15) || SemaBuiltinConstantArgRange(TheCall, 2, 0, 15); + case SystemZ::BI__builtin_s390_vftcisb: case SystemZ::BI__builtin_s390_vftcidb: i = 1; l = 0; u = 4095; break; case SystemZ::BI__builtin_s390_vlbb: i = 1; l = 0; u = 15; break; case SystemZ::BI__builtin_s390_vpdi: i = 2; l = 0; u = 15; break; @@ -1758,6 +1761,11 @@ bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, case SystemZ::BI__builtin_s390_vstrczbs: case SystemZ::BI__builtin_s390_vstrczhs: case SystemZ::BI__builtin_s390_vstrczfs: i = 3; l = 0; u = 15; break; + case SystemZ::BI__builtin_s390_vmslg: i = 3; l = 0; u = 15; break; + case SystemZ::BI__builtin_s390_vfminsb: + case SystemZ::BI__builtin_s390_vfmaxsb: + case SystemZ::BI__builtin_s390_vfmindb: + case SystemZ::BI__builtin_s390_vfmaxdb: i = 2; l = 0; u = 15; break; } return SemaBuiltinConstantArgRange(TheCall, i, l, u); } @@ -2095,9 +2103,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { if (BuiltinID == X86::BI__builtin_cpu_supports) return SemaBuiltinCpuSupports(*this, TheCall); - if (BuiltinID == 
X86::BI__builtin_ms_va_start) - return SemaBuiltinVAStart(BuiltinID, TheCall); - // If the intrinsic has rounding or SAE make sure its valid. if (CheckX86BuiltinRoundingOrSAE(BuiltinID, TheCall)) return true; @@ -3622,24 +3627,25 @@ ExprResult Sema::CheckOSLogFormatStringArg(Expr *Arg) { static bool checkVAStartABI(Sema &S, unsigned BuiltinID, Expr *Fn) { const llvm::Triple &TT = S.Context.getTargetInfo().getTriple(); bool IsX64 = TT.getArch() == llvm::Triple::x86_64; + bool IsAArch64 = TT.getArch() == llvm::Triple::aarch64; bool IsWindows = TT.isOSWindows(); - bool IsMSVAStart = BuiltinID == X86::BI__builtin_ms_va_start; - if (IsX64) { + bool IsMSVAStart = BuiltinID == Builtin::BI__builtin_ms_va_start; + if (IsX64 || IsAArch64) { clang::CallingConv CC = CC_C; if (const FunctionDecl *FD = S.getCurFunctionDecl()) CC = FD->getType()->getAs()->getCallConv(); if (IsMSVAStart) { // Don't allow this in System V ABI functions. - if (CC == CC_X86_64SysV || (!IsWindows && CC != CC_X86_64Win64)) + if (CC == CC_X86_64SysV || (!IsWindows && CC != CC_Win64)) return S.Diag(Fn->getLocStart(), diag::err_ms_va_start_used_in_sysv_function); } else { - // On x86-64 Unix, don't allow this in Win64 ABI functions. + // On x86-64/AArch64 Unix, don't allow this in Win64 ABI functions. // On x64 Windows, don't allow this in System V ABI functions. // (Yes, that means there's no corresponding way to support variadic // System V ABI functions on Windows.) if ((IsWindows && CC == CC_X86_64SysV) || - (!IsWindows && CC == CC_X86_64Win64)) + (!IsWindows && CC == CC_Win64)) return S.Diag(Fn->getLocStart(), diag::err_va_start_used_in_wrong_abi_function) << !IsWindows; @@ -3648,7 +3654,7 @@ static bool checkVAStartABI(Sema &S, unsigned BuiltinID, Expr *Fn) { } if (IsMSVAStart) - return S.Diag(Fn->getLocStart(), diag::err_x86_builtin_64_only); + return S.Diag(Fn->getLocStart(), diag::err_builtin_x64_aarch64_only); return false; } diff --git a/lib/Sema/SemaCodeComplete.cpp b/lib/Sema/SemaCodeComplete.cpp index 83c3bd27596c..3a53f251b096 100644 --- a/lib/Sema/SemaCodeComplete.cpp +++ b/lib/Sema/SemaCodeComplete.cpp @@ -2738,7 +2738,7 @@ CodeCompletionResult::CreateCodeCompletionString(ASTContext &Ctx, // Format a function-like macro with placeholders for the arguments. Result.AddChunk(CodeCompletionString::CK_LeftParen); - MacroInfo::arg_iterator A = MI->arg_begin(), AEnd = MI->arg_end(); + MacroInfo::param_iterator A = MI->param_begin(), AEnd = MI->param_end(); // C99 variadic macros add __VA_ARGS__ at the end. Skip it. if (MI->isC99Varargs()) { @@ -2749,8 +2749,8 @@ CodeCompletionResult::CreateCodeCompletionString(ASTContext &Ctx, } } - for (MacroInfo::arg_iterator A = MI->arg_begin(); A != AEnd; ++A) { - if (A != MI->arg_begin()) + for (MacroInfo::param_iterator A = MI->param_begin(); A != AEnd; ++A) { + if (A != MI->param_begin()) Result.AddChunk(CodeCompletionString::CK_Comma); if (MI->isVariadic() && (A+1) == AEnd) { diff --git a/lib/Sema/SemaDeclAttr.cpp b/lib/Sema/SemaDeclAttr.cpp index 5fb79a6bf630..2a310bf41c70 100644 --- a/lib/Sema/SemaDeclAttr.cpp +++ b/lib/Sema/SemaDeclAttr.cpp @@ -4280,7 +4280,7 @@ bool Sema::CheckCallingConvAttr(const AttributeList &attr, CallingConv &CC, case AttributeList::AT_RegCall: CC = CC_X86RegCall; break; case AttributeList::AT_MSABI: CC = Context.getTargetInfo().getTriple().isOSWindows() ? CC_C : - CC_X86_64Win64; + CC_Win64; break; case AttributeList::AT_SysVABI: CC = Context.getTargetInfo().getTriple().isOSWindows() ? 
CC_X86_64SysV :
@@ -4679,6 +4689,16 @@ void Sema::AddNSConsumedAttr(SourceRange attrRange, Decl *D,
                  CFConsumedAttr(attrRange, Context, spellingIndex));
 }

+bool Sema::checkNSReturnsRetainedReturnType(SourceLocation loc,
+                                            QualType type) {
+  if (isValidSubjectOfNSReturnsRetainedAttribute(type))
+    return false;
+
+  Diag(loc, diag::warn_ns_attribute_wrong_return_type)
+      << "'ns_returns_retained'" << 0 << 0;
+  return true;
+}
+
 static void handleNSReturnsRetainedAttr(Sema &S, Decl *D,
                                         const AttributeList &Attr) {
   QualType returnType;
@@ -4700,6 +4710,8 @@ static void handleNSReturnsRetainedAttr(Sema &S, Decl *D,
           << Attr.getRange();
       return;
     }
+  } else if (Attr.isUsedAsTypeAttr()) {
+    return;
   } else {
     AttributeDeclKind ExpectedDeclKind;
     switch (Attr.getKind()) {
@@ -4743,6 +4755,9 @@ static void handleNSReturnsRetainedAttr(Sema &S, Decl *D,
   }

   if (!typeOK) {
+    if (Attr.isUsedAsTypeAttr())
+      return;
+
     if (isa<ParmVarDecl>(D)) {
       S.Diag(D->getLocStart(), diag::warn_ns_attribute_wrong_parameter_type)
           << Attr.getName() << /*pointer-to-CF*/2
@@ -6838,6 +6853,50 @@ static const AvailabilityAttr *getAttrForPlatform(ASTContext &Context,
   return nullptr;
 }

+/// Returns the diagnostic we should emit for \c D, and the declaration that
+/// originated it, or \c AR_Available if no diagnostic is needed.
+///
+/// \param D The declaration to check.
+/// \param Message If non-null, this will be populated with the message from
+/// the availability attribute that is selected.
+static std::pair<AvailabilityResult, const NamedDecl *>
+ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D, std::string *Message) {
+  AvailabilityResult Result = D->getAvailability(Message);
+
+  // For typedefs, if the typedef declaration appears available look
+  // to the underlying type to see if it is more restrictive.
+  while (const TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(D)) {
+    if (Result == AR_Available) {
+      if (const TagType *TT = TD->getUnderlyingType()->getAs<TagType>()) {
+        D = TT->getDecl();
+        Result = D->getAvailability(Message);
+        continue;
+      }
+    }
+    break;
+  }
+
+  // Forward class declarations get their attributes from their definition.
+  if (const ObjCInterfaceDecl *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) {
+    if (IDecl->getDefinition()) {
+      D = IDecl->getDefinition();
+      Result = D->getAvailability(Message);
+    }
+  }
+
+  if (const auto *ECD = dyn_cast<EnumConstantDecl>(D))
+    if (Result == AR_Available) {
+      const DeclContext *DC = ECD->getDeclContext();
+      if (const auto *TheEnumDecl = dyn_cast<EnumDecl>(DC)) {
+        Result = TheEnumDecl->getAvailability(Message);
+        D = TheEnumDecl;
+      }
+    }
+
+  return {Result, D};
+}
+
+
 /// \brief Whether we should emit a diagnostic for \c K and \c DeclVersion in
 /// the context of \c Ctx. For example, we should emit an unavailable diagnostic
 /// in a deprecated context, but not the other way around.
@@ -7205,24 +7264,24 @@ void Sema::redelayDiagnostics(DelayedDiagnosticPool &pool) {
   curPool->steal(pool);
 }

-void Sema::EmitAvailabilityWarning(AvailabilityResult AR,
-                                   const NamedDecl *ReferringDecl,
-                                   const NamedDecl *OffendingDecl,
-                                   StringRef Message, SourceLocation Loc,
-                                   const ObjCInterfaceDecl *UnknownObjCClass,
-                                   const ObjCPropertyDecl *ObjCProperty,
-                                   bool ObjCPropertyAccess) {
+static void EmitAvailabilityWarning(Sema &S, AvailabilityResult AR,
+                                    const NamedDecl *ReferringDecl,
+                                    const NamedDecl *OffendingDecl,
+                                    StringRef Message, SourceLocation Loc,
+                                    const ObjCInterfaceDecl *UnknownObjCClass,
+                                    const ObjCPropertyDecl *ObjCProperty,
+                                    bool ObjCPropertyAccess) {
   // Delay if we're currently parsing a declaration.
- if (DelayedDiagnostics.shouldDelayDiagnostics()) { - DelayedDiagnostics.add( + if (S.DelayedDiagnostics.shouldDelayDiagnostics()) { + S.DelayedDiagnostics.add( DelayedDiagnostic::makeAvailability( AR, Loc, ReferringDecl, OffendingDecl, UnknownObjCClass, ObjCProperty, Message, ObjCPropertyAccess)); return; } - Decl *Ctx = cast(getCurLexicalContext()); - DoEmitAvailabilityWarning(*this, AR, Ctx, ReferringDecl, OffendingDecl, + Decl *Ctx = cast(S.getCurLexicalContext()); + DoEmitAvailabilityWarning(S, AR, Ctx, ReferringDecl, OffendingDecl, Message, Loc, UnknownObjCClass, ObjCProperty, ObjCPropertyAccess); } @@ -7379,7 +7438,7 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability( AvailabilityResult Result; const NamedDecl *OffendingDecl; std::tie(Result, OffendingDecl) = - SemaRef.ShouldDiagnoseAvailabilityOfDecl(D, nullptr); + ShouldDiagnoseAvailabilityOfDecl(D, nullptr); if (Result != AR_Available) { // All other diagnostic kinds have already been handled in // DiagnoseAvailabilityOfDecl. @@ -7557,3 +7616,44 @@ void Sema::DiagnoseUnguardedAvailabilityViolations(Decl *D) { DiagnoseUnguardedAvailability(*this, D).IssueDiagnostics(Body); } + +void Sema::DiagnoseAvailabilityOfDecl(NamedDecl *D, SourceLocation Loc, + const ObjCInterfaceDecl *UnknownObjCClass, + bool ObjCPropertyAccess, + bool AvoidPartialAvailabilityChecks) { + std::string Message; + AvailabilityResult Result; + const NamedDecl* OffendingDecl; + // See if this declaration is unavailable, deprecated, or partial. + std::tie(Result, OffendingDecl) = ShouldDiagnoseAvailabilityOfDecl(D, &Message); + if (Result == AR_Available) + return; + + if (Result == AR_NotYetIntroduced) { + if (AvoidPartialAvailabilityChecks) + return; + + // We need to know the @available context in the current function to + // diagnose this use, let DiagnoseUnguardedAvailabilityViolations do that + // when we're done parsing the current function. + if (getCurFunctionOrMethodDecl()) { + getEnclosingFunction()->HasPotentialAvailabilityViolations = true; + return; + } else if (getCurBlock() || getCurLambda()) { + getCurFunction()->HasPotentialAvailabilityViolations = true; + return; + } + } + + const ObjCPropertyDecl *ObjCPDecl = nullptr; + if (const ObjCMethodDecl *MD = dyn_cast(D)) { + if (const ObjCPropertyDecl *PD = MD->findPropertyDecl()) { + AvailabilityResult PDeclResult = PD->getAvailability(nullptr); + if (PDeclResult == Result) + ObjCPDecl = PD; + } + } + + EmitAvailabilityWarning(*this, Result, D, OffendingDecl, Message, Loc, + UnknownObjCClass, ObjCPDecl, ObjCPropertyAccess); +} diff --git a/lib/Sema/SemaDeclObjC.cpp b/lib/Sema/SemaDeclObjC.cpp index 778b8062f68c..967573011d0d 100644 --- a/lib/Sema/SemaDeclObjC.cpp +++ b/lib/Sema/SemaDeclObjC.cpp @@ -248,19 +248,41 @@ bool Sema::CheckARCMethodDecl(ObjCMethodDecl *method) { return false; } -static void DiagnoseObjCImplementedDeprecations(Sema &S, - NamedDecl *ND, - SourceLocation ImplLoc, - int select) { - if (ND && ND->isDeprecated()) { - S.Diag(ImplLoc, diag::warn_deprecated_def) << select; - if (select == 0) +static void DiagnoseObjCImplementedDeprecations(Sema &S, const NamedDecl *ND, + SourceLocation ImplLoc) { + if (!ND) + return; + bool IsCategory = false; + AvailabilityResult Availability = ND->getAvailability(); + if (Availability != AR_Deprecated) { + if (isa(ND)) { + if (Availability != AR_Unavailable) + return; + // Warn about implementing unavailable methods. 
+      S.Diag(ImplLoc, diag::warn_unavailable_def);
       S.Diag(ND->getLocation(), diag::note_method_declared_at)
-        << ND->getDeclName();
-    else
-      S.Diag(ND->getLocation(), diag::note_previous_decl)
-        << (isa<ObjCCategoryDecl>(ND) ? "category" : "class");
+          << ND->getDeclName();
+      return;
+    }
+    if (const auto *CD = dyn_cast<ObjCCategoryDecl>(ND)) {
+      if (!CD->getClassInterface()->isDeprecated())
+        return;
+      ND = CD->getClassInterface();
+      IsCategory = true;
+    } else
+      return;
   }
+  S.Diag(ImplLoc, diag::warn_deprecated_def)
+      << (isa<ObjCMethodDecl>(ND)
+              ? /*Method*/ 0
+              : isa<ObjCCategoryDecl>(ND) || IsCategory ? /*Category*/ 2
+                                                        : /*Class*/ 1);
+  if (isa<ObjCMethodDecl>(ND))
+    S.Diag(ND->getLocation(), diag::note_method_declared_at)
+        << ND->getDeclName();
+  else
+    S.Diag(ND->getLocation(), diag::note_previous_decl)
+        << (isa<ObjCCategoryDecl>(ND) ? "category" : "class");
 }

 /// AddAnyMethodToGlobalPool - Add any method, instance or factory to global
@@ -385,9 +407,7 @@ void Sema::ActOnStartOfObjCMethodDef(Scope *FnBodyScope, Decl *D) {
     // No need to issue deprecated warning if deprecated method in class/category
     // is being implemented in its own implementation (no overriding is involved).
     if (!ImplDeclOfMethodDecl || ImplDeclOfMethodDecl != ImplDeclOfMethodDef)
-      DiagnoseObjCImplementedDeprecations(*this,
-                                          dyn_cast<NamedDecl>(IMD),
-                                          MDecl->getLocation(), 0);
+      DiagnoseObjCImplementedDeprecations(*this, IMD, MDecl->getLocation());
   }

   if (MDecl->getMethodFamily() == OMF_init) {
@@ -1873,10 +1893,8 @@ Decl *Sema::ActOnStartCategoryImplementation(
     CatIDecl->setImplementation(CDecl);
     // Warn on implementing category of deprecated class under
     // -Wdeprecated-implementations flag.
-    DiagnoseObjCImplementedDeprecations(
-        *this,
-        CatIDecl->isDeprecated() ? CatIDecl : dyn_cast<NamedDecl>(IDecl),
-        CDecl->getLocation(), 2);
+    DiagnoseObjCImplementedDeprecations(*this, CatIDecl,
+                                        CDecl->getLocation());
   }
 }

@@ -1996,9 +2014,7 @@ Decl *Sema::ActOnStartClassImplementation(
     PushOnScopeChains(IMPDecl, TUScope);
     // Warn on implementing deprecated class under
     // -Wdeprecated-implementations flag.
-    DiagnoseObjCImplementedDeprecations(*this,
-                                        dyn_cast<NamedDecl>(IDecl),
-                                        IMPDecl->getLocation(), 1);
+    DiagnoseObjCImplementedDeprecations(*this, IDecl, IMPDecl->getLocation());
   }

   // If the superclass has the objc_runtime_visible attribute, we
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 8016bf99889f..ead80b61586a 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -87,82 +87,6 @@ static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) {
   }
 }

-std::pair<AvailabilityResult, const NamedDecl *>
-Sema::ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D,
-                                       std::string *Message) {
-  AvailabilityResult Result = D->getAvailability(Message);
-
-  // For typedefs, if the typedef declaration appears available look
-  // to the underlying type to see if it is more restrictive.
-  while (const TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(D)) {
-    if (Result == AR_Available) {
-      if (const TagType *TT = TD->getUnderlyingType()->getAs<TagType>()) {
-        D = TT->getDecl();
-        Result = D->getAvailability(Message);
-        continue;
-      }
-    }
-    break;
-  }
-
-  // Forward class declarations get their attributes from their definition.
-  if (const ObjCInterfaceDecl *IDecl = dyn_cast<ObjCInterfaceDecl>(D)) {
-    if (IDecl->getDefinition()) {
-      D = IDecl->getDefinition();
-      Result = D->getAvailability(Message);
-    }
-  }
-
-  if (const auto *ECD = dyn_cast<EnumConstantDecl>(D))
-    if (Result == AR_Available) {
-      const DeclContext *DC = ECD->getDeclContext();
-      if (const auto *TheEnumDecl = dyn_cast<EnumDecl>(DC)) {
-        Result = TheEnumDecl->getAvailability(Message);
-        D = TheEnumDecl;
-      }
-    }
-
-  return {Result, D};
-}
-
-static void
-DiagnoseAvailabilityOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc,
-                           const ObjCInterfaceDecl *UnknownObjCClass,
-                           bool ObjCPropertyAccess,
-                           bool AvoidPartialAvailabilityChecks = false) {
-  std::string Message;
-  AvailabilityResult Result;
-  const NamedDecl* OffendingDecl;
-  // See if this declaration is unavailable, deprecated, or partial.
-  std::tie(Result, OffendingDecl) = S.ShouldDiagnoseAvailabilityOfDecl(D, &Message);
-  if (Result == AR_Available)
-    return;
-
-  if (Result == AR_NotYetIntroduced) {
-    if (AvoidPartialAvailabilityChecks)
-      return;
-    if (S.getCurFunctionOrMethodDecl()) {
-      S.getEnclosingFunction()->HasPotentialAvailabilityViolations = true;
-      return;
-    } else if (S.getCurBlock() || S.getCurLambda()) {
-      S.getCurFunction()->HasPotentialAvailabilityViolations = true;
-      return;
-    }
-  }
-
-  const ObjCPropertyDecl *ObjCPDecl = nullptr;
-  if (const ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) {
-    if (const ObjCPropertyDecl *PD = MD->findPropertyDecl()) {
-      AvailabilityResult PDeclResult = PD->getAvailability(nullptr);
-      if (PDeclResult == Result)
-        ObjCPDecl = PD;
-    }
-  }
-
-  S.EmitAvailabilityWarning(Result, D, OffendingDecl, Message, Loc,
-                            UnknownObjCClass, ObjCPDecl, ObjCPropertyAccess);
-}
-
 /// \brief Emit a note explaining that this function is deleted.
 void Sema::NoteDeletedFunction(FunctionDecl *Decl) {
   assert(Decl->isDeleted());
@@ -363,8 +287,7 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, SourceLocation Loc,
       return true;
   }

-  DiagnoseAvailabilityOfDecl(*this, D, Loc, UnknownObjCClass,
-                             ObjCPropertyAccess,
+  DiagnoseAvailabilityOfDecl(D, Loc, UnknownObjCClass, ObjCPropertyAccess,
                              AvoidPartialAvailabilityChecks);

   DiagnoseUnusedOfDecl(*this, D, Loc);
diff --git a/lib/Sema/SemaObjCProperty.cpp b/lib/Sema/SemaObjCProperty.cpp
index 62a771bcffa0..e1e85dfd5e55 100644
--- a/lib/Sema/SemaObjCProperty.cpp
+++ b/lib/Sema/SemaObjCProperty.cpp
@@ -814,53 +814,185 @@ static void setImpliedPropertyAttributeForReadOnlyProperty(
   property->setPropertyAttributes(ObjCPropertyDecl::OBJC_PR_weak);
 }

-/// DiagnosePropertyMismatchDeclInProtocols - diagnose properties declared
-/// in inherited protocols with mismatched types. Since any of them can
-/// be candidate for synthesis.
-static void
-DiagnosePropertyMismatchDeclInProtocols(Sema &S, SourceLocation AtLoc,
+static bool
+isIncompatiblePropertyAttribute(unsigned Attr1, unsigned Attr2,
+                                ObjCPropertyDecl::PropertyAttributeKind Kind) {
+  return (Attr1 & Kind) != (Attr2 & Kind);
+}
+
+static bool areIncompatiblePropertyAttributes(unsigned Attr1, unsigned Attr2,
+                                              unsigned Kinds) {
+  return ((Attr1 & Kinds) != 0) != ((Attr2 & Kinds) != 0);
+}
+
+/// SelectPropertyForSynthesisFromProtocols - Finds the most appropriate
+/// property declaration that should be synthesised in all of the inherited
+/// protocols. It also diagnoses properties declared in inherited protocols
+/// with mismatched types or attributes, since any of them can be a candidate
+/// for synthesis.
+static ObjCPropertyDecl *
+SelectPropertyForSynthesisFromProtocols(Sema &S, SourceLocation AtLoc,
                                         ObjCInterfaceDecl *ClassDecl,
                                         ObjCPropertyDecl *Property) {
-  ObjCInterfaceDecl::ProtocolPropertyMap PropMap;
+  assert(isa<ObjCProtocolDecl>(Property->getDeclContext()) &&
+         "Expected a property from a protocol");
+  ObjCInterfaceDecl::ProtocolPropertySet ProtocolSet;
+  ObjCInterfaceDecl::PropertyDeclOrder Properties;
   for (const auto *PI : ClassDecl->all_referenced_protocols()) {
     if (const ObjCProtocolDecl *PDecl = PI->getDefinition())
-      PDecl->collectInheritedProtocolProperties(Property, PropMap);
+      PDecl->collectInheritedProtocolProperties(Property, ProtocolSet,
+                                                Properties);
   }
-  if (ObjCInterfaceDecl *SDecl = ClassDecl->getSuperClass())
+  if (ObjCInterfaceDecl *SDecl = ClassDecl->getSuperClass()) {
     while (SDecl) {
       for (const auto *PI : SDecl->all_referenced_protocols()) {
         if (const ObjCProtocolDecl *PDecl = PI->getDefinition())
-          PDecl->collectInheritedProtocolProperties(Property, PropMap);
+          PDecl->collectInheritedProtocolProperties(Property, ProtocolSet,
+                                                    Properties);
       }
       SDecl = SDecl->getSuperClass();
     }
-
-  if (PropMap.empty())
-    return;
-
+  }
+
+  if (Properties.empty())
+    return Property;
+
+  ObjCPropertyDecl *OriginalProperty = Property;
+  size_t SelectedIndex = 0;
+  for (const auto &Prop : llvm::enumerate(Properties)) {
+    // Select the 'readwrite' property if such property exists.
+    if (Property->isReadOnly() && !Prop.value()->isReadOnly()) {
+      Property = Prop.value();
+      SelectedIndex = Prop.index();
+    }
+  }
+  if (Property != OriginalProperty) {
+    // Check that the old property is compatible with the new one.
+    Properties[SelectedIndex] = OriginalProperty;
+  }
+
   QualType RHSType = S.Context.getCanonicalType(Property->getType());
-  bool FirsTime = true;
-  for (ObjCInterfaceDecl::ProtocolPropertyMap::iterator
-       I = PropMap.begin(), E = PropMap.end(); I != E; I++) {
-    ObjCPropertyDecl *Prop = I->second;
+  unsigned OriginalAttributes = Property->getPropertyAttributes();
+  enum MismatchKind {
+    IncompatibleType = 0,
+    HasNoExpectedAttribute,
+    HasUnexpectedAttribute,
+    DifferentGetter,
+    DifferentSetter
+  };
+  // Represents a property from another protocol that conflicts with the
+  // selected declaration.
+  struct MismatchingProperty {
+    const ObjCPropertyDecl *Prop;
+    MismatchKind Kind;
+    StringRef AttributeName;
+  };
+  SmallVector<MismatchingProperty, 4> Mismatches;
+  for (ObjCPropertyDecl *Prop : Properties) {
+    // Verify the property attributes.
+    unsigned Attr = Prop->getPropertyAttributes();
+    if (Attr != OriginalAttributes) {
+      auto Diag = [&](bool OriginalHasAttribute, StringRef AttributeName) {
+        MismatchKind Kind = OriginalHasAttribute ? HasNoExpectedAttribute
+                                                 : HasUnexpectedAttribute;
+        Mismatches.push_back({Prop, Kind, AttributeName});
+      };
+      if (isIncompatiblePropertyAttribute(OriginalAttributes, Attr,
+                                          ObjCPropertyDecl::OBJC_PR_copy)) {
+        Diag(OriginalAttributes & ObjCPropertyDecl::OBJC_PR_copy, "copy");
+        continue;
+      }
+      if (areIncompatiblePropertyAttributes(
+              OriginalAttributes, Attr, ObjCPropertyDecl::OBJC_PR_retain |
+                                            ObjCPropertyDecl::OBJC_PR_strong)) {
+        Diag(OriginalAttributes & (ObjCPropertyDecl::OBJC_PR_retain |
+                                   ObjCPropertyDecl::OBJC_PR_strong),
+             "retain (or strong)");
+        continue;
+      }
+      if (isIncompatiblePropertyAttribute(OriginalAttributes, Attr,
+                                          ObjCPropertyDecl::OBJC_PR_atomic)) {
+        Diag(OriginalAttributes & ObjCPropertyDecl::OBJC_PR_atomic, "atomic");
+        continue;
+      }
+    }
+    if (Property->getGetterName() != Prop->getGetterName()) {
+      Mismatches.push_back({Prop, DifferentGetter, ""});
+      continue;
+    }
+    if (!Property->isReadOnly() && !Prop->isReadOnly() &&
+        Property->getSetterName() != Prop->getSetterName()) {
+      Mismatches.push_back({Prop, DifferentSetter, ""});
+      continue;
+    }
     QualType LHSType = S.Context.getCanonicalType(Prop->getType());
     if (!S.Context.propertyTypesAreCompatible(LHSType, RHSType)) {
       bool IncompatibleObjC = false;
       QualType ConvertedType;
       if (!S.isObjCPointerConversion(RHSType, LHSType, ConvertedType, IncompatibleObjC)
           || IncompatibleObjC) {
-        if (FirsTime) {
-          S.Diag(Property->getLocation(), diag::warn_protocol_property_mismatch)
-            << Property->getType();
-          FirsTime = false;
-        }
-        S.Diag(Prop->getLocation(), diag::note_protocol_property_declare)
-          << Prop->getType();
+        Mismatches.push_back({Prop, IncompatibleType, ""});
+        continue;
       }
     }
   }
-  if (!FirsTime && AtLoc.isValid())
+
+  if (Mismatches.empty())
+    return Property;
+
+  // Diagnose incompatibility.
+  {
+    bool HasIncompatibleAttributes = false;
+    for (const auto &Note : Mismatches)
+      HasIncompatibleAttributes =
+          Note.Kind != IncompatibleType ? true : HasIncompatibleAttributes;
+    // Promote the warning to an error if there are incompatible attributes or
+    // incompatible types together with readwrite/readonly incompatibility.
+    auto Diag = S.Diag(Property->getLocation(),
+                       Property != OriginalProperty || HasIncompatibleAttributes
+                           ? diag::err_protocol_property_mismatch
+                           : diag::warn_protocol_property_mismatch);
+    Diag << Mismatches[0].Kind;
+    switch (Mismatches[0].Kind) {
+    case IncompatibleType:
+      Diag << Property->getType();
+      break;
+    case HasNoExpectedAttribute:
+    case HasUnexpectedAttribute:
+      Diag << Mismatches[0].AttributeName;
+      break;
+    case DifferentGetter:
+      Diag << Property->getGetterName();
+      break;
+    case DifferentSetter:
+      Diag << Property->getSetterName();
+      break;
+    }
+  }
+  for (const auto &Note : Mismatches) {
+    auto Diag =
+        S.Diag(Note.Prop->getLocation(), diag::note_protocol_property_declare)
+        << Note.Kind;
+    switch (Note.Kind) {
+    case IncompatibleType:
+      Diag << Note.Prop->getType();
+      break;
+    case HasNoExpectedAttribute:
+    case HasUnexpectedAttribute:
+      Diag << Note.AttributeName;
+      break;
+    case DifferentGetter:
+      Diag << Note.Prop->getGetterName();
+      break;
+    case DifferentSetter:
+      Diag << Note.Prop->getSetterName();
+      break;
+    }
+  }
+  if (AtLoc.isValid())
     S.Diag(AtLoc, diag::note_property_synthesize);
+
+  return Property;
 }

 /// Determine whether any storage attributes were written on the property.
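The two attribute helpers this hunk introduces differ subtly: isIncompatiblePropertyAttribute compares one attribute bit exactly, while areIncompatiblePropertyAttributes only asks whether any bit of a mask is set on each side, which is why 'retain' on one protocol and 'strong' on another are treated as compatible. A standalone sketch of that distinction; the enum values below are illustrative stand-ins, not clang's real OBJC_PR_* constants:

    #include <cassert>

    enum PropAttr : unsigned {
      PR_copy   = 1u << 0,
      PR_retain = 1u << 1,
      PR_strong = 1u << 2,
    };

    // Exact-bit comparison: mismatch if exactly one side has the bit.
    static bool isIncompatible(unsigned A1, unsigned A2, unsigned Kind) {
      return (A1 & Kind) != (A2 & Kind);
    }

    // Any-of-mask comparison: 'retain' vs 'strong' still agree, because
    // both sides have *some* bit of the combined mask set.
    static bool areIncompatible(unsigned A1, unsigned A2, unsigned Kinds) {
      return ((A1 & Kinds) != 0) != ((A2 & Kinds) != 0);
    }

    int main() {
      assert(isIncompatible(PR_copy, 0, PR_copy));                        // copy vs no-copy
      assert(!areIncompatible(PR_retain, PR_strong, PR_retain | PR_strong)); // compatible
      assert(areIncompatible(PR_retain, 0, PR_retain | PR_strong));       // mismatch
      return 0;
    }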
@@ -996,8 +1128,9 @@ Decl *Sema::ActOnPropertyImplDecl(Scope *S, } } if (Synthesize && isa(property->getDeclContext())) - DiagnosePropertyMismatchDeclInProtocols(*this, AtLoc, IDecl, property); - + property = SelectPropertyForSynthesisFromProtocols(*this, AtLoc, IDecl, + property); + } else if ((CatImplClass = dyn_cast(ClassImpDecl))) { if (Synthesize) { Diag(AtLoc, diag::err_synthesize_category_decl); diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp index 1e0b6c158348..01f574b6aeeb 100644 --- a/lib/Sema/SemaOpenMP.cpp +++ b/lib/Sema/SemaOpenMP.cpp @@ -1807,6 +1807,8 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".lb.", KmpUInt64Ty), std::make_pair(".ub.", KmpUInt64Ty), std::make_pair(".st.", KmpInt64Ty), std::make_pair(".liter.", KmpInt32Ty), + std::make_pair(".reductions.", + Context.VoidPtrTy.withConst().withRestrict()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, @@ -2498,9 +2500,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPTaskwaitDirective(StartLoc, EndLoc); break; case OMPD_taskgroup: - assert(ClausesWithImplicit.empty() && - "No clauses are allowed for 'omp taskgroup' directive"); - Res = ActOnOpenMPTaskgroupDirective(AStmt, StartLoc, EndLoc); + Res = ActOnOpenMPTaskgroupDirective(ClausesWithImplicit, AStmt, StartLoc, + EndLoc); break; case OMPD_flush: assert(AStmt == nullptr && @@ -5067,7 +5068,8 @@ StmtResult Sema::ActOnOpenMPTaskwaitDirective(SourceLocation StartLoc, return OMPTaskwaitDirective::Create(Context, StartLoc, EndLoc); } -StmtResult Sema::ActOnOpenMPTaskgroupDirective(Stmt *AStmt, +StmtResult Sema::ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) { if (!AStmt) @@ -5077,7 +5079,8 @@ StmtResult Sema::ActOnOpenMPTaskgroupDirective(Stmt *AStmt, getCurFunction()->setHasBranchProtectedScope(); - return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, AStmt); + return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, Clauses, + AStmt); } StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef Clauses, @@ -6849,6 +6852,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: @@ -7152,6 +7156,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPC_firstprivate: case OMPC_lastprivate: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_default: case OMPC_proc_bind: @@ -7467,6 +7472,7 @@ OMPClause *Sema::ActOnOpenMPSimpleClause( case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: @@ -7624,6 +7630,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: @@ -7821,6 +7828,7 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, case OMPC_lastprivate: case OMPC_shared: case OMPC_reduction: + case OMPC_task_reduction: case OMPC_linear: case OMPC_aligned: case OMPC_copyin: @@ -7933,6 +7941,11 @@ OMPClause *Sema::ActOnOpenMPVarListClause( Res = ActOnOpenMPReductionClause(VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId); break; + case 
OMPC_task_reduction: + Res = ActOnOpenMPTaskReductionClause(VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc, ReductionIdScopeSpec, + ReductionId); + break; case OMPC_linear: Res = ActOnOpenMPLinearClause(VarList, TailExpr, StartLoc, LParenLoc, LinKind, DepLinMapLoc, ColonLoc, EndLoc); @@ -8901,15 +8914,66 @@ buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range, return ExprEmpty(); } -OMPClause *Sema::ActOnOpenMPReductionClause( +namespace { +/// Data for the reduction-based clauses. +struct ReductionData { + /// List of original reduction items. + SmallVector Vars; + /// List of private copies of the reduction items. + SmallVector Privates; + /// LHS expressions for the reduction_op expressions. + SmallVector LHSs; + /// RHS expressions for the reduction_op expressions. + SmallVector RHSs; + /// Reduction operation expression. + SmallVector ReductionOps; + /// List of captures for clause. + SmallVector ExprCaptures; + /// List of postupdate expressions. + SmallVector ExprPostUpdates; + ReductionData() = delete; + /// Reserves required memory for the reduction data. + ReductionData(unsigned Size) { + Vars.reserve(Size); + Privates.reserve(Size); + LHSs.reserve(Size); + RHSs.reserve(Size); + ReductionOps.reserve(Size); + ExprCaptures.reserve(Size); + ExprPostUpdates.reserve(Size); + } + /// Stores reduction item and reduction operation only (required for dependent + /// reduction item). + void push(Expr *Item, Expr *ReductionOp) { + Vars.emplace_back(Item); + Privates.emplace_back(nullptr); + LHSs.emplace_back(nullptr); + RHSs.emplace_back(nullptr); + ReductionOps.emplace_back(ReductionOp); + } + /// Stores reduction data. + void push(Expr *Item, Expr *Private, Expr *LHS, Expr *RHS, + Expr *ReductionOp) { + Vars.emplace_back(Item); + Privates.emplace_back(Private); + LHSs.emplace_back(LHS); + RHSs.emplace_back(RHS); + ReductionOps.emplace_back(ReductionOp); + } +}; +} // namespace + +static bool ActOnOMPReductionKindClause( + Sema &S, DSAStackTy *Stack, OpenMPClauseKind ClauseKind, ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions) { + ArrayRef UnresolvedReductions, ReductionData &RD) { auto DN = ReductionId.getName(); auto OOK = DN.getCXXOverloadedOperator(); BinaryOperatorKind BOK = BO_Comma; + ASTContext &Context = S.Context; // OpenMP [2.14.3.6, reduction clause] // C // reduction-identifier is either an identifier or one of the following @@ -8993,13 +9057,6 @@ OMPClause *Sema::ActOnOpenMPReductionClause( ReductionIdRange.setBegin(ReductionIdScopeSpec.getBeginLoc()); ReductionIdRange.setEnd(ReductionId.getEndLoc()); - SmallVector Vars; - SmallVector Privates; - SmallVector LHSs; - SmallVector RHSs; - SmallVector ReductionOps; - SmallVector ExprCaptures; - SmallVector ExprPostUpdates; auto IR = UnresolvedReductions.begin(), ER = UnresolvedReductions.end(); bool FirstIter = true; for (auto RefExpr : VarList) { @@ -9017,27 +9074,23 @@ OMPClause *Sema::ActOnOpenMPReductionClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(S, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) { - // It will be analyzed later. 
- Vars.push_back(RefExpr); - Privates.push_back(nullptr); - LHSs.push_back(nullptr); - RHSs.push_back(nullptr); // Try to find 'declare reduction' corresponding construct before using // builtin/overloaded operators. QualType Type = Context.DependentTy; CXXCastPath BasePath; ExprResult DeclareReductionRef = buildDeclareReductionRef( - *this, ELoc, ERange, DSAStack->getCurScope(), ReductionIdScopeSpec, + S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec, ReductionId, Type, BasePath, IR == ER ? nullptr : *IR); - if (CurContext->isDependentContext() && + Expr *ReductionOp = nullptr; + if (S.CurContext->isDependentContext() && (DeclareReductionRef.isUnset() || isa(DeclareReductionRef.get()))) - ReductionOps.push_back(DeclareReductionRef.get()); - else - ReductionOps.push_back(nullptr); + ReductionOp = DeclareReductionRef.get(); + // It will be analyzed later. + RD.push(RefExpr, ReductionOp); } ValueDecl *D = Res.first; if (!D) @@ -9062,21 +9115,19 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // OpenMP [2.9.3.3, Restrictions, C/C++, p.3] // A variable that appears in a private clause must not have an incomplete // type or a reference type. - if (RequireCompleteType(ELoc, Type, - diag::err_omp_reduction_incomplete_type)) + if (S.RequireCompleteType(ELoc, Type, + diag::err_omp_reduction_incomplete_type)) continue; // OpenMP [2.14.3.6, reduction clause, Restrictions] // A list item that appears in a reduction clause must not be // const-qualified. if (Type.getNonReferenceType().isConstant(Context)) { - Diag(ELoc, diag::err_omp_const_reduction_list_item) - << getOpenMPClauseName(OMPC_reduction) << Type << ERange; + S.Diag(ELoc, diag::err_omp_const_reduction_list_item) << ERange; if (!ASE && !OASE) { - bool IsDecl = !VD || - VD->isThisDeclarationADefinition(Context) == - VarDecl::DeclarationOnly; - Diag(D->getLocation(), - IsDecl ? diag::note_previous_decl : diag::note_defined_here) + bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + VarDecl::DeclarationOnly; + S.Diag(D->getLocation(), + IsDecl ? diag::note_previous_decl : diag::note_defined_here) << D; } continue; @@ -9087,10 +9138,11 @@ OMPClause *Sema::ActOnOpenMPReductionClause( if (!ASE && !OASE && VD) { VarDecl *VDDef = VD->getDefinition(); if (VD->getType()->isReferenceType() && VDDef && VDDef->hasInit()) { - DSARefChecker Check(DSAStack); + DSARefChecker Check(Stack); if (Check.Visit(VDDef->getInit())) { - Diag(ELoc, diag::err_omp_reduction_ref_type_arg) << ERange; - Diag(VDDef->getLocation(), diag::note_defined_here) << VDDef; + S.Diag(ELoc, diag::err_omp_reduction_ref_type_arg) + << getOpenMPClauseName(ClauseKind) << ERange; + S.Diag(VDDef->getLocation(), diag::note_defined_here) << VDDef; continue; } } @@ -9108,17 +9160,17 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // but a list item can appear only once in the reduction clauses for that // directive. 
DSAStackTy::DSAVarData DVar; - DVar = DSAStack->getTopDSA(D, false); + DVar = Stack->getTopDSA(D, false); if (DVar.CKind == OMPC_reduction) { - Diag(ELoc, diag::err_omp_once_referenced) - << getOpenMPClauseName(OMPC_reduction); + S.Diag(ELoc, diag::err_omp_once_referenced) + << getOpenMPClauseName(ClauseKind); if (DVar.RefExpr) - Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_referenced); + S.Diag(DVar.RefExpr->getExprLoc(), diag::note_omp_referenced); } else if (DVar.CKind != OMPC_unknown) { - Diag(ELoc, diag::err_omp_wrong_dsa) + S.Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_reduction); - ReportOriginalDSA(*this, DSAStack, D, DVar); + ReportOriginalDSA(S, Stack, D, DVar); continue; } @@ -9126,16 +9178,16 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // A list item that appears in a reduction clause of a worksharing // construct must be shared in the parallel regions to which any of the // worksharing regions arising from the worksharing construct bind. - OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective(); + OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective(); if (isOpenMPWorksharingDirective(CurrDir) && !isOpenMPParallelDirective(CurrDir) && !isOpenMPTeamsDirective(CurrDir)) { - DVar = DSAStack->getImplicitDSA(D, true); + DVar = Stack->getImplicitDSA(D, true); if (DVar.CKind != OMPC_shared) { - Diag(ELoc, diag::err_omp_required_access) + S.Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_reduction) << getOpenMPClauseName(OMPC_shared); - ReportOriginalDSA(*this, DSAStack, D, DVar); + ReportOriginalDSA(S, Stack, D, DVar); continue; } } @@ -9144,24 +9196,20 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // builtin/overloaded operators. CXXCastPath BasePath; ExprResult DeclareReductionRef = buildDeclareReductionRef( - *this, ELoc, ERange, DSAStack->getCurScope(), ReductionIdScopeSpec, + S, ELoc, ERange, Stack->getCurScope(), ReductionIdScopeSpec, ReductionId, Type, BasePath, IR == ER ? nullptr : *IR); if (DeclareReductionRef.isInvalid()) continue; - if (CurContext->isDependentContext() && + if (S.CurContext->isDependentContext() && (DeclareReductionRef.isUnset() || isa(DeclareReductionRef.get()))) { - Vars.push_back(RefExpr); - Privates.push_back(nullptr); - LHSs.push_back(nullptr); - RHSs.push_back(nullptr); - ReductionOps.push_back(DeclareReductionRef.get()); + RD.push(RefExpr, DeclareReductionRef.get()); continue; } if (BOK == BO_Comma && DeclareReductionRef.isUnset()) { // Not allowed reduction identifier is found. - Diag(ReductionId.getLocStart(), - diag::err_omp_unknown_reduction_identifier) + S.Diag(ReductionId.getLocStart(), + diag::err_omp_unknown_reduction_identifier) << Type << ReductionIdRange; continue; } @@ -9177,28 +9225,27 @@ OMPClause *Sema::ActOnOpenMPReductionClause( if (DeclareReductionRef.isUnset()) { if ((BOK == BO_GT || BOK == BO_LT) && !(Type->isScalarType() || - (getLangOpts().CPlusPlus && Type->isArithmeticType()))) { - Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg) - << getLangOpts().CPlusPlus; + (S.getLangOpts().CPlusPlus && Type->isArithmeticType()))) { + S.Diag(ELoc, diag::err_omp_clause_not_arithmetic_type_arg) + << getOpenMPClauseName(ClauseKind) << S.getLangOpts().CPlusPlus; if (!ASE && !OASE) { - bool IsDecl = !VD || - VD->isThisDeclarationADefinition(Context) == - VarDecl::DeclarationOnly; - Diag(D->getLocation(), - IsDecl ? 
diag::note_previous_decl : diag::note_defined_here) + bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + VarDecl::DeclarationOnly; + S.Diag(D->getLocation(), + IsDecl ? diag::note_previous_decl : diag::note_defined_here) << D; } continue; } if ((BOK == BO_OrAssign || BOK == BO_AndAssign || BOK == BO_XorAssign) && - !getLangOpts().CPlusPlus && Type->isFloatingType()) { - Diag(ELoc, diag::err_omp_clause_floating_type_arg); + !S.getLangOpts().CPlusPlus && Type->isFloatingType()) { + S.Diag(ELoc, diag::err_omp_clause_floating_type_arg) + << getOpenMPClauseName(ClauseKind); if (!ASE && !OASE) { - bool IsDecl = !VD || - VD->isThisDeclarationADefinition(Context) == - VarDecl::DeclarationOnly; - Diag(D->getLocation(), - IsDecl ? diag::note_previous_decl : diag::note_defined_here) + bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + VarDecl::DeclarationOnly; + S.Diag(D->getLocation(), + IsDecl ? diag::note_previous_decl : diag::note_defined_here) << D; } continue; @@ -9206,9 +9253,9 @@ OMPClause *Sema::ActOnOpenMPReductionClause( } Type = Type.getNonLValueExprType(Context).getUnqualifiedType(); - auto *LHSVD = buildVarDecl(*this, ELoc, Type, ".reduction.lhs", + auto *LHSVD = buildVarDecl(S, ELoc, Type, ".reduction.lhs", D->hasAttrs() ? &D->getAttrs() : nullptr); - auto *RHSVD = buildVarDecl(*this, ELoc, Type, D->getName(), + auto *RHSVD = buildVarDecl(S, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr); auto PrivateTy = Type; if (OASE || @@ -9220,19 +9267,20 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // For array subscripts or single variables Private Ty is the same as Type // (type of the variable or single array element). PrivateTy = Context.getVariableArrayType( - Type, new (Context) OpaqueValueExpr(SourceLocation(), - Context.getSizeType(), VK_RValue), + Type, + new (Context) OpaqueValueExpr(SourceLocation(), Context.getSizeType(), + VK_RValue), ArrayType::Normal, /*IndexTypeQuals=*/0, SourceRange()); } else if (!ASE && !OASE && Context.getAsArrayType(D->getType().getNonReferenceType())) PrivateTy = D->getType().getNonReferenceType(); // Private copy. - auto *PrivateVD = buildVarDecl(*this, ELoc, PrivateTy, D->getName(), + auto *PrivateVD = buildVarDecl(S, ELoc, PrivateTy, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr); // Add initializer for private variable. Expr *Init = nullptr; - auto *LHSDRE = buildDeclRefExpr(*this, LHSVD, Type, ELoc); - auto *RHSDRE = buildDeclRefExpr(*this, RHSVD, Type, ELoc); + auto *LHSDRE = buildDeclRefExpr(S, LHSVD, Type, ELoc); + auto *RHSDRE = buildDeclRefExpr(S, RHSVD, Type, ELoc); if (DeclareReductionRef.isUsable()) { auto *DRDRef = DeclareReductionRef.getAs(); auto *DRD = cast(DRDRef->getDecl()); @@ -9249,13 +9297,13 @@ OMPClause *Sema::ActOnOpenMPReductionClause( case BO_LOr: // '+', '-', '^', '|', '||' reduction ops - initializer is '0'. if (Type->isScalarType() || Type->isAnyComplexType()) - Init = ActOnIntegerConstant(ELoc, /*Val=*/0).get(); + Init = S.ActOnIntegerConstant(ELoc, /*Val=*/0).get(); break; case BO_Mul: case BO_LAnd: if (Type->isScalarType() || Type->isAnyComplexType()) { // '*' and '&&' reduction ops - initializer is '1'. 
- Init = ActOnIntegerConstant(ELoc, /*Val=*/1).get(); + Init = S.ActOnIntegerConstant(ELoc, /*Val=*/1).get(); } break; case BO_And: { @@ -9278,7 +9326,7 @@ OMPClause *Sema::ActOnOpenMPReductionClause( if (Init && OrigType->isAnyComplexType()) { // Init = 0xFFFF + 0xFFFFi; auto *Im = new (Context) ImaginaryLiteral(Init, OrigType); - Init = CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get(); + Init = S.CreateBuiltinBinOp(ELoc, BO_Add, Init, Im).get(); } Type = OrigType; break; @@ -9295,15 +9343,14 @@ OMPClause *Sema::ActOnOpenMPReductionClause( QualType IntTy = Context.getIntTypeForBitwidth(Size, /*Signed=*/IsSigned); llvm::APInt InitValue = - (BOK != BO_LT) - ? IsSigned ? llvm::APInt::getSignedMinValue(Size) - : llvm::APInt::getMinValue(Size) - : IsSigned ? llvm::APInt::getSignedMaxValue(Size) - : llvm::APInt::getMaxValue(Size); + (BOK != BO_LT) ? IsSigned ? llvm::APInt::getSignedMinValue(Size) + : llvm::APInt::getMinValue(Size) + : IsSigned ? llvm::APInt::getSignedMaxValue(Size) + : llvm::APInt::getMaxValue(Size); Init = IntegerLiteral::Create(Context, InitValue, IntTy, ELoc); if (Type->isPointerType()) { // Cast to pointer type. - auto CastExpr = BuildCStyleCastExpr( + auto CastExpr = S.BuildCStyleCastExpr( SourceLocation(), Context.getTrivialTypeSourceInfo(Type, ELoc), SourceLocation(), Init); if (CastExpr.isInvalid()) @@ -9344,20 +9391,19 @@ OMPClause *Sema::ActOnOpenMPReductionClause( llvm_unreachable("Unexpected reduction operation"); } } - if (Init && DeclareReductionRef.isUnset()) { - AddInitializerToDecl(RHSVD, Init, /*DirectInit=*/false); - } else if (!Init) - ActOnUninitializedDecl(RHSVD); + if (Init && DeclareReductionRef.isUnset()) + S.AddInitializerToDecl(RHSVD, Init, /*DirectInit=*/false); + else if (!Init) + S.ActOnUninitializedDecl(RHSVD); if (RHSVD->isInvalidDecl()) continue; if (!RHSVD->hasInit() && DeclareReductionRef.isUnset()) { - Diag(ELoc, diag::err_omp_reduction_id_not_compatible) << Type - << ReductionIdRange; - bool IsDecl = - !VD || - VD->isThisDeclarationADefinition(Context) == VarDecl::DeclarationOnly; - Diag(D->getLocation(), - IsDecl ? diag::note_previous_decl : diag::note_defined_here) + S.Diag(ELoc, diag::err_omp_reduction_id_not_compatible) + << Type << ReductionIdRange; + bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + VarDecl::DeclarationOnly; + S.Diag(D->getLocation(), + IsDecl ? diag::note_previous_decl : diag::note_defined_here) << D; continue; } @@ -9365,16 +9411,16 @@ OMPClause *Sema::ActOnOpenMPReductionClause( // codegen. 
PrivateVD->setInit(RHSVD->getInit()); PrivateVD->setInitStyle(RHSVD->getInitStyle()); - auto *PrivateDRE = buildDeclRefExpr(*this, PrivateVD, PrivateTy, ELoc); + auto *PrivateDRE = buildDeclRefExpr(S, PrivateVD, PrivateTy, ELoc); ExprResult ReductionOp; if (DeclareReductionRef.isUsable()) { QualType RedTy = DeclareReductionRef.get()->getType(); QualType PtrRedTy = Context.getPointerType(RedTy); - ExprResult LHS = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, LHSDRE); - ExprResult RHS = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, RHSDRE); + ExprResult LHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, LHSDRE); + ExprResult RHS = S.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, RHSDRE); if (!BasePath.empty()) { - LHS = DefaultLvalueConversion(LHS.get()); - RHS = DefaultLvalueConversion(RHS.get()); + LHS = S.DefaultLvalueConversion(LHS.get()); + RHS = S.DefaultLvalueConversion(RHS.get()); LHS = ImplicitCastExpr::Create(Context, PtrRedTy, CK_UncheckedDerivedToBase, LHS.get(), &BasePath, LHS.get()->getValueKind()); @@ -9387,27 +9433,27 @@ OMPClause *Sema::ActOnOpenMPReductionClause( QualType FnTy = Context.getFunctionType(Context.VoidTy, Params, EPI); auto *OVE = new (Context) OpaqueValueExpr( ELoc, Context.getPointerType(FnTy), VK_RValue, OK_Ordinary, - DefaultLvalueConversion(DeclareReductionRef.get()).get()); + S.DefaultLvalueConversion(DeclareReductionRef.get()).get()); Expr *Args[] = {LHS.get(), RHS.get()}; ReductionOp = new (Context) CallExpr(Context, OVE, Args, Context.VoidTy, VK_RValue, ELoc); } else { - ReductionOp = BuildBinOp(DSAStack->getCurScope(), - ReductionId.getLocStart(), BOK, LHSDRE, RHSDRE); + ReductionOp = S.BuildBinOp( + Stack->getCurScope(), ReductionId.getLocStart(), BOK, LHSDRE, RHSDRE); if (ReductionOp.isUsable()) { if (BOK != BO_LT && BOK != BO_GT) { ReductionOp = - BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(), - BO_Assign, LHSDRE, ReductionOp.get()); + S.BuildBinOp(Stack->getCurScope(), ReductionId.getLocStart(), + BO_Assign, LHSDRE, ReductionOp.get()); } else { auto *ConditionalOp = new (Context) ConditionalOperator( ReductionOp.get(), SourceLocation(), LHSDRE, SourceLocation(), RHSDRE, Type, VK_LValue, OK_Ordinary); ReductionOp = - BuildBinOp(DSAStack->getCurScope(), ReductionId.getLocStart(), - BO_Assign, LHSDRE, ConditionalOp); + S.BuildBinOp(Stack->getCurScope(), ReductionId.getLocStart(), + BO_Assign, LHSDRE, ConditionalOp); } - ReductionOp = ActOnFinishFullExpr(ReductionOp.get()); + ReductionOp = S.ActOnFinishFullExpr(ReductionOp.get()); } if (ReductionOp.isInvalid()) continue; @@ -9415,54 +9461,86 @@ OMPClause *Sema::ActOnOpenMPReductionClause( DeclRefExpr *Ref = nullptr; Expr *VarsExpr = RefExpr->IgnoreParens(); - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !S.CurContext->isDependentContext()) { if (ASE || OASE) { - TransformExprToCaptures RebuildToCapture(*this, D); + TransformExprToCaptures RebuildToCapture(S, D); VarsExpr = RebuildToCapture.TransformExpr(RefExpr->IgnoreParens()).get(); Ref = RebuildToCapture.getCapturedExpr(); } else { - VarsExpr = Ref = - buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + VarsExpr = Ref = buildCapture(S, D, SimpleRefExpr, /*WithInit=*/false); } - if (!IsOpenMPCapturedDecl(D)) { - ExprCaptures.push_back(Ref->getDecl()); + if (!S.IsOpenMPCapturedDecl(D)) { + RD.ExprCaptures.emplace_back(Ref->getDecl()); if (Ref->getDecl()->hasAttr()) { - ExprResult RefRes = DefaultLvalueConversion(Ref); + ExprResult RefRes = S.DefaultLvalueConversion(Ref); if (!RefRes.isUsable()) continue; ExprResult PostUpdateRes = - 
BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign,
-                         SimpleRefExpr, RefRes.get());
+              S.BuildBinOp(Stack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr,
+                           RefRes.get());
           if (!PostUpdateRes.isUsable())
             continue;
-          if (isOpenMPTaskingDirective(DSAStack->getCurrentDirective())) {
-            Diag(RefExpr->getExprLoc(),
-                 diag::err_omp_reduction_non_addressable_expression)
+          if (isOpenMPTaskingDirective(Stack->getCurrentDirective()) ||
+              Stack->getCurrentDirective() == OMPD_taskgroup) {
+            S.Diag(RefExpr->getExprLoc(),
+                   diag::err_omp_reduction_non_addressable_expression)
                 << RefExpr->getSourceRange();
             continue;
           }
-          ExprPostUpdates.push_back(
-              IgnoredValueConversions(PostUpdateRes.get()).get());
+          RD.ExprPostUpdates.emplace_back(
+              S.IgnoredValueConversions(PostUpdateRes.get()).get());
         }
       }
     }
-    DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref);
-    Vars.push_back(VarsExpr);
-    Privates.push_back(PrivateDRE);
-    LHSs.push_back(LHSDRE);
-    RHSs.push_back(RHSDRE);
-    ReductionOps.push_back(ReductionOp.get());
+    // All reduction items are still marked as reduction (so as not to
+    // increase code base size).
+    Stack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref);
+    RD.push(VarsExpr, PrivateDRE, LHSDRE, RHSDRE, ReductionOp.get());
   }
+  return RD.Vars.empty();
+}

-  if (Vars.empty())
+OMPClause *Sema::ActOnOpenMPReductionClause(
+    ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation ColonLoc, SourceLocation EndLoc,
+    CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
+    ArrayRef<Expr *> UnresolvedReductions) {
+  ReductionData RD(VarList.size());
+
+  if (ActOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList,
+                                  StartLoc, LParenLoc, ColonLoc, EndLoc,
+                                  ReductionIdScopeSpec, ReductionId,
+                                  UnresolvedReductions, RD))
     return nullptr;

   return OMPReductionClause::Create(
-      Context, StartLoc, LParenLoc, ColonLoc, EndLoc, Vars,
-      ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, Privates,
-      LHSs, RHSs, ReductionOps, buildPreInits(Context, ExprCaptures),
-      buildPostUpdate(*this, ExprPostUpdates));
+      Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
+      ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
+      RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps,
+      buildPreInits(Context, RD.ExprCaptures),
+      buildPostUpdate(*this, RD.ExprPostUpdates));
+}
+
+OMPClause *Sema::ActOnOpenMPTaskReductionClause(
+    ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation ColonLoc, SourceLocation EndLoc,
+    CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId,
+    ArrayRef<Expr *> UnresolvedReductions) {
+  ReductionData RD(VarList.size());
+
+  if (ActOnOMPReductionKindClause(*this, DSAStack, OMPC_task_reduction,
+                                  VarList, StartLoc, LParenLoc, ColonLoc,
+                                  EndLoc, ReductionIdScopeSpec, ReductionId,
+                                  UnresolvedReductions, RD))
+    return nullptr;
+
+  return OMPTaskReductionClause::Create(
+      Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars,
+      ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId,
+      RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps,
+      buildPreInits(Context, RD.ExprCaptures),
+      buildPostUpdate(*this, RD.ExprPostUpdates));
 }

 bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index b19dcb2a5099..598a11300b87 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -120,6 +120,7 @@ static void diagnoseBadTypeAttribute(Sema &S, const AttributeList &attr,

 // Function type attributes.
#define FUNCTION_TYPE_ATTRS_CASELIST \ + case AttributeList::AT_NSReturnsRetained: \ case AttributeList::AT_NoReturn: \ case AttributeList::AT_Regparm: \ case AttributeList::AT_AnyX86NoCallerSavedRegisters: \ @@ -640,12 +641,6 @@ static void distributeTypeAttrsFromDeclarator(TypeProcessingState &state, distributeObjCPointerTypeAttrFromDeclarator(state, *attr, declSpecType); break; - case AttributeList::AT_NSReturnsRetained: - if (!state.getSema().getLangOpts().ObjCAutoRefCount) - break; - // fallthrough - LLVM_FALLTHROUGH; - FUNCTION_TYPE_ATTRS_CASELIST: distributeFunctionTypeAttrFromDeclarator(state, *attr, declSpecType); break; @@ -2385,6 +2380,11 @@ QualType Sema::BuildFunctionType(QualType T, [=](unsigned i) { return Loc; }); } + if (EPI.ExtInfo.getProducesResult()) { + // This is just a warning, so we can't fail to build if we see it. + checkNSReturnsRetainedReturnType(Loc, T); + } + if (Invalid) return QualType(); @@ -5017,6 +5017,8 @@ static AttributeList::Kind getAttrListKind(AttributedType::Kind kind) { return AttributeList::AT_TypeNullUnspecified; case AttributedType::attr_objc_kindof: return AttributeList::AT_ObjCKindOf; + case AttributedType::attr_ns_returns_retained: + return AttributeList::AT_NSReturnsRetained; } llvm_unreachable("unexpected attribute kind!"); } @@ -6373,17 +6375,26 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, // ns_returns_retained is not always a type attribute, but if we got // here, we're treating it as one right now. if (attr.getKind() == AttributeList::AT_NSReturnsRetained) { - assert(S.getLangOpts().ObjCAutoRefCount && - "ns_returns_retained treated as type attribute in non-ARC"); if (attr.getNumArgs()) return true; // Delay if this is not a function type. if (!unwrapped.isFunctionType()) return false; - FunctionType::ExtInfo EI - = unwrapped.get()->getExtInfo().withProducesResult(true); - type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI)); + // Check whether the return type is reasonable. + if (S.checkNSReturnsRetainedReturnType(attr.getLoc(), + unwrapped.get()->getReturnType())) + return true; + + // Only actually change the underlying type in ARC builds. + QualType origType = type; + if (state.getSema().getLangOpts().ObjCAutoRefCount) { + FunctionType::ExtInfo EI + = unwrapped.get()->getExtInfo().withProducesResult(true); + type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI)); + } + type = S.Context.getAttributedType(AttributedType::attr_ns_returns_retained, + origType, type); return true; } @@ -6945,12 +6956,6 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, attr.setUsedAsTypeAttr(); break; - case AttributeList::AT_NSReturnsRetained: - if (!state.getSema().getLangOpts().ObjCAutoRefCount) - break; - // fallthrough into the function attrs - LLVM_FALLTHROUGH; - FUNCTION_TYPE_ATTRS_CASELIST: attr.setUsedAsTypeAttr(); diff --git a/lib/Sema/TreeTransform.h b/lib/Sema/TreeTransform.h index 7aa8f64d5081..91da9f88c59b 100644 --- a/lib/Sema/TreeTransform.h +++ b/lib/Sema/TreeTransform.h @@ -1651,6 +1651,21 @@ class TreeTransform { ReductionId, UnresolvedReductions); } + /// Build a new OpenMP 'task_reduction' clause. + /// + /// By default, performs semantic analysis to build the new statement. + /// Subclasses may override this routine to provide different behavior. 
+ OMPClause *RebuildOMPTaskReductionClause( + ArrayRef<Expr *> VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, + CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef<Expr *> UnresolvedReductions) { + return getSema().ActOnOpenMPTaskReductionClause( + VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, + ReductionId, UnresolvedReductions); + } + /// \brief Build a new OpenMP 'linear' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. @@ -8399,6 +8414,51 @@ TreeTransform<Derived>::TransformOMPReductionClause(OMPReductionClause *C) { C->getLocEnd(), ReductionIdScopeSpec, NameInfo, UnresolvedReductions); } +template <typename Derived> +OMPClause *TreeTransform<Derived>::TransformOMPTaskReductionClause( + OMPTaskReductionClause *C) { + llvm::SmallVector<Expr *, 16> Vars; + Vars.reserve(C->varlist_size()); + for (auto *VE : C->varlists()) { + ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE)); + if (EVar.isInvalid()) + return nullptr; + Vars.push_back(EVar.get()); + } + CXXScopeSpec ReductionIdScopeSpec; + ReductionIdScopeSpec.Adopt(C->getQualifierLoc()); + + DeclarationNameInfo NameInfo = C->getNameInfo(); + if (NameInfo.getName()) { + NameInfo = getDerived().TransformDeclarationNameInfo(NameInfo); + if (!NameInfo.getName()) + return nullptr; + } + // Build a list of all UDR decls with the same names, ranged by the Scopes; + // a Scope boundary is marked by a duplicate of the previous decl. + llvm::SmallVector<Expr *, 16> UnresolvedReductions; + for (auto *E : C->reduction_ops()) { + // Transform all the decls. + if (E) { + auto *ULE = cast<UnresolvedLookupExpr>(E); + UnresolvedSet<8> Decls; + for (auto *D : ULE->decls()) { + NamedDecl *InstD = + cast<NamedDecl>(getDerived().TransformDecl(E->getExprLoc(), D)); + Decls.addDecl(InstD, InstD->getAccess()); + } + UnresolvedReductions.push_back(UnresolvedLookupExpr::Create( + SemaRef.Context, /*NamingClass=*/nullptr, + ReductionIdScopeSpec.getWithLocInContext(SemaRef.Context), NameInfo, + /*ADL=*/true, ULE->isOverloaded(), Decls.begin(), Decls.end())); + } else + UnresolvedReductions.push_back(nullptr); + } + return getDerived().RebuildOMPTaskReductionClause( + Vars, C->getLocStart(), C->getLParenLoc(), C->getColonLoc(), + C->getLocEnd(), ReductionIdScopeSpec, NameInfo, UnresolvedReductions); +} + template <typename Derived> OMPClause * TreeTransform<Derived>::TransformOMPLinearClause(OMPLinearClause *C) { diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp index 678ecfc9a3d9..50be74f6bf6e 100644 --- a/lib/Serialization/ASTReader.cpp +++ b/lib/Serialization/ASTReader.cpp @@ -1520,7 +1520,7 @@ MacroInfo *ASTReader::ReadMacroRecord(ModuleFile &F, uint64_t Offset) { Stream.JumpToBit(Offset); RecordData Record; - SmallVector<IdentifierInfo *, 16> MacroArgs; + SmallVector<IdentifierInfo *, 16> MacroParams; MacroInfo *Macro = nullptr; while (true) { @@ -1571,17 +1571,17 @@ MacroInfo *ASTReader::ReadMacroRecord(ModuleFile &F, uint64_t Offset) { bool isC99VarArgs = Record[NextIndex++]; bool isGNUVarArgs = Record[NextIndex++]; bool hasCommaPasting = Record[NextIndex++]; - MacroArgs.clear(); + MacroParams.clear(); unsigned NumArgs = Record[NextIndex++]; for (unsigned i = 0; i != NumArgs; ++i) - MacroArgs.push_back(getLocalIdentifier(F, Record[NextIndex++])); + MacroParams.push_back(getLocalIdentifier(F, Record[NextIndex++])); // Install function-like macro info.
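// Note: the MacroArgs -> MacroParams rename above follows the MacroInfo
// API change from setArgumentList to setParameterList; the identifiers a
// function-like macro declares are its parameters, while arguments only
// exist at an expansion site. For example:
//
//   #define ADD(x, y) ((x) + (y))   // parameters: x, y
//   int three = ADD(1, 2);          // arguments: 1, 2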
MI->setIsFunctionLike(); if (isC99VarArgs) MI->setIsC99Varargs(); if (isGNUVarArgs) MI->setIsGNUVarargs(); if (hasCommaPasting) MI->setHasCommaPasting(); - MI->setArgumentList(MacroArgs, PP.getPreprocessorAllocator()); + MI->setParameterList(MacroParams, PP.getPreprocessorAllocator()); } // Remember that we saw this macro last so that we add the tokens that @@ -9341,6 +9341,8 @@ void ASTReader::diagnoseOdrViolations() { case Decl::Field: return Field; case Decl::CXXMethod: + case Decl::CXXConstructor: + case Decl::CXXDestructor: return CXXMethod; case Decl::TypeAlias: return TypeAlias; @@ -9669,17 +9671,30 @@ void ASTReader::diagnoseOdrViolations() { break; } case CXXMethod: { + enum { + DiagMethod, + DiagConstructor, + DiagDestructor, + } FirstMethodType, + SecondMethodType; + auto GetMethodTypeForDiagnostics = [](const CXXMethodDecl* D) { + if (isa<CXXConstructorDecl>(D)) return DiagConstructor; + if (isa<CXXDestructorDecl>(D)) return DiagDestructor; + return DiagMethod; + }; const CXXMethodDecl *FirstMethod = cast<CXXMethodDecl>(FirstDecl); const CXXMethodDecl *SecondMethod = cast<CXXMethodDecl>(SecondDecl); + FirstMethodType = GetMethodTypeForDiagnostics(FirstMethod); + SecondMethodType = GetMethodTypeForDiagnostics(SecondMethod); auto FirstName = FirstMethod->getDeclName(); auto SecondName = SecondMethod->getDeclName(); - if (FirstName != SecondName) { + if (FirstMethodType != SecondMethodType || FirstName != SecondName) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodName) - << FirstName; + << FirstMethodType << FirstName; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodName) - << SecondName; + << SecondMethodType << SecondName; Diagnosed = true; break; @@ -9690,11 +9705,11 @@ void ASTReader::diagnoseOdrViolations() { if (FirstDeleted != SecondDeleted) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodDeleted) - << FirstName << FirstDeleted; + << FirstMethodType << FirstName << FirstDeleted; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodDeleted) - << SecondName << SecondDeleted; + << SecondMethodType << SecondName << SecondDeleted; Diagnosed = true; break; } @@ -9707,10 +9722,10 @@ void ASTReader::diagnoseOdrViolations() { (FirstVirtual != SecondVirtual || FirstPure != SecondPure)) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodVirtual) - << FirstName << FirstPure << FirstVirtual; + << FirstMethodType << FirstName << FirstPure << FirstVirtual; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodVirtual) - << SecondName << SecondPure << SecondVirtual; + << SecondMethodType << SecondName << SecondPure << SecondVirtual; Diagnosed = true; break; } @@ -9725,10 +9740,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstStatic != SecondStatic) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodStatic) - << FirstName << FirstStatic; + << FirstMethodType << FirstName << FirstStatic; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodStatic) - << SecondName << SecondStatic; + << SecondMethodType << SecondName << SecondStatic; Diagnosed = true; break; } @@ -9738,10 +9753,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstVolatile != SecondVolatile) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodVolatile) - << FirstName << FirstVolatile; + << FirstMethodType << FirstName << FirstVolatile; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodVolatile) - <<
SecondName << SecondVolatile; + << SecondMethodType << SecondName << SecondVolatile; Diagnosed = true; break; } @@ -9751,10 +9766,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstConst != SecondConst) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodConst) - << FirstName << FirstConst; + << FirstMethodType << FirstName << FirstConst; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodConst) - << SecondName << SecondConst; + << SecondMethodType << SecondName << SecondConst; Diagnosed = true; break; } @@ -9764,10 +9779,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstInline != SecondInline) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodInline) - << FirstName << FirstInline; + << FirstMethodType << FirstName << FirstInline; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodInline) - << SecondName << SecondInline; + << SecondMethodType << SecondName << SecondInline; Diagnosed = true; break; } @@ -9777,10 +9792,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstNumParameters != SecondNumParameters) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodNumberParameters) - << FirstName << FirstNumParameters; + << FirstMethodType << FirstName << FirstNumParameters; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodNumberParameters) - << SecondName << SecondNumParameters; + << SecondMethodType << SecondName << SecondNumParameters; Diagnosed = true; break; } @@ -9800,24 +9815,27 @@ void ASTReader::diagnoseOdrViolations() { FirstParamType->getAs()) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodParameterType) - << FirstName << (I + 1) << FirstParamType << true - << ParamDecayedType->getOriginalType(); + << FirstMethodType << FirstName << (I + 1) << FirstParamType + << true << ParamDecayedType->getOriginalType(); } else { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodParameterType) - << FirstName << (I + 1) << FirstParamType << false; + << FirstMethodType << FirstName << (I + 1) << FirstParamType + << false; } if (const DecayedType *ParamDecayedType = SecondParamType->getAs()) { ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodParameterType) - << SecondName << (I + 1) << SecondParamType << true + << SecondMethodType << SecondName << (I + 1) + << SecondParamType << true << ParamDecayedType->getOriginalType(); } else { ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodParameterType) - << SecondName << (I + 1) << SecondParamType << false; + << SecondMethodType << SecondName << (I + 1) + << SecondParamType << false; } ParameterMismatch = true; break; @@ -9828,10 +9846,10 @@ void ASTReader::diagnoseOdrViolations() { if (FirstParamName != SecondParamName) { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodParameterName) - << FirstName << (I + 1) << FirstParamName; + << FirstMethodType << FirstName << (I + 1) << FirstParamName; ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodParameterName) - << SecondName << (I + 1) << SecondParamName; + << SecondMethodType << SecondName << (I + 1) << SecondParamName; ParameterMismatch = true; break; } @@ -9842,12 +9860,14 @@ void ASTReader::diagnoseOdrViolations() { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodParameterSingleDefaultArgument) - << FirstName << (I + 
1) << (FirstInit == nullptr) + << FirstMethodType << FirstName << (I + 1) + << (FirstInit == nullptr) << (FirstInit ? FirstInit->getSourceRange() : SourceRange()); ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodParameterSingleDefaultArgument) - << SecondName << (I + 1) << (SecondInit == nullptr) + << SecondMethodType << SecondName << (I + 1) + << (SecondInit == nullptr) << (SecondInit ? SecondInit->getSourceRange() : SourceRange()); ParameterMismatch = true; break; @@ -9858,11 +9878,13 @@ void ASTReader::diagnoseOdrViolations() { ODRDiagError(FirstMethod->getLocation(), FirstMethod->getSourceRange(), MethodParameterDifferentDefaultArgument) - << FirstName << (I + 1) << FirstInit->getSourceRange(); + << FirstMethodType << FirstName << (I + 1) + << FirstInit->getSourceRange(); ODRDiagNote(SecondMethod->getLocation(), SecondMethod->getSourceRange(), MethodParameterDifferentDefaultArgument) - << SecondName << (I + 1) << SecondInit->getSourceRange(); + << SecondMethodType << SecondName << (I + 1) + << SecondInit->getSourceRange(); ParameterMismatch = true; break; diff --git a/lib/Serialization/ASTReaderStmt.cpp b/lib/Serialization/ASTReaderStmt.cpp index afee50ffa3b9..21adcddd3a4a 100644 --- a/lib/Serialization/ASTReaderStmt.cpp +++ b/lib/Serialization/ASTReaderStmt.cpp @@ -1834,6 +1834,9 @@ OMPClause *OMPClauseReader::readClause() { case OMPC_reduction: C = OMPReductionClause::CreateEmpty(Context, Reader->Record.readInt()); break; + case OMPC_task_reduction: + C = OMPTaskReductionClause::CreateEmpty(Context, Reader->Record.readInt()); + break; case OMPC_linear: C = OMPLinearClause::CreateEmpty(Context, Reader->Record.readInt()); break; @@ -2138,6 +2141,40 @@ void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) { C->setReductionOps(Vars); } +void OMPClauseReader::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) { + VisitOMPClauseWithPostUpdate(C); + C->setLParenLoc(Reader->ReadSourceLocation()); + C->setColonLoc(Reader->ReadSourceLocation()); + NestedNameSpecifierLoc NNSL = Reader->Record.readNestedNameSpecifierLoc(); + DeclarationNameInfo DNI; + Reader->ReadDeclarationNameInfo(DNI); + C->setQualifierLoc(NNSL); + C->setNameInfo(DNI); + + unsigned NumVars = C->varlist_size(); + SmallVector Vars; + Vars.reserve(NumVars); + for (unsigned I = 0; I != NumVars; ++I) + Vars.push_back(Reader->Record.readSubExpr()); + C->setVarRefs(Vars); + Vars.clear(); + for (unsigned I = 0; I != NumVars; ++I) + Vars.push_back(Reader->Record.readSubExpr()); + C->setPrivates(Vars); + Vars.clear(); + for (unsigned I = 0; I != NumVars; ++I) + Vars.push_back(Reader->Record.readSubExpr()); + C->setLHSExprs(Vars); + Vars.clear(); + for (unsigned I = 0; I != NumVars; ++I) + Vars.push_back(Reader->Record.readSubExpr()); + C->setRHSExprs(Vars); + Vars.clear(); + for (unsigned I = 0; I != NumVars; ++I) + Vars.push_back(Reader->Record.readSubExpr()); + C->setReductionOps(Vars); +} + void OMPClauseReader::VisitOMPLinearClause(OMPLinearClause *C) { VisitOMPClauseWithPostUpdate(C); C->setLParenLoc(Reader->ReadSourceLocation()); @@ -2709,6 +2746,8 @@ void ASTStmtReader::VisitOMPTaskwaitDirective(OMPTaskwaitDirective *D) { void ASTStmtReader::VisitOMPTaskgroupDirective(OMPTaskgroupDirective *D) { VisitStmt(D); + // The NumClauses field was read in ReadStmtFromStream. 
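// Note: OMPTaskgroupDirective can now carry clauses (task_reduction), so
// the statement writer records a NumClauses count, ReadStmtFromStream uses
// it to create an empty directive of the right size, and the reader here
// only skips the already-consumed integer. The ODR changes above gain a
// method kind the same way: a hypothetical module mismatch such as
//
//   struct S { S(); };    // module A
//   struct S { ~S(); };   // module B
//
// is now diagnosed as "constructor" vs. "destructor" instead of as two
// generically described methods.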
+ Record.skipInts(1); VisitOMPExecutableDirective(D); } @@ -3479,7 +3518,8 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case STMT_OMP_TASKGROUP_DIRECTIVE: - S = OMPTaskgroupDirective::CreateEmpty(Context, Empty); + S = OMPTaskgroupDirective::CreateEmpty( + Context, Record[ASTStmtReader::NumStmtFields], Empty); break; case STMT_OMP_FLUSH_DIRECTIVE: diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp index f7a49e41009d..a875e627bdfb 100644 --- a/lib/Serialization/ASTWriter.cpp +++ b/lib/Serialization/ASTWriter.cpp @@ -2521,9 +2521,9 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) { Record.push_back(MI->isC99Varargs()); Record.push_back(MI->isGNUVarargs()); Record.push_back(MI->hasCommaPasting()); - Record.push_back(MI->getNumArgs()); - for (const IdentifierInfo *Arg : MI->args()) - AddIdentifierRef(Arg, Record); + Record.push_back(MI->getNumParams()); + for (const IdentifierInfo *Param : MI->params()) + AddIdentifierRef(Param, Record); } // If we have a detailed preprocessing record, record the macro definition diff --git a/lib/Serialization/ASTWriterStmt.cpp b/lib/Serialization/ASTWriterStmt.cpp index 90a732e575e2..ae2e0b88c311 100644 --- a/lib/Serialization/ASTWriterStmt.cpp +++ b/lib/Serialization/ASTWriterStmt.cpp @@ -1963,6 +1963,25 @@ void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) { Record.AddStmt(E); } +void OMPClauseWriter::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) { + Record.push_back(C->varlist_size()); + VisitOMPClauseWithPostUpdate(C); + Record.AddSourceLocation(C->getLParenLoc()); + Record.AddSourceLocation(C->getColonLoc()); + Record.AddNestedNameSpecifierLoc(C->getQualifierLoc()); + Record.AddDeclarationNameInfo(C->getNameInfo()); + for (auto *VE : C->varlists()) + Record.AddStmt(VE); + for (auto *VE : C->privates()) + Record.AddStmt(VE); + for (auto *E : C->lhs_exprs()) + Record.AddStmt(E); + for (auto *E : C->rhs_exprs()) + Record.AddStmt(E); + for (auto *E : C->reduction_ops()) + Record.AddStmt(E); +} + void OMPClauseWriter::VisitOMPLinearClause(OMPLinearClause *C) { Record.push_back(C->varlist_size()); VisitOMPClauseWithPostUpdate(C); @@ -2440,6 +2459,7 @@ void ASTStmtWriter::VisitOMPTaskwaitDirective(OMPTaskwaitDirective *D) { void ASTStmtWriter::VisitOMPTaskgroupDirective(OMPTaskgroupDirective *D) { VisitStmt(D); + Record.push_back(D->getNumClauses()); VisitOMPExecutableDirective(D); Code = serialization::STMT_OMP_TASKGROUP_DIRECTIVE; } diff --git a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp index 6bbaaac05e6b..655ce33390c9 100644 --- a/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/LocalizationChecker.cpp @@ -57,7 +57,7 @@ struct LocalizedState { }; class NonLocalizedStringChecker - : public Checker<check::PostCall, check::PreObjCMessage, + : public Checker<check::PreCall, check::PostCall, check::PreObjCMessage, check::PostObjCMessage, check::PostStmt<ObjCStringLiteral>> { @@ -79,9 +79,10 @@ class NonLocalizedStringChecker void setNonLocalizedState(SVal S, CheckerContext &C) const; void setLocalizedState(SVal S, CheckerContext &C) const; - bool isAnnotatedAsLocalized(const Decl *D) const; - void reportLocalizationError(SVal S, const ObjCMethodCall &M, - CheckerContext &C, int argumentNumber = 0) const; + bool isAnnotatedAsReturningLocalized(const Decl *D) const; + bool isAnnotatedAsTakingLocalized(const Decl *D) const; + void reportLocalizationError(SVal S, const CallEvent &M, CheckerContext &C, + int argumentNumber = 0) const; int getLocalizedArgumentForSelector(const IdentifierInfo *Receiver, Selector S) const; @@ -97,6 +98,7 @@
class NonLocalizedStringChecker void checkPreObjCMessage(const ObjCMethodCall &msg, CheckerContext &C) const; void checkPostObjCMessage(const ObjCMethodCall &msg, CheckerContext &C) const; void checkPostStmt(const ObjCStringLiteral *SL, CheckerContext &C) const; + void checkPreCall(const CallEvent &Call, CheckerContext &C) const; void checkPostCall(const CallEvent &Call, CheckerContext &C) const; }; @@ -644,7 +646,8 @@ void NonLocalizedStringChecker::initLocStringsMethods(ASTContext &Ctx) const { /// Checks to see if the method / function declaration includes /// __attribute__((annotate("returns_localized_nsstring"))) -bool NonLocalizedStringChecker::isAnnotatedAsLocalized(const Decl *D) const { +bool NonLocalizedStringChecker::isAnnotatedAsReturningLocalized( + const Decl *D) const { if (!D) return false; return std::any_of( @@ -654,6 +657,19 @@ bool NonLocalizedStringChecker::isAnnotatedAsLocalized(const Decl *D) const { }); } +/// Checks to see if the method / function declaration includes +/// __attribute__((annotate("takes_localized_nsstring"))) +bool NonLocalizedStringChecker::isAnnotatedAsTakingLocalized( + const Decl *D) const { + if (!D) + return false; + return std::any_of( + D->specific_attr_begin<AnnotateAttr>(), + D->specific_attr_end<AnnotateAttr>(), [](const AnnotateAttr *Ann) { + return Ann->getAnnotation() == "takes_localized_nsstring"; + }); +} + /// Returns true if the given SVal is marked as Localized in the program state bool NonLocalizedStringChecker::hasLocalizedState(SVal S, CheckerContext &C) const { @@ -733,8 +749,7 @@ static bool isDebuggingContext(CheckerContext &C) { /// Reports a localization error for the passed in method call and SVal void NonLocalizedStringChecker::reportLocalizationError( - SVal S, const ObjCMethodCall &M, CheckerContext &C, - int argumentNumber) const { + SVal S, const CallEvent &M, CheckerContext &C, int argumentNumber) const { // Don't warn about localization errors in classes and methods that // may be debug code.
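// Note: alongside returns_localized_nsstring, the checker now honors a
// takes_localized_nsstring annotation on parameters, for both ObjC
// messages and plain function calls. Usage sketch, modeled on the
// localization-aggressive.m test updated later in this patch:
//
//   void setText(NSString *str
//                __attribute__((annotate("takes_localized_nsstring"))));
//
//   setText(NSLocalizedString(@"Hello", @"comment"));  // no warning
//   setText(@"raw");  // warns: User-facing text should use localized
//                     // string macro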
@@ -832,7 +847,21 @@ void NonLocalizedStringChecker::checkPreObjCMessage(const ObjCMethodCall &msg, } } - if (argumentNumber < 0) // There was no match in UIMethods + if (argumentNumber < 0) { // There was no match in UIMethods + if (const Decl *D = msg.getDecl()) { + if (const ObjCMethodDecl *OMD = dyn_cast_or_null<ObjCMethodDecl>(D)) { + auto formals = OMD->parameters(); + for (unsigned i = 0, ei = formals.size(); i != ei; ++i) { + if (isAnnotatedAsTakingLocalized(formals[i])) { + argumentNumber = i; + break; + } + } + } + } + } + + if (argumentNumber < 0) // Still no match return; SVal svTitle = msg.getArgSVal(argumentNumber); @@ -855,6 +884,25 @@ void NonLocalizedStringChecker::checkPreObjCMessage(const ObjCMethodCall &msg, } } +void NonLocalizedStringChecker::checkPreCall(const CallEvent &Call, + CheckerContext &C) const { + const Decl *D = Call.getDecl(); + if (D && isa<FunctionDecl>(D)) { + const FunctionDecl *FD = dyn_cast<FunctionDecl>(D); + auto formals = FD->parameters(); + for (unsigned i = 0, + ei = std::min(unsigned(formals.size()), Call.getNumArgs()); + i != ei; ++i) { + if (isAnnotatedAsTakingLocalized(formals[i])) { + auto actual = Call.getArgSVal(i); + if (hasNonLocalizedState(actual, C)) { + reportLocalizationError(actual, Call, C, i + 1); + } + } + } + } +} + static inline bool isNSStringType(QualType T, ASTContext &Ctx) { const ObjCObjectPointerType *PT = T->getAs<ObjCObjectPointerType>(); @@ -906,7 +954,7 @@ void NonLocalizedStringChecker::checkPostCall(const CallEvent &Call, const IdentifierInfo *Identifier = Call.getCalleeIdentifier(); SVal sv = Call.getReturnValue(); - if (isAnnotatedAsLocalized(D) || LSF.count(Identifier) != 0) { + if (isAnnotatedAsReturningLocalized(D) || LSF.count(Identifier) != 0) { setLocalizedState(sv, C); } else if (isNSStringType(RT, C.getASTContext()) && !hasLocalizedState(sv, C)) { @@ -940,7 +988,8 @@ void NonLocalizedStringChecker::checkPostObjCMessage(const ObjCMethodCall &msg, std::pair<const IdentifierInfo *, Selector> MethodDescription = {odInfo, S}; - if (LSM.count(MethodDescription) || isAnnotatedAsLocalized(msg.getDecl())) { + if (LSM.count(MethodDescription) || + isAnnotatedAsReturningLocalized(msg.getDecl())) { SVal sv = msg.getReturnValue(); setLocalizedState(sv, C); } diff --git a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp index 89b1291c4f46..21ccf21515b3 100644 --- a/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp +++ b/lib/StaticAnalyzer/Checkers/RetainCountChecker.cpp @@ -1304,6 +1304,21 @@ RetainSummaryManager::getCFSummaryGetRule(const FunctionDecl *FD) { DoNothing, DoNothing); } +/// Returns true if the declaration 'D' is annotated with 'rcAnnotation'. +static bool hasRCAnnotation(const Decl *D, StringRef rcAnnotation) { + for (const auto *Ann : D->specific_attrs<AnnotateAttr>()) { + if (Ann->getAnnotation() == rcAnnotation) + return true; + } + return false; +} + +/// Returns true if the function declaration 'FD' carries the +/// 'rc_ownership_trusted_implementation' annotate attribute. +static bool isTrustedReferenceCountImplementation(const FunctionDecl *FD) { + return hasRCAnnotation(FD, "rc_ownership_trusted_implementation"); +} + //===----------------------------------------------------------------------===// // Summary creation for Selectors. //===----------------------------------------------------------------------===// @@ -3380,6 +3395,9 @@ bool RetainCountChecker::evalCall(const CallExpr *CE, CheckerContext &C) const { // See if it's one of the specific functions we know how to eval.
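// Note: with the helpers above, evalCall also trusts functions annotated
// rc_ownership_trusted_implementation: their bodies are not analyzed, a
// fresh symbol is conjured for the return value, and the argument region
// is invalidated as a pointer escape. Declaration sketch, matching the
// retain-release-inline.m test added below:
//
//   __attribute__((annotate("rc_ownership_trusted_implementation")))
//   isl_basic_map *isl_basic_map_free(isl_basic_map *bmap);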
bool canEval = false; + // See if the function has 'rc_ownership_trusted_implementation' + // annotate attribute. If it does, we will not inline it. + bool hasTrustedImplementationAnnotation = false; QualType ResultTy = CE->getCallReturnType(C.getASTContext()); if (ResultTy->isObjCIdType()) { @@ -3395,6 +3413,11 @@ bool RetainCountChecker::evalCall(const CallExpr *CE, CheckerContext &C) const { cocoa::isRefType(ResultTy, "CV", FName)) { canEval = isRetain(FD, FName) || isAutorelease(FD, FName) || isMakeCollectable(FD, FName); + } else { + if (FD->getDefinition()) { + canEval = isTrustedReferenceCountImplementation(FD->getDefinition()); + hasTrustedImplementationAnnotation = canEval; + } } } @@ -3404,8 +3427,11 @@ bool RetainCountChecker::evalCall(const CallExpr *CE, CheckerContext &C) const { // Bind the return value. const LocationContext *LCtx = C.getLocationContext(); SVal RetVal = state->getSVal(CE->getArg(0), LCtx); - if (RetVal.isUnknown()) { - // If the receiver is unknown, conjure a return value. + if (RetVal.isUnknown() || + (hasTrustedImplementationAnnotation && !ResultTy.isNull())) { + // If the receiver is unknown or the function has + // 'rc_ownership_trusted_implementation' annotate attribute, conjure a + // return value. SValBuilder &SVB = C.getSValBuilder(); RetVal = SVB.conjureSymbolVal(nullptr, CE, LCtx, ResultTy, C.blockCount()); } @@ -3421,8 +3447,9 @@ bool RetainCountChecker::evalCall(const CallExpr *CE, CheckerContext &C) const { Binding = getRefBinding(state, Sym); // Invalidate the argument region. - state = state->invalidateRegions(ArgRegion, CE, C.blockCount(), LCtx, - /*CausesPointerEscape*/ false); + state = state->invalidateRegions( + ArgRegion, CE, C.blockCount(), LCtx, + /*CausesPointerEscape*/ hasTrustedImplementationAnnotation); // Restore the refcount status of the argument. if (Binding) diff --git a/lib/Tooling/Refactoring/Rename/USRFinder.cpp b/lib/Tooling/Refactoring/Rename/USRFinder.cpp index f36387dafdbf..3bfb5bbe35e4 100644 --- a/lib/Tooling/Refactoring/Rename/USRFinder.cpp +++ b/lib/Tooling/Refactoring/Rename/USRFinder.cpp @@ -18,6 +18,7 @@ #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Index/USRGeneration.h" #include "clang/Lex/Lexer.h" +#include "clang/Tooling/Refactoring/RecursiveSymbolVisitor.h" #include "llvm/ADT/SmallVector.h" using namespace llvm; @@ -25,132 +26,38 @@ using namespace llvm; namespace clang { namespace tooling { -// NamedDeclFindingASTVisitor recursively visits each AST node to find the -// symbol underneath the cursor. -// FIXME: move to separate .h/.cc file if this gets too large. namespace { -class NamedDeclFindingASTVisitor - : public clang::RecursiveASTVisitor { + +/// Recursively visits each AST node to find the symbol underneath the cursor. +class NamedDeclOccurrenceFindingVisitor + : public RecursiveSymbolVisitor { public: // \brief Finds the NamedDecl at a point in the source. // \param Point the location in the source to search for the NamedDecl. - explicit NamedDeclFindingASTVisitor(const SourceLocation Point, - const ASTContext &Context) - : Result(nullptr), Point(Point), Context(Context) {} + explicit NamedDeclOccurrenceFindingVisitor(const SourceLocation Point, + const ASTContext &Context) + : RecursiveSymbolVisitor(Context.getSourceManager(), + Context.getLangOpts()), + Point(Point), Context(Context) {} - // \brief Finds the NamedDecl for a name in the source. - // \param Name the fully qualified name. 
- explicit NamedDeclFindingASTVisitor(const std::string &Name, - const ASTContext &Context) - : Result(nullptr), Name(Name), Context(Context) {} - - // Declaration visitors: - - // \brief Checks if the point falls within the NameDecl. This covers every - // declaration of a named entity that we may come across. Usually, just - // checking if the point lies within the length of the name of the declaration - // and the start location is sufficient. - bool VisitNamedDecl(const NamedDecl *Decl) { - return dyn_cast(Decl) - ? true - : setResult(Decl, Decl->getLocation(), - Decl->getNameAsString().length()); - } - - // Expression visitors: - - bool VisitDeclRefExpr(const DeclRefExpr *Expr) { - const NamedDecl *Decl = Expr->getFoundDecl(); - return setResult(Decl, Expr->getLocation(), - Decl->getNameAsString().length()); - } - - bool VisitMemberExpr(const MemberExpr *Expr) { - const NamedDecl *Decl = Expr->getFoundDecl().getDecl(); - return setResult(Decl, Expr->getMemberLoc(), - Decl->getNameAsString().length()); - } - - // Other visitors: - - bool VisitTypeLoc(const TypeLoc Loc) { - const SourceLocation TypeBeginLoc = Loc.getBeginLoc(); - const SourceLocation TypeEndLoc = Lexer::getLocForEndOfToken( - TypeBeginLoc, 0, Context.getSourceManager(), Context.getLangOpts()); - if (const auto *TemplateTypeParm = - dyn_cast(Loc.getType())) - return setResult(TemplateTypeParm->getDecl(), TypeBeginLoc, TypeEndLoc); - if (const auto *TemplateSpecType = - dyn_cast(Loc.getType())) { - return setResult(TemplateSpecType->getTemplateName().getAsTemplateDecl(), - TypeBeginLoc, TypeEndLoc); - } - return setResult(Loc.getType()->getAsCXXRecordDecl(), TypeBeginLoc, - TypeEndLoc); - } - - bool VisitCXXConstructorDecl(clang::CXXConstructorDecl *ConstructorDecl) { - for (const auto *Initializer : ConstructorDecl->inits()) { - // Ignore implicit initializers. - if (!Initializer->isWritten()) - continue; - if (const clang::FieldDecl *FieldDecl = Initializer->getMember()) { - const SourceLocation InitBeginLoc = Initializer->getSourceLocation(), - InitEndLoc = Lexer::getLocForEndOfToken( - InitBeginLoc, 0, Context.getSourceManager(), - Context.getLangOpts()); - if (!setResult(FieldDecl, InitBeginLoc, InitEndLoc)) - return false; - } - } - return true; - } - - // Other: - - const NamedDecl *getNamedDecl() { return Result; } - - // \brief Determines if a namespace qualifier contains the point. - // \returns false on success and sets Result. - void handleNestedNameSpecifierLoc(NestedNameSpecifierLoc NameLoc) { - while (NameLoc) { - const NamespaceDecl *Decl = - NameLoc.getNestedNameSpecifier()->getAsNamespace(); - setResult(Decl, NameLoc.getLocalBeginLoc(), NameLoc.getLocalEndLoc()); - NameLoc = NameLoc.getPrefix(); - } - } - -private: - // \brief Sets Result to Decl if the Point is within Start and End. - // \returns false on success. - bool setResult(const NamedDecl *Decl, SourceLocation Start, - SourceLocation End) { - if (!Decl) + bool visitSymbolOccurrence(const NamedDecl *ND, + ArrayRef NameRanges) { + if (!ND) return true; - if (Name.empty()) { - // Offset is used to find the declaration. + for (const auto &Range : NameRanges) { + SourceLocation Start = Range.getBegin(); + SourceLocation End = Range.getEnd(); if (!Start.isValid() || !Start.isFileID() || !End.isValid() || !End.isFileID() || !isPointWithin(Start, End)) return true; - } else { - // Fully qualified name is used to find the declaration. 
- if (Name != Decl->getQualifiedNameAsString() && - Name != "::" + Decl->getQualifiedNameAsString()) - return true; } - Result = Decl; + Result = ND; return false; } - // \brief Sets Result to Decl if Point is within Loc and Loc + Offset. - // \returns false on success. - bool setResult(const NamedDecl *Decl, SourceLocation Loc, unsigned Offset) { - // FIXME: Add test for Offset == 0. Add test for Offset - 1 (vs -2 etc). - return Offset == 0 || - setResult(Decl, Loc, Loc.getLocWithOffset(Offset - 1)); - } + const NamedDecl *getNamedDecl() const { return Result; } +private: // \brief Determines if the Point is within Start and End. bool isPointWithin(const SourceLocation Start, const SourceLocation End) { // FIXME: Add tests for Point == End. @@ -160,17 +67,17 @@ class NamedDeclFindingASTVisitor Context.getSourceManager().isBeforeInTranslationUnit(Point, End)); } - const NamedDecl *Result; + const NamedDecl *Result = nullptr; const SourceLocation Point; // The location to find the NamedDecl. - const std::string Name; const ASTContext &Context; }; -} // namespace + +} // end anonymous namespace const NamedDecl *getNamedDeclAt(const ASTContext &Context, const SourceLocation Point) { const SourceManager &SM = Context.getSourceManager(); - NamedDeclFindingASTVisitor Visitor(Point, Context); + NamedDeclOccurrenceFindingVisitor Visitor(Point, Context); // Try to be clever about pruning down the number of top-level declarations we // see. If both start and end is either before or after the point we're @@ -184,18 +91,44 @@ const NamedDecl *getNamedDeclAt(const ASTContext &Context, Visitor.TraverseDecl(CurrDecl); } - NestedNameSpecifierLocFinder Finder(const_cast(Context)); - for (const auto &Location : Finder.getNestedNameSpecifierLocations()) - Visitor.handleNestedNameSpecifierLoc(Location); - return Visitor.getNamedDecl(); } +namespace { + +/// Recursively visits each NamedDecl node to find the declaration with a +/// specific name. +class NamedDeclFindingVisitor + : public RecursiveASTVisitor { +public: + explicit NamedDeclFindingVisitor(StringRef Name) : Name(Name) {} + + // We don't have to traverse the uses to find some declaration with a + // specific name, so just visit the named declarations. + bool VisitNamedDecl(const NamedDecl *ND) { + if (!ND) + return true; + // Fully qualified name is used to find the declaration. 
+ if (Name != ND->getQualifiedNameAsString() && + Name != "::" + ND->getQualifiedNameAsString()) + return true; + Result = ND; + return false; + } + + const NamedDecl *getNamedDecl() const { return Result; } + +private: + const NamedDecl *Result = nullptr; + StringRef Name; +}; + +} // end anonymous namespace + const NamedDecl *getNamedDeclFor(const ASTContext &Context, const std::string &Name) { - NamedDeclFindingASTVisitor Visitor(Name, Context); + NamedDeclFindingVisitor Visitor(Name); Visitor.TraverseDecl(Context.getTranslationUnitDecl()); - return Visitor.getNamedDecl(); } diff --git a/lib/Tooling/Refactoring/Rename/USRLocFinder.cpp b/lib/Tooling/Refactoring/Rename/USRLocFinder.cpp index 934507fe6eae..dc21a94610cb 100644 --- a/lib/Tooling/Refactoring/Rename/USRLocFinder.cpp +++ b/lib/Tooling/Refactoring/Rename/USRLocFinder.cpp @@ -22,6 +22,7 @@ #include "clang/Basic/SourceManager.h" #include "clang/Lex/Lexer.h" #include "clang/Tooling/Core/Lookup.h" +#include "clang/Tooling/Refactoring/RecursiveSymbolVisitor.h" #include "clang/Tooling/Refactoring/Rename/USRFinder.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -40,70 +41,27 @@ namespace { // \brief This visitor recursively searches for all instances of a USR in a // translation unit and stores them for later usage. class USRLocFindingASTVisitor - : public clang::RecursiveASTVisitor { + : public RecursiveSymbolVisitor { public: explicit USRLocFindingASTVisitor(const std::vector &USRs, StringRef PrevName, const ASTContext &Context) - : USRSet(USRs.begin(), USRs.end()), PrevName(PrevName), Context(Context) { + : RecursiveSymbolVisitor(Context.getSourceManager(), + Context.getLangOpts()), + USRSet(USRs.begin(), USRs.end()), PrevName(PrevName), Context(Context) { } - // Declaration visitors: - - bool VisitCXXConstructorDecl(clang::CXXConstructorDecl *ConstructorDecl) { - for (const auto *Initializer : ConstructorDecl->inits()) { - // Ignore implicit initializers. 
- if (!Initializer->isWritten()) - continue; - if (const clang::FieldDecl *FieldDecl = Initializer->getMember()) { - if (USRSet.find(getUSRForDecl(FieldDecl)) != USRSet.end()) - LocationsFound.push_back(Initializer->getSourceLocation()); - } - } - return true; - } - - bool VisitNamedDecl(const NamedDecl *Decl) { - if (USRSet.find(getUSRForDecl(Decl)) != USRSet.end()) - checkAndAddLocation(Decl->getLocation()); - return true; - } - - // Expression visitors: - - bool VisitDeclRefExpr(const DeclRefExpr *Expr) { - const NamedDecl *Decl = Expr->getFoundDecl(); - - if (USRSet.find(getUSRForDecl(Decl)) != USRSet.end()) { - const SourceManager &Manager = Decl->getASTContext().getSourceManager(); - SourceLocation Location = Manager.getSpellingLoc(Expr->getLocation()); - checkAndAddLocation(Location); - } - - return true; - } - - bool VisitMemberExpr(const MemberExpr *Expr) { - const NamedDecl *Decl = Expr->getFoundDecl().getDecl(); - if (USRSet.find(getUSRForDecl(Decl)) != USRSet.end()) { - const SourceManager &Manager = Decl->getASTContext().getSourceManager(); - SourceLocation Location = Manager.getSpellingLoc(Expr->getMemberLoc()); - checkAndAddLocation(Location); - } - return true; - } - - // Other visitors: - - bool VisitTypeLoc(const TypeLoc Loc) { - if (USRSet.find(getUSRForDecl(Loc.getType()->getAsCXXRecordDecl())) != - USRSet.end()) - checkAndAddLocation(Loc.getBeginLoc()); - if (const auto *TemplateTypeParm = - dyn_cast(Loc.getType())) { - if (USRSet.find(getUSRForDecl(TemplateTypeParm->getDecl())) != - USRSet.end()) - checkAndAddLocation(Loc.getBeginLoc()); + bool visitSymbolOccurrence(const NamedDecl *ND, + ArrayRef NameRanges) { + if (USRSet.find(getUSRForDecl(ND)) != USRSet.end()) { + assert(NameRanges.size() == 1 && + "Multiple name pieces are not supported yet!"); + SourceLocation Loc = NameRanges[0].getBegin(); + const SourceManager &SM = Context.getSourceManager(); + // TODO: Deal with macro occurrences correctly. 
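// Note: until the TODO above is resolved, an occurrence spelled inside a
// macro body is mapped to its spelling location, so the rename edit lands
// where the token characters actually live. Hypothetical illustration:
//
//   #define FIELD my_field
//   s.FIELD = 1;   // renaming 'my_field' edits the #define, not this line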
+ if (Loc.isMacroID()) + Loc = SM.getSpellingLoc(Loc); + checkAndAddLocation(Loc); } return true; } @@ -116,17 +74,6 @@ class USRLocFindingASTVisitor return LocationsFound; } - // Namespace traversal: - void handleNestedNameSpecifierLoc(NestedNameSpecifierLoc NameLoc) { - while (NameLoc) { - const NamespaceDecl *Decl = - NameLoc.getNestedNameSpecifier()->getAsNamespace(); - if (Decl && USRSet.find(getUSRForDecl(Decl)) != USRSet.end()) - checkAndAddLocation(NameLoc.getLocalBeginLoc()); - NameLoc = NameLoc.getPrefix(); - } - } - private: void checkAndAddLocation(SourceLocation Loc) { const SourceLocation BeginLoc = Loc; @@ -449,11 +396,6 @@ getLocationsOfUSRs(const std::vector &USRs, StringRef PrevName, Decl *Decl) { USRLocFindingASTVisitor Visitor(USRs, PrevName, Decl->getASTContext()); Visitor.TraverseDecl(Decl); - NestedNameSpecifierLocFinder Finder(Decl->getASTContext()); - - for (const auto &Location : Finder.getNestedNameSpecifierLocations()) - Visitor.handleNestedNameSpecifierLoc(Location); - return Visitor.getLocationsFound(); } diff --git a/lib/Tooling/Tooling.cpp b/lib/Tooling/Tooling.cpp index c84fbf473753..662f02dca2a6 100644 --- a/lib/Tooling/Tooling.cpp +++ b/lib/Tooling/Tooling.cpp @@ -336,6 +336,7 @@ ClangTool::ClangTool(const CompilationDatabase &Compilations, OverlayFileSystem->pushOverlay(InMemoryFileSystem); appendArgumentsAdjuster(getClangStripOutputAdjuster()); appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster()); + appendArgumentsAdjuster(getClangStripDependencyFileAdjuster()); } ClangTool::~ClangTool() {} diff --git a/test/Analysis/localization-aggressive.m b/test/Analysis/localization-aggressive.m index 346cf3ef22e2..ea5e0b152945 100644 --- a/test/Analysis/localization-aggressive.m +++ b/test/Analysis/localization-aggressive.m @@ -61,8 +61,16 @@ int random(); NSString *CFNumberFormatterCreateStringWithNumber(float x); + (NSString *)forceLocalized:(NSString *)str __attribute__((annotate("returns_localized_nsstring"))); ++ (NSString *)takesLocalizedString: + (NSString *)__attribute__((annotate("takes_localized_nsstring")))str; @end +NSString * +takesLocalizedString(NSString *str + __attribute__((annotate("takes_localized_nsstring")))) { + return str; +} + // Test cases begin here @implementation LocalizationTestSuite @@ -75,6 +83,8 @@ NSString *ForceLocalized(NSString *str) { return str; } return str; } ++ (NSString *) takesLocalizedString:(NSString *)str { return str; } + // An ObjC method that returns a localized string + (NSString *)unLocalizedStringMethod { return @"UnlocalizedString"; @@ -269,4 +279,13 @@ NSString *ForceLocalized(NSString *str) { return str; } NSString *string2 = POSSIBLE_FALSE_POSITIVE(@"Hello", @"Hello"); // no-warning } +- (void)testTakesLocalizedString { + NSString *localized = NSLocalizedString(@"Hello", @"World"); + NSString *alsoLocalized = [LocalizationTestSuite takesLocalizedString:localized]; // no-warning + NSString *stillLocalized = [LocalizationTestSuite takesLocalizedString:alsoLocalized]; // no-warning + takesLocalizedString(stillLocalized); // no-warning + + [LocalizationTestSuite takesLocalizedString:@"not localized"]; // expected-warning {{User-facing text should use localized string macro}} + takesLocalizedString(@"not localized"); // expected-warning {{User-facing text should use localized string macro}} +} @end diff --git a/test/Analysis/retain-release-inline.m b/test/Analysis/retain-release-inline.m index 0cde2c1cf5af..388c55f6be23 100644 --- a/test/Analysis/retain-release-inline.m +++ b/test/Analysis/retain-release-inline.m 
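// Note: the rename visitors above (USRFinder and USRLocFinder) now share
// clang/Tooling/Refactoring/RecursiveSymbolVisitor.h instead of each
// reimplementing NamedDecl/DeclRefExpr/MemberExpr/TypeLoc and
// nested-name-specifier handling. A client overrides a single hook; a
// minimal sketch with a hypothetical subclass:
//
//   class Collector : public RecursiveSymbolVisitor<Collector> {
//   public:
//     Collector(const SourceManager &SM, const LangOptions &LO)
//         : RecursiveSymbolVisitor(SM, LO) {}
//     bool visitSymbolOccurrence(const NamedDecl *ND,
//                                ArrayRef<SourceRange> NameRanges) {
//       // record ND and NameRanges; return false to stop the traversal
//       return true;
//     }
//   };
//
// Separately, ClangTool now appends getClangStripDependencyFileAdjuster(),
// so -MD/-MF style flags in compile commands no longer emit .d files
// during tool runs.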
@@ -12,7 +12,7 @@ // // It includes the basic definitions for the test cases below. //===----------------------------------------------------------------------===// - +#define NULL 0 typedef unsigned int __darwin_natural_t; typedef unsigned long uintptr_t; typedef unsigned int uint32_t; @@ -267,6 +267,10 @@ typedef NSUInteger NSStringEncoding; extern CFStringRef CFStringCreateWithCStringNoCopy(CFAllocatorRef alloc, const char *cStr, CFStringEncoding encoding, CFAllocatorRef contentsDeallocator); +typedef struct { + int ref; +} isl_basic_map; + //===----------------------------------------------------------------------===// // Test cases. //===----------------------------------------------------------------------===// @@ -285,6 +289,7 @@ void test() { foo(s); bar(s); } + void test_neg() { NSString *s = [[NSString alloc] init]; // no-warning foo(s); @@ -294,6 +299,55 @@ void test_neg() { bar(s); } +__attribute__((cf_returns_retained)) isl_basic_map *isl_basic_map_cow(__attribute__((cf_consumed)) isl_basic_map *bmap); +void free(void *); + +// As 'isl_basic_map_free' is annotated with 'rc_ownership_trusted_implementation', RetainCountChecker trusts its +// implementation and doesn't analyze its body. If the annotation 'rc_ownership_trusted_implementation' is removed, +// a leak warning is raised by RetainCountChecker as the analyzer is unable to detect a decrement in the reference +// count of 'bmap' along the path in 'isl_basic_map_free' assuming the predicate of the second 'if' branch to be +// true or assuming both the predicates in the function to be false. +__attribute__((annotate("rc_ownership_trusted_implementation"))) isl_basic_map *isl_basic_map_free(__attribute__((cf_consumed)) isl_basic_map *bmap) { + if (!bmap) + return NULL; + + if (--bmap->ref > 0) + return NULL; + + free(bmap); + return NULL; +} + +// As 'isl_basic_map_copy' is annotated with 'rc_ownership_trusted_implementation', RetainCountChecker trusts its +// implementation and doesn't analyze its body. If that annotation is removed, a 'use-after-release' warning might +// be raised by RetainCountChecker as the pointer which is passed as an argument to this function and the pointer +// which is returned from the function point to the same memory location. +__attribute__((annotate("rc_ownership_trusted_implementation"))) __attribute__((cf_returns_retained)) isl_basic_map *isl_basic_map_copy(isl_basic_map *bmap) { + if (!bmap) + return NULL; + + bmap->ref++; + return bmap; +} + +void test_use_after_release_with_trusted_implementation_annotate_attribute(__attribute__((cf_consumed)) isl_basic_map *bmap) { + // After this call, 'bmap' has a +1 reference count. + bmap = isl_basic_map_cow(bmap); + // After the call to 'isl_basic_map_copy', 'bmap' has a +1 reference count. + isl_basic_map *temp = isl_basic_map_cow(isl_basic_map_copy(bmap)); + // After this call, 'bmap' has a +0 reference count. + isl_basic_map *temp2 = isl_basic_map_cow(bmap); // no-warning + isl_basic_map_free(temp2); + isl_basic_map_free(temp); +} + +void test_leak_with_trusted_implementation_annotate_attribute(__attribute__((cf_consumed)) isl_basic_map *bmap) { + // After this call, 'bmap' has a +1 reference count. + bmap = isl_basic_map_cow(bmap); // no-warning + // After this call, 'bmap' has a +0 reference count. + isl_basic_map_free(bmap); +} + //===----------------------------------------------------------------------===// // Test returning retained and not-retained values. 
//===----------------------------------------------------------------------===// diff --git a/test/Analysis/retain-release.m b/test/Analysis/retain-release.m index 39a3baa8960c..29af194a3d67 100644 --- a/test/Analysis/retain-release.m +++ b/test/Analysis/retain-release.m @@ -1447,7 +1447,7 @@ typedef NSString* MyStringTy; + (void) consume2:(id) CF_CONSUMED x; @end -static int ownership_attribute_doesnt_go_here NS_RETURNS_RETAINED; // expected-warning{{'ns_returns_retained' attribute only applies to functions and methods}} +static int ownership_attribute_doesnt_go_here NS_RETURNS_RETAINED; // expected-warning{{'ns_returns_retained' only applies to function types; type here is 'int'}} void test_attr_1(TestOwnershipAttr *X) { NSString *str = [X returnsAnOwnedString]; // expected-warning{{leak}} diff --git a/test/CodeGen/aarch64-type-sizes.c b/test/CodeGen/aarch64-type-sizes.c index ce8b51fc4085..195e8968ecb1 100644 --- a/test/CodeGen/aarch64-type-sizes.c +++ b/test/CodeGen/aarch64-type-sizes.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck --check-prefix=CHECK %s +// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck %s // char by definition has size 1 // CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/test/CodeGen/aarch64-varargs-ms.c b/test/CodeGen/aarch64-varargs-ms.c new file mode 100644 index 000000000000..f3ff9603c9c2 --- /dev/null +++ b/test/CodeGen/aarch64-varargs-ms.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple arm64-windows-msvc -emit-llvm -o - %s | FileCheck %s + +#include + +int simple_int(va_list ap) { +// CHECK-LABEL: define i32 @simple_int + return va_arg(ap, int); +// CHECK: [[ADDR:%[a-z_0-9]+]] = bitcast i8* %argp.cur to i32* +// CHECK: [[RESULT:%[a-z_0-9]+]] = load i32, i32* [[ADDR]] +// CHECK: ret i32 [[RESULT]] +} diff --git a/test/CodeGen/builtins-hexagon.c b/test/CodeGen/builtins-hexagon.c index e2eda2afafd5..f9f5d495d02a 100644 --- a/test/CodeGen/builtins-hexagon.c +++ b/test/CodeGen/builtins-hexagon.c @@ -2962,4 +2962,16 @@ void foo() { // CHECK: @llvm.hexagon.V6.vzh.128B __builtin_HEXAGON_V6_vzh(v16); // CHECK: @llvm.hexagon.V6.vzh + __builtin_HEXAGON_Y2_dccleana(0); + // CHECK: @llvm.hexagon.Y2.dccleana + __builtin_HEXAGON_Y2_dccleaninva(0); + // CHECK: @llvm.hexagon.Y2.dccleaninva + __builtin_HEXAGON_Y2_dcinva(0); + // CHECK: @llvm.hexagon.Y2.dcinva + __builtin_HEXAGON_Y2_dczeroa(0); + // CHECK: @llvm.hexagon.Y2.dczeroa + __builtin_HEXAGON_Y4_l2fetch(0, 0); + // CHECK: @llvm.hexagon.Y4.l2fetch + __builtin_HEXAGON_Y5_l2fetch(0, 0); + // CHECK: @llvm.hexagon.Y5.l2fetch } diff --git a/test/CodeGen/builtins-systemz-vector2-error.c b/test/CodeGen/builtins-systemz-vector2-error.c new file mode 100644 index 000000000000..ceebaf8cc72f --- /dev/null +++ b/test/CodeGen/builtins-systemz-vector2-error.c @@ -0,0 +1,61 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu z14 -triple s390x-unknown-unknown \ +// RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s + +typedef __attribute__((vector_size(16))) signed char vec_schar; +typedef __attribute__((vector_size(16))) signed short vec_sshort; +typedef __attribute__((vector_size(16))) signed int vec_sint; +typedef __attribute__((vector_size(16))) signed long long vec_slong; +typedef __attribute__((vector_size(16))) unsigned char vec_uchar; +typedef __attribute__((vector_size(16))) unsigned short vec_ushort; +typedef __attribute__((vector_size(16))) unsigned int vec_uint; 
+typedef __attribute__((vector_size(16))) unsigned long long vec_ulong; +typedef __attribute__((vector_size(16))) double vec_double; +typedef __attribute__((vector_size(16))) float vec_float; + +volatile vec_schar vsc; +volatile vec_sshort vss; +volatile vec_sint vsi; +volatile vec_slong vsl; +volatile vec_uchar vuc; +volatile vec_ushort vus; +volatile vec_uint vui; +volatile vec_ulong vul; +volatile vec_double vd; +volatile vec_float vf; + +volatile unsigned int len; +int cc; + +void test_integer(void) { + __builtin_s390_vmslg(vul, vul, vuc, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vmslg(vul, vul, vuc, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vmslg(vul, vul, vuc, len); // expected-error {{must be a constant integer}} +} + +void test_float(void) { + __builtin_s390_vfmaxdb(vd, vd, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmaxdb(vd, vd, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmaxdb(vd, vd, len); // expected-error {{must be a constant integer}} + __builtin_s390_vfmindb(vd, vd, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmindb(vd, vd, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmindb(vd, vd, len); // expected-error {{must be a constant integer}} + + __builtin_s390_vftcisb(vf, -1, &cc); // expected-error {{argument should be a value from 0 to 4095}} + __builtin_s390_vftcisb(vf, 4096, &cc); // expected-error {{argument should be a value from 0 to 4095}} + __builtin_s390_vftcisb(vf, len, &cc); // expected-error {{must be a constant integer}} + + __builtin_s390_vfisb(vf, -1, 0); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfisb(vf, 16, 0); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfisb(vf, len, 0); // expected-error {{must be a constant integer}} + __builtin_s390_vfisb(vf, 0, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfisb(vf, 0, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfisb(vf, 0, len); // expected-error {{must be a constant integer}} + + __builtin_s390_vfmaxsb(vf, vf, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmaxsb(vf, vf, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfmaxsb(vf, vf, len); // expected-error {{must be a constant integer}} + __builtin_s390_vfminsb(vf, vf, -1); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfminsb(vf, vf, 16); // expected-error {{argument should be a value from 0 to 15}} + __builtin_s390_vfminsb(vf, vf, len); // expected-error {{must be a constant integer}} +} diff --git a/test/CodeGen/builtins-systemz-vector2.c b/test/CodeGen/builtins-systemz-vector2.c new file mode 100644 index 000000000000..8f0b8aee8c2d --- /dev/null +++ b/test/CodeGen/builtins-systemz-vector2.c @@ -0,0 +1,136 @@ +// REQUIRES: systemz-registered-target +// RUN: %clang_cc1 -target-cpu z14 -triple s390x-ibm-linux -fno-lax-vector-conversions \ +// RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s + +typedef __attribute__((vector_size(16))) signed char vec_schar; +typedef __attribute__((vector_size(16))) signed short vec_sshort; +typedef __attribute__((vector_size(16))) signed int vec_sint; +typedef __attribute__((vector_size(16))) signed long long vec_slong; 
+typedef __attribute__((vector_size(16))) unsigned char vec_uchar; +typedef __attribute__((vector_size(16))) unsigned short vec_ushort; +typedef __attribute__((vector_size(16))) unsigned int vec_uint; +typedef __attribute__((vector_size(16))) unsigned long long vec_ulong; +typedef __attribute__((vector_size(16))) double vec_double; +typedef __attribute__((vector_size(16))) float vec_float; + +volatile vec_schar vsc; +volatile vec_sshort vss; +volatile vec_sint vsi; +volatile vec_slong vsl; +volatile vec_uchar vuc; +volatile vec_ushort vus; +volatile vec_uint vui; +volatile vec_ulong vul; +volatile vec_double vd; +volatile vec_float vf; + +volatile unsigned int len; +const void * volatile cptr; +void * volatile ptr; +int cc; + +void test_core(void) { + vul = __builtin_s390_vbperm(vuc, vuc); + // CHECK: call <2 x i64> @llvm.s390.vbperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + + vsc = __builtin_s390_vlrl(len, cptr); + // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 %{{.*}}, i8* %{{.*}}) + + __builtin_s390_vstrl(vsc, len, ptr); + // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 %{{.*}}, i8* %{{.*}}) +} + +void test_integer(void) { + vuc = __builtin_s390_vmslg(vul, vul, vuc, 0); + // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vuc = __builtin_s390_vmslg(vul, vul, vuc, 15); + // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 15) +} + +void test_float(void) { + vd = __builtin_s390_vfmaxdb(vd, vd, 4); + // CHECK: call <2 x double> @llvm.maxnum.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + vd = __builtin_s390_vfmaxdb(vd, vd, 0); + // CHECK: call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0) + vd = __builtin_s390_vfmaxdb(vd, vd, 15); + // CHECK: call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 15) + + vd = __builtin_s390_vfmindb(vd, vd, 4); + // CHECK: call <2 x double> @llvm.minnum.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + vd = __builtin_s390_vfmindb(vd, vd, 0); + // CHECK: call <2 x double> @llvm.s390.vfmindb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0) + vd = __builtin_s390_vfmindb(vd, vd, 15); + // CHECK: call <2 x double> @llvm.s390.vfmindb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 15) + + vd = __builtin_s390_vfnmadb(vd, vd, vd); + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: fsub <2 x double> , [[RES]] + vd = __builtin_s390_vfnmsdb(vd, vd, vd); + // CHECK: [[NEG:%[^ ]+]] = fsub <2 x double> , %{{.*}} + // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]]) + // CHECK: fsub <2 x double> , [[RES]] + + vsi = __builtin_s390_vfcesbs(vf, vf, &cc); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + vsi = __builtin_s390_vfchsbs(vf, vf, &cc); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + vsi = __builtin_s390_vfchesbs(vf, vf, &cc); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + + vsi = __builtin_s390_vftcisb(vf, 0, &cc); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 0) + vsi = __builtin_s390_vftcisb(vf, 4095, &cc); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 4095) + + vf = __builtin_s390_vfmaxsb(vf, vf, 
4); + // CHECK: call <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + vf = __builtin_s390_vfmaxsb(vf, vf, 0); + // CHECK: call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0) + vf = __builtin_s390_vfmaxsb(vf, vf, 15); + // CHECK: call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 15) + + vf = __builtin_s390_vfminsb(vf, vf, 4); + // CHECK: call <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + vf = __builtin_s390_vfminsb(vf, vf, 0); + // CHECK: call <4 x float> @llvm.s390.vfminsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0) + vf = __builtin_s390_vfminsb(vf, vf, 15); + // CHECK: call <4 x float> @llvm.s390.vfminsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 15) + + vf = __builtin_s390_vfsqsb(vf); + // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}}) + + vf = __builtin_s390_vfmasb(vf, vf, vf); + // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + vf = __builtin_s390_vfmssb(vf, vf, vf); + // CHECK: [[NEG:%[^ ]+]] = fsub <4 x float> , %{{.*}} + // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) + vf = __builtin_s390_vfnmasb(vf, vf, vf); + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: fsub <4 x float> , [[RES]] + vf = __builtin_s390_vfnmssb(vf, vf, vf); + // CHECK: [[NEG:%[^ ]+]] = fsub <4 x float> , %{{.*}} + // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]]) + // CHECK: fsub <4 x float> , [[RES]] + + vf = __builtin_s390_vflpsb(vf); + // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vflnsb(vf); + // CHECK: [[ABS:%[^ ]+]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}}) + // CHECK: fsub <4 x float> , [[ABS]] + + vf = __builtin_s390_vfisb(vf, 0, 0); + // CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 0); + // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 1); + // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 5); + // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 6); + // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 7); + // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}}) + vf = __builtin_s390_vfisb(vf, 4, 4); + // CHECK: call <4 x float> @llvm.s390.vfisb(<4 x float> %{{.*}}, i32 4, i32 4) +} + diff --git a/test/CodeGen/builtins-systemz-zvector-error.c b/test/CodeGen/builtins-systemz-zvector-error.c index b28e8d9bc1fc..8d5aaabfbac3 100644 --- a/test/CodeGen/builtins-systemz-zvector-error.c +++ b/test/CodeGen/builtins-systemz-zvector-error.c @@ -446,43 +446,43 @@ void test_integer(void) { // expected-note@vecintrin.h:* 1 {{must be a constant integer}} vsc = vec_sld(vsc, vsc, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sld(vsc, vsc, -1); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // 
expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sld(vsc, vsc, 16); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, -1); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vuc = vec_sld(vuc, vuc, 16); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vss = vec_sld(vss, vss, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vus = vec_sld(vus, vus, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vsi = vec_sld(vsi, vsi, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vui = vec_sld(vui, vui, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vsl = vec_sld(vsl, vsl, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vul = vec_sld(vul, vul, idx); // expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} - // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} + // expected-note@vecintrin.h:* 11 {{candidate function not viable}} + // expected-note@vecintrin.h:* 2 {{must be a constant integer from 0 to 15}} vd = vec_sld(vd, vd, idx); // 
expected-error {{no matching function}} - // expected-note@vecintrin.h:* 8 {{candidate function not viable}} + // expected-note@vecintrin.h:* 12 {{candidate function not viable}} // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}} vsc = vec_sldw(vsc, vsc, idx); // expected-error {{no matching function}} diff --git a/test/CodeGen/builtins-systemz-zvector.c b/test/CodeGen/builtins-systemz-zvector.c index 6d554af44e93..a8adbd717ea1 100644 --- a/test/CodeGen/builtins-systemz-zvector.c +++ b/test/CodeGen/builtins-systemz-zvector.c @@ -294,6 +294,16 @@ void test_core(void) { vec_scatter_element(vd, vul, ptrd, 0); vec_scatter_element(vd, vul, ptrd, 1); + vsc = vec_xl(idx, cptrsc); + vuc = vec_xl(idx, cptruc); + vss = vec_xl(idx, cptrss); + vus = vec_xl(idx, cptrus); + vsi = vec_xl(idx, cptrsi); + vui = vec_xl(idx, cptrui); + vsl = vec_xl(idx, cptrsl); + vul = vec_xl(idx, cptrul); + vd = vec_xl(idx, cptrd); + vsc = vec_xld2(idx, cptrsc); vuc = vec_xld2(idx, cptruc); vss = vec_xld2(idx, cptrss); @@ -311,6 +321,16 @@ void test_core(void) { vsi = vec_xlw4(idx, cptrsi); vui = vec_xlw4(idx, cptrui); + vec_xst(vsc, idx, ptrsc); + vec_xst(vuc, idx, ptruc); + vec_xst(vss, idx, ptrss); + vec_xst(vus, idx, ptrus); + vec_xst(vsi, idx, ptrsi); + vec_xst(vui, idx, ptrui); + vec_xst(vsl, idx, ptrsl); + vec_xst(vul, idx, ptrul); + vec_xst(vd, idx, ptrd); + vec_xstd2(vsc, idx, ptrsc); vec_xstd2(vuc, idx, ptruc); vec_xstd2(vss, idx, ptrss); @@ -1841,6 +1861,10 @@ void test_integer(void) { // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vuc = vec_sld(vuc, vuc, 15); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + vbc = vec_sld(vbc, vbc, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vbc = vec_sld(vbc, vbc, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) vss = vec_sld(vss, vss, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vss = vec_sld(vss, vss, 15); @@ -1849,6 +1873,10 @@ void test_integer(void) { // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vus = vec_sld(vus, vus, 15); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + vbs = vec_sld(vbs, vbs, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vbs = vec_sld(vbs, vbs, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) vsi = vec_sld(vsi, vsi, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vsi = vec_sld(vsi, vsi, 15); @@ -1857,6 +1885,10 @@ void test_integer(void) { // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vui = vec_sld(vui, vui, 15); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + vbi = vec_sld(vbi, vbi, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vbi = vec_sld(vbi, vbi, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) vsl = vec_sld(vsl, vsl, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vsl = vec_sld(vsl, vsl, 15); @@ -1865,6 +1897,10 @@ void test_integer(void) { // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vul = vec_sld(vul, vul, 15); // CHECK: 
call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + vbl = vec_sld(vbl, vbl, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vbl = vec_sld(vbl, vbl, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) vd = vec_sld(vd, vd, 0); // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) vd = vec_sld(vd, vd, 15); @@ -2943,6 +2979,16 @@ void test_float(void) { // CHECK: [[VAL:%[^ ]+]] = fmul <2 x double> %{{.*}}, // CHECK: fptoui <2 x double> [[VAL]] to <2 x i64> + vd = vec_double(vsl); + // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> + vd = vec_double(vul); + // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> + + vsl = vec_signed(vd); + // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> + vul = vec_unsigned(vd); + // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> + vd = vec_roundp(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}}) vd = vec_ceil(vd); @@ -2957,6 +3003,8 @@ void test_float(void) { // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{.*}}) vd = vec_roundc(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{.*}}) + vd = vec_rint(vd); + // CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double> %{{.*}}) vd = vec_round(vd); // CHECK: call <2 x double> @llvm.s390.vfidb(<2 x double> %{{.*}}, i32 4, i32 4) @@ -2964,4 +3012,44 @@ void test_float(void) { // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 0) vbl = vec_fp_test_data_class(vd, 4095, &cc); // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 4095) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_ZERO_P, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 2048) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_ZERO_N, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 1024) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_ZERO, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 3072) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_NORMAL_P, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 512) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_NORMAL_N, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 256) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_NORMAL, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 768) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SUBNORMAL_P, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 128) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SUBNORMAL_N, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 64) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SUBNORMAL, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 192) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_INFINITY_P, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 32) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_INFINITY_N, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 16) + vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_INFINITY, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 48) + vbl = vec_fp_test_data_class(vd, 
__VEC_CLASS_FP_QNAN_P, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 8)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_QNAN_N, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 4)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_QNAN, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 12)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SNAN_P, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 2)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SNAN_N, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 1)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_SNAN, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 3)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_NAN, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 15)
+  vbl = vec_fp_test_data_class(vd, __VEC_CLASS_FP_NOT_NORMAL, &cc);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 3327)
 }
diff --git a/test/CodeGen/builtins-systemz-zvector2-error.c b/test/CodeGen/builtins-systemz-zvector2-error.c
new file mode 100644
index 000000000000..823d6374dc0e
--- /dev/null
+++ b/test/CodeGen/builtins-systemz-zvector2-error.c
@@ -0,0 +1,153 @@
+// REQUIRES: systemz-registered-target
+// RUN: %clang_cc1 -target-cpu z14 -triple s390x-linux-gnu \
+// RUN: -fzvector -fno-lax-vector-conversions \
+// RUN: -Wall -Wno-unused -Werror -fsyntax-only -verify %s
+
+#include <vecintrin.h>
+
+volatile vector signed char vsc;
+volatile vector signed short vss;
+volatile vector signed int vsi;
+volatile vector signed long long vsl;
+volatile vector unsigned char vuc;
+volatile vector unsigned short vus;
+volatile vector unsigned int vui;
+volatile vector unsigned long long vul;
+volatile vector bool char vbc;
+volatile vector bool short vbs;
+volatile vector bool int vbi;
+volatile vector bool long long vbl;
+volatile vector float vf;
+volatile vector double vd;
+
+volatile signed char sc;
+volatile signed short ss;
+volatile signed int si;
+volatile signed long long sl;
+volatile unsigned char uc;
+volatile unsigned short us;
+volatile unsigned int ui;
+volatile unsigned long long ul;
+volatile float f;
+volatile double d;
+
+const void * volatile cptr;
+const signed char * volatile cptrsc;
+const signed short * volatile cptrss;
+const signed int * volatile cptrsi;
+const signed long long * volatile cptrsl;
+const unsigned char * volatile cptruc;
+const unsigned short * volatile cptrus;
+const unsigned int * volatile cptrui;
+const unsigned long long * volatile cptrul;
+const float * volatile cptrf;
+const double * volatile cptrd;
+
+void * volatile ptr;
+signed char * volatile ptrsc;
+signed short * volatile ptrss;
+signed int * volatile ptrsi;
+signed long long * volatile ptrsl;
+unsigned char * volatile ptruc;
+unsigned short * volatile ptrus;
+unsigned int * volatile ptrui;
+unsigned long long * volatile ptrul;
+float * volatile ptrf;
+double * volatile ptrd;
+
+volatile unsigned int len;
+volatile int idx;
+int cc;
+
+void test_core(void) {
+  vf = vec_gather_element(vf, vui, cptrf, idx); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 7 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}}
+  vf = vec_gather_element(vf, vui, cptrf, -1); // expected-error {{no matching function}}
+  //
expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vf = vec_gather_element(vf, vui, cptrf, 4); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vd = vec_gather_element(vd, vul, cptrd, idx); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + vd = vec_gather_element(vd, vul, cptrd, -1); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + vd = vec_gather_element(vd, vul, cptrd, 2); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + + vec_scatter_element(vf, vui, ptrf, idx); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vec_scatter_element(vf, vui, ptrf, -1); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vec_scatter_element(vf, vui, ptrf, 4); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vec_scatter_element(vd, vul, ptrd, idx); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + vec_scatter_element(vd, vul, ptrd, -1); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + vec_scatter_element(vd, vul, ptrd, 2); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 7 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + + vf = vec_splat(vf, idx); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vf = vec_splat(vf, -1); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vf = vec_splat(vf, 4); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 3}} + vd = vec_splat(vd, idx); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}} + vd = vec_splat(vd, -1); // expected-error {{no matching function}} + // expected-note@vecintrin.h:* 13 {{candidate function not viable}} + // expected-note@vecintrin.h:* 1 {{must be a constant 
integer from 0 to 1}}
+  vd = vec_splat(vd, 2); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 1}}
+}
+
+void test_integer(void) {
+  vf = vec_sld(vf, vf, idx); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
+  vd = vec_sld(vd, vd, idx); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
+
+  vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{must be a constant integer}}
+  vuc = vec_msum_u128(vul, vul, vuc, -1); // expected-error {{should be a value from 0 to 15}}
+  vuc = vec_msum_u128(vul, vul, vuc, 16); // expected-error {{should be a value from 0 to 15}}
+}
+
+void test_float(void) {
+  vbi = vec_fp_test_data_class(vf, idx, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+  vbi = vec_fp_test_data_class(vf, -1, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+  vbi = vec_fp_test_data_class(vf, 4096, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+  vbl = vec_fp_test_data_class(vd, idx, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+  vbl = vec_fp_test_data_class(vd, -1, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+  vbl = vec_fp_test_data_class(vd, 4096, &cc); // expected-error {{no matching function}}
+  // expected-note@vecintrin.h:* 1 {{candidate function not viable}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 4095}}
+}
diff --git a/test/CodeGen/builtins-systemz-zvector2.c b/test/CodeGen/builtins-systemz-zvector2.c
new file mode 100644
index 000000000000..d9607b3a8788
--- /dev/null
+++ b/test/CodeGen/builtins-systemz-zvector2.c
@@ -0,0 +1,545 @@
+// REQUIRES: systemz-registered-target
+// RUN: %clang_cc1 -target-cpu z14 -triple s390x-linux-gnu \
+// RUN: -O -fzvector -fno-lax-vector-conversions \
+// RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s
+
+#include <vecintrin.h>
+
+volatile vector signed char vsc;
+volatile vector signed short vss;
+volatile vector signed int vsi;
+volatile vector signed long long vsl;
+volatile vector unsigned char vuc;
+volatile vector unsigned short vus;
+volatile vector unsigned int vui;
+volatile vector unsigned long long vul;
+volatile vector bool char vbc;
+volatile vector bool short vbs;
+volatile vector bool int vbi;
+volatile vector bool long long vbl;
+volatile vector float vf;
+volatile vector double vd;
+
+volatile signed char sc;
+volatile signed short ss;
+volatile signed int si;
+volatile signed long long sl;
+volatile unsigned char uc;
+volatile unsigned short us; +volatile unsigned int ui; +volatile unsigned long long ul; +volatile float f; +volatile double d; + +const void * volatile cptr; +const signed char * volatile cptrsc; +const signed short * volatile cptrss; +const signed int * volatile cptrsi; +const signed long long * volatile cptrsl; +const unsigned char * volatile cptruc; +const unsigned short * volatile cptrus; +const unsigned int * volatile cptrui; +const unsigned long long * volatile cptrul; +const float * volatile cptrf; +const double * volatile cptrd; + +void * volatile ptr; +signed char * volatile ptrsc; +signed short * volatile ptrss; +signed int * volatile ptrsi; +signed long long * volatile ptrsl; +unsigned char * volatile ptruc; +unsigned short * volatile ptrus; +unsigned int * volatile ptrui; +unsigned long long * volatile ptrul; +float * volatile ptrf; +double * volatile ptrd; + +volatile unsigned int len; +volatile int idx; +int cc; + +void test_core(void) { + f = vec_extract(vf, idx); + // CHECK: extractelement <4 x float> %{{.*}}, i32 %{{.*}} + d = vec_extract(vd, idx); + // CHECK: extractelement <2 x double> %{{.*}}, i32 %{{.*}} + + vf = vec_insert(d, vf, idx); + // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 %{{.*}} + vd = vec_insert(f, vd, idx); + // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 %{{.*}} + + vf = vec_promote(f, idx); + // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 %{{.*}} + vd = vec_promote(d, idx); + // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 %{{.*}} + + vf = vec_insert_and_zero(cptrf); + // CHECK: insertelement <4 x float> , float %{{.*}}, i32 0 + vd = vec_insert_and_zero(cptrd); + // CHECK: insertelement <2 x double> , double %{{.*}}, i32 0 + + vf = vec_perm(vf, vf, vuc); + // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + vd = vec_perm(vd, vd, vuc); + // CHECK: call <16 x i8> @llvm.s390.vperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + + vul = vec_bperm_u128(vuc, vuc); + // CHECK: call <2 x i64> @llvm.s390.vbperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + + vf = vec_sel(vf, vf, vui); + vf = vec_sel(vf, vf, vbi); + vd = vec_sel(vd, vd, vul); + vd = vec_sel(vd, vd, vbl); + + vf = vec_gather_element(vf, vui, cptrf, 0); + vf = vec_gather_element(vf, vui, cptrf, 1); + vf = vec_gather_element(vf, vui, cptrf, 2); + vf = vec_gather_element(vf, vui, cptrf, 3); + vd = vec_gather_element(vd, vul, cptrd, 0); + vd = vec_gather_element(vd, vul, cptrd, 1); + + vec_scatter_element(vf, vui, ptrf, 0); + vec_scatter_element(vf, vui, ptrf, 1); + vec_scatter_element(vf, vui, ptrf, 2); + vec_scatter_element(vf, vui, ptrf, 3); + vec_scatter_element(vd, vul, ptrd, 0); + vec_scatter_element(vd, vul, ptrd, 1); + + vf = vec_xl(idx, cptrf); + vd = vec_xl(idx, cptrd); + + vec_xst(vf, idx, ptrf); + vec_xst(vd, idx, ptrd); + + vd = vec_load_bndry(cptrd, 64); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 0) + vf = vec_load_bndry(cptrf, 64); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 0) + vf = vec_load_bndry(cptrf, 128); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 1) + vf = vec_load_bndry(cptrf, 256); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 2) + vf = vec_load_bndry(cptrf, 512); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 3) + vf = vec_load_bndry(cptrf, 1024); + // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 4) + vf = vec_load_bndry(cptrf, 2048); + // CHECK: call <16 x i8> 
@llvm.s390.vlbb(i8* %{{.*}}, i32 5)
+  vf = vec_load_bndry(cptrf, 4096);
+  // CHECK: call <16 x i8> @llvm.s390.vlbb(i8* %{{.*}}, i32 6)
+
+  vf = vec_load_len(cptrf, idx);
+  // CHECK: call <16 x i8> @llvm.s390.vll(i32 %{{.*}}, i8* %{{.*}})
+  vd = vec_load_len(cptrd, idx);
+  // CHECK: call <16 x i8> @llvm.s390.vll(i32 %{{.*}}, i8* %{{.*}})
+
+  vec_store_len(vf, ptrf, idx);
+  // CHECK: call void @llvm.s390.vstl(<16 x i8> %{{.*}}, i32 %{{.*}}, i8* %{{.*}})
+  vec_store_len(vd, ptrd, idx);
+  // CHECK: call void @llvm.s390.vstl(<16 x i8> %{{.*}}, i32 %{{.*}}, i8* %{{.*}})
+
+  vuc = vec_load_len_r(cptruc, idx);
+  // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 %{{.*}}, i8* %{{.*}})
+
+  vec_store_len_r(vuc, ptruc, idx);
+  // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 %{{.*}}, i8* %{{.*}})
+
+  vf = vec_splat(vf, 0);
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> zeroinitializer
+  vf = vec_splat(vf, 1);
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  vd = vec_splat(vd, 0);
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> zeroinitializer
+  vd = vec_splat(vd, 1);
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+
+  vf = vec_splats(f);
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> zeroinitializer
+  vd = vec_splats(d);
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> zeroinitializer
+
+  vf = vec_mergeh(vf, vf);
+  // shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  vd = vec_mergeh(vd, vd);
+  // shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
+
+  vf = vec_mergel(vf, vf);
+  // shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <i32 2, i32 6, i32 3, i32 7>
+  vd = vec_mergel(vd, vd);
+  // shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <i32 1, i32 3>
+}
+
+void test_compare(void) {
+  vbi = vec_cmpeq(vf, vf);
+  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
+  vbl = vec_cmpeq(vd, vd);
+  // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
+
+  vbi = vec_cmpge(vf, vf);
+  // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
+  vbl = vec_cmpge(vd, vd);
+  // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
+
+  vbi = vec_cmpgt(vf, vf);
+  // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
+  vbl = vec_cmpgt(vd, vd);
+  // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
+
+  vbi = vec_cmple(vf, vf);
+  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
+  vbl = vec_cmple(vd, vd);
+  // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
+
+  vbi = vec_cmplt(vf, vf);
+  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
+  vbl = vec_cmplt(vd, vd);
+  // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
+
+  idx = vec_all_eq(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_eq(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfcedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_ne(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_ne(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfcedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_ge(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_ge(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_gt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_gt(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_le(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_le(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_lt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_lt(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_nge(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_nge(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_ngt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_ngt(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_nle(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_nle(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_nlt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_all_nlt(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_all_nan(vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 15)
+  idx = vec_all_nan(vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 15)
+
+  idx = vec_all_numeric(vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 15)
+  idx = vec_all_numeric(vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 15)
+
+  idx = vec_any_eq(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_eq(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfcedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_any_ne(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_ne(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfcedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_any_ge(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_ge(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_any_gt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_gt(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_any_le(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_le(vd, vd);
+  // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  idx = vec_any_lt(vf, vf);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  idx = vec_any_lt(vd, vd);
+ // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + + idx = vec_any_nge(vf, vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + idx = vec_any_nge(vd, vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + + idx = vec_any_ngt(vf, vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + idx = vec_any_ngt(vd, vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + + idx = vec_any_nle(vf, vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + idx = vec_any_nle(vd, vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchedbs(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + + idx = vec_any_nlt(vf, vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + idx = vec_any_nlt(vd, vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vfchdbs(<2 x double> %{{.*}}, <2 x double> %{{.*}}) + + idx = vec_any_nan(vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 15) + idx = vec_any_nan(vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 15) + + idx = vec_any_numeric(vf); + // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 15) + idx = vec_any_numeric(vd); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 15) +} + +void test_integer(void) { + vf = vec_andc(vf, vf); + vd = vec_andc(vd, vd); + + vf = vec_nor(vf, vf); + vd = vec_nor(vd, vd); + + vsc = vec_nand(vsc, vsc); + vuc = vec_nand(vuc, vuc); + vbc = vec_nand(vbc, vbc); + vss = vec_nand(vss, vss); + vus = vec_nand(vus, vus); + vbs = vec_nand(vbs, vbs); + vsi = vec_nand(vsi, vsi); + vui = vec_nand(vui, vui); + vbi = vec_nand(vbi, vbi); + vsl = vec_nand(vsl, vsl); + vul = vec_nand(vul, vul); + vbl = vec_nand(vbl, vbl); + vf = vec_nand(vf, vf); + vd = vec_nand(vd, vd); + + vsc = vec_orc(vsc, vsc); + vuc = vec_orc(vuc, vuc); + vbc = vec_orc(vbc, vbc); + vss = vec_orc(vss, vss); + vus = vec_orc(vus, vus); + vbs = vec_orc(vbs, vbs); + vsi = vec_orc(vsi, vsi); + vui = vec_orc(vui, vui); + vbi = vec_orc(vbi, vbi); + vsl = vec_orc(vsl, vsl); + vul = vec_orc(vul, vul); + vbl = vec_orc(vbl, vbl); + vf = vec_orc(vf, vf); + vd = vec_orc(vd, vd); + + vsc = vec_eqv(vsc, vsc); + vuc = vec_eqv(vuc, vuc); + vbc = vec_eqv(vbc, vbc); + vss = vec_eqv(vss, vss); + vus = vec_eqv(vus, vus); + vbs = vec_eqv(vbs, vbs); + vsi = vec_eqv(vsi, vsi); + vui = vec_eqv(vui, vui); + vbi = vec_eqv(vbi, vbi); + vsl = vec_eqv(vsl, vsl); + vul = vec_eqv(vul, vul); + vbl = vec_eqv(vbl, vbl); + vf = vec_eqv(vf, vf); + vd = vec_eqv(vd, vd); + + vf = vec_slb(vf, vsi); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + vf = vec_slb(vf, vui); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + vd = vec_slb(vd, vsl); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + vd = vec_slb(vd, vul); + // CHECK: call <16 x i8> @llvm.s390.vslb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + + vf = vec_sld(vf, vf, 0); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0) + vf = vec_sld(vf, vf, 15); + // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15) + vd = vec_sld(vd, vd, 0); + // CHECK: call <16 
x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  vd = vec_sld(vd, vd, 15);
+  // CHECK: call <16 x i8> @llvm.s390.vsldb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i32 15)
+
+  vf = vec_srab(vf, vsi);
+  // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vf = vec_srab(vf, vui);
+  // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vd = vec_srab(vd, vsl);
+  // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vd = vec_srab(vd, vul);
+  // CHECK: call <16 x i8> @llvm.s390.vsrab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+
+  vf = vec_srb(vf, vsi);
+  // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vf = vec_srb(vf, vui);
+  // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vd = vec_srb(vd, vsl);
+  // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  vd = vec_srb(vd, vul);
+  // CHECK: call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+
+  idx = vec_test_mask(vf, vui);
+  // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  idx = vec_test_mask(vd, vul);
+  // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+
+  vuc = vec_msum_u128(vul, vul, vuc, 0);
+  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  vuc = vec_msum_u128(vul, vul, vuc, 4);
+  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 4)
+  vuc = vec_msum_u128(vul, vul, vuc, 8);
+  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 8)
+  vuc = vec_msum_u128(vul, vul, vuc, 12);
+  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 12)
+}
+
+void test_float(void) {
+  vf = vec_abs(vf);
+  // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}})
+  vd = vec_abs(vd);
+  // CHECK: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_nabs(vf);
+  // CHECK: [[ABS:%[^ ]+]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}})
+  // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[ABS]]
+  vd = vec_nabs(vd);
+  // CHECK: [[ABS:%[^ ]+]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+  // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[ABS]]
+
+  vf = vec_max(vf, vf);
+  // CHECK: call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0)
+  vd = vec_max(vd, vd);
+  // CHECK: call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0)
+
+  vf = vec_min(vf, vf);
+  // CHECK: call <4 x float> @llvm.s390.vfminsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0)
+  vd = vec_min(vd, vd);
+  // CHECK: call <2 x double> @llvm.s390.vfmindb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0)
+
+  vf = vec_madd(vf, vf, vf);
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  vd = vec_madd(vd, vd, vd);
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+
+  vf = vec_msub(vf, vf, vf);
+  // CHECK: [[NEG:%[^ ]+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
+  vd = vec_msub(vd, vd, vd);
+  // CHECK: [[NEG:%[^ ]+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+  // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
+
+  vf = vec_nmadd(vf, vf, vf);
+  // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[RES]]
+  vd = vec_nmadd(vd, vd, vd);
+  // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[RES]]
+
+  vf = vec_nmsub(vf, vf, vf);
+  // CHECK: [[NEG:%[^ ]+]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{.*}}
+  // CHECK: [[RES:%[^ ]+]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
+  // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[RES]]
+  vd = vec_nmsub(vd, vd, vd);
+  // CHECK: [[NEG:%[^ ]+]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{.*}}
+  // CHECK: [[RES:%[^ ]+]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
+  // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[RES]]
+
+  vf = vec_sqrt(vf);
+  // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}})
+  vd = vec_sqrt(vd);
+  // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
+
+  vd = vec_doublee(vf);
+  // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
+  vf = vec_floate(vd);
+  // CHECK: fptrunc <2 x double> %{{.*}} to <2 x float>
+
+  vd = vec_double(vsl);
+  // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
+  vd = vec_double(vul);
+  // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
+
+  vsl = vec_signed(vd);
+  // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
+  vul = vec_unsigned(vd);
+  // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
+
+  vf = vec_roundp(vf);
+  // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
+  vf = vec_ceil(vf);
+  // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
+  vd = vec_roundp(vd);
+  // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}})
+  vd = vec_ceil(vd);
+  // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_roundm(vf);
+  // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
+  vf = vec_floor(vf);
+  // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
+  vd = vec_roundm(vd);
+  // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}})
+  vd = vec_floor(vd);
+  // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_roundz(vf);
+  // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{.*}})
+  vf = vec_trunc(vf);
+  // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{.*}})
+  vd = vec_roundz(vd);
+  // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{.*}})
+  vd = vec_trunc(vd);
+  // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_roundc(vf);
+  // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}})
+  vd = vec_roundc(vd);
+  // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_rint(vf);
+  // CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float> %{{.*}})
+  vd = vec_rint(vd);
+  // CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double> %{{.*}})
+
+  vf = vec_round(vf);
+  // CHECK: call <4 x float> @llvm.s390.vfisb(<4 x float> %{{.*}}, i32 4, i32 4)
+  vd = vec_round(vd);
+  // CHECK: call <2 x double> @llvm.s390.vfidb(<2 x double> %{{.*}}, i32 4, i32 4)
+
+  vbi = vec_fp_test_data_class(vf, 0, &cc);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 0)
+  vbi = vec_fp_test_data_class(vf, 4095, &cc);
+  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 4095)
+  vbl = vec_fp_test_data_class(vd, 0,
&cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 0) + vbl = vec_fp_test_data_class(vd, 4095, &cc); + // CHECK: call { <2 x i64>, i32 } @llvm.s390.vftcidb(<2 x double> %{{.*}}, i32 4095) +} diff --git a/test/CodeGen/coff-aarch64-type-sizes.c b/test/CodeGen/coff-aarch64-type-sizes.c new file mode 100644 index 000000000000..1d8a64520469 --- /dev/null +++ b/test/CodeGen/coff-aarch64-type-sizes.c @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 -triple aarch64-windows -emit-llvm -w -o - %s | FileCheck %s + +// CHECK: target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +// CHECK: target triple = "aarch64--windows-msvc" + +int check_short() { + return sizeof(short); +// CHECK: ret i32 2 +} + +int check_int() { + return sizeof(int); +// CHECK: ret i32 4 +} + +int check_long() { + return sizeof(long); +// CHECK: ret i32 4 +} + +int check_longlong() { + return sizeof(long long); +// CHECK: ret i32 8 +} + +int check_int128() { + return sizeof(__int128); +// CHECK: ret i32 16 +} + +int check_fp16() { + return sizeof(__fp16); +// CHECK: ret i32 2 +} + +int check_float() { + return sizeof(float); +// CHECK: ret i32 4 +} + +int check_double() { + return sizeof(double); +// CHECK: ret i32 8 +} + +int check_longdouble() { + return sizeof(long double); +// CHECK: ret i32 8 +} + +int check_floatComplex() { + return sizeof(float _Complex); +// CHECK: ret i32 8 +} + +int check_doubleComplex() { + return sizeof(double _Complex); +// CHECK: ret i32 16 +} + +int check_longdoubleComplex() { + return sizeof(long double _Complex); +// CHECK: ret i32 16 +} + +int check_bool() { + return sizeof(_Bool); +// CHECK: ret i32 1 +} + +int check_wchar() { + return sizeof(__WCHAR_TYPE__); +// CHECK: ret i32 2 +} + +int check_wchar_unsigned() { + return (__WCHAR_TYPE__)-1 > (__WCHAR_TYPE__)0; +// CHECK: ret i32 1 +} + +enum Small { + Item +}; + +int foo() { + return sizeof(enum Small); +// CHECK: ret i32 4 +} diff --git a/test/CodeGen/debug-info-imported-entity.cpp b/test/CodeGen/debug-info-imported-entity.cpp index c7935eda48c5..398536e581ae 100644 --- a/test/CodeGen/debug-info-imported-entity.cpp +++ b/test/CodeGen/debug-info-imported-entity.cpp @@ -5,5 +5,6 @@ using std::A; using ::A; // CHECK: [[CompileUnit:![0-9]+]] = distinct !DICompileUnit({{.+}} imports: [[Imports:![0-9]+]] // CHECK: [[Imports]] = !{[[ImportedEntity:![0-9]+]]} -// CHECK: [[ImportedEntity]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CompileUnit]], entity: [[STDA:![0-9]+]], line: 4) +// CHECK: [[ImportedEntity]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CompileUnit]], entity: [[STDA:![0-9]+]], file: [[FILE:![0-9]+]], line: 4) // CHECK: [[STDA]] = !DICompositeType(tag: DW_TAG_class_type, name: "A", +// CHECK: [[FILE]] = !DIFile(filename: {{.*}}debug-info-imported-entity.cpp diff --git a/test/CodeGen/ms_abi.c b/test/CodeGen/ms_abi.c index 97f66b29c517..407087e40916 100644 --- a/test/CodeGen/ms_abi.c +++ b/test/CodeGen/ms_abi.c @@ -15,20 +15,20 @@ void f3(void) { // FREEBSD-LABEL: define void @f3() // WIN64-LABEL: define void @f3() f1(); - // FREEBSD: call x86_64_win64cc void @f1() + // FREEBSD: call win64cc void @f1() // WIN64: call void @f1() f2(); // FREEBSD: call void @f2() // WIN64: call x86_64_sysvcc void @f2() } -// FREEBSD: declare x86_64_win64cc void @f1() +// FREEBSD: declare win64cc void @f1() // FREEBSD: declare void @f2() // WIN64: declare void @f1() // WIN64: declare x86_64_sysvcc void @f2() // Win64 ABI varargs void __attribute__((ms_abi)) f4(int a, ...) 
{ - // FREEBSD-LABEL: define x86_64_win64cc void @f4 + // FREEBSD-LABEL: define win64cc void @f4 // WIN64-LABEL: define void @f4 __builtin_ms_va_list ap; __builtin_ms_va_start(ap, a); diff --git a/test/CodeGen/ms_abi_aarch64.c b/test/CodeGen/ms_abi_aarch64.c new file mode 100644 index 000000000000..4aa93f00859a --- /dev/null +++ b/test/CodeGen/ms_abi_aarch64.c @@ -0,0 +1,68 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm < %s | FileCheck -check-prefix=LINUX %s +// RUN: %clang_cc1 -triple aarch64-pc-win32 -emit-llvm < %s | FileCheck -check-prefix=WIN64 %s + +void __attribute__((ms_abi)) f1(void); +void f2(void); +void f3(void) { + // LINUX-LABEL: define void @f3() + // WIN64-LABEL: define void @f3() + f1(); + // LINUX: call win64cc void @f1() + // WIN64: call void @f1() + f2(); + // LINUX: call void @f2() + // WIN64: call void @f2() +} +// LINUX: declare win64cc void @f1() +// LINUX: declare void @f2() +// WIN64: declare void @f1() +// WIN64: declare void @f2() + +// Win64 ABI varargs +void __attribute__((ms_abi)) f4(int a, ...) { + // LINUX-LABEL: define win64cc void @f4 + // WIN64-LABEL: define void @f4 + __builtin_ms_va_list ap; + __builtin_ms_va_start(ap, a); + // LINUX: %[[AP:.*]] = alloca i8* + // LINUX: call void @llvm.va_start + // WIN64: %[[AP:.*]] = alloca i8* + // WIN64: call void @llvm.va_start + int b = __builtin_va_arg(ap, int); + // LINUX: %[[AP_CUR:.*]] = load i8*, i8** %[[AP]] + // LINUX-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR]], i64 8 + // LINUX-NEXT: store i8* %[[AP_NEXT]], i8** %[[AP]] + // LINUX-NEXT: bitcast i8* %[[AP_CUR]] to i32* + // WIN64: %[[AP_CUR:.*]] = load i8*, i8** %[[AP]] + // WIN64-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR]], i64 8 + // WIN64-NEXT: store i8* %[[AP_NEXT]], i8** %[[AP]] + // WIN64-NEXT: bitcast i8* %[[AP_CUR]] to i32* + __builtin_ms_va_list ap2; + __builtin_ms_va_copy(ap2, ap); + // LINUX: %[[AP_VAL:.*]] = load i8*, i8** %[[AP]] + // LINUX-NEXT: store i8* %[[AP_VAL]], i8** %[[AP2:.*]] + // WIN64: %[[AP_VAL:.*]] = load i8*, i8** %[[AP]] + // WIN64-NEXT: store i8* %[[AP_VAL]], i8** %[[AP2:.*]] + __builtin_ms_va_end(ap); + // LINUX: call void @llvm.va_end + // WIN64: call void @llvm.va_end +} + +// Let's verify that normal va_lists work right on Win64, too. +void f5(int a, ...) 
{ + // WIN64-LABEL: define void @f5 + __builtin_va_list ap; + __builtin_va_start(ap, a); + // WIN64: %[[AP:.*]] = alloca i8* + // WIN64: call void @llvm.va_start + int b = __builtin_va_arg(ap, int); + // WIN64: %[[AP_CUR:.*]] = load i8*, i8** %[[AP]] + // WIN64-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, i8* %[[AP_CUR]], i64 8 + // WIN64-NEXT: store i8* %[[AP_NEXT]], i8** %[[AP]] + // WIN64-NEXT: bitcast i8* %[[AP_CUR]] to i32* + __builtin_va_list ap2; + __builtin_va_copy(ap2, ap); + // WIN64: call void @llvm.va_copy + __builtin_va_end(ap); + // WIN64: call void @llvm.va_end +} diff --git a/test/CodeGen/systemz-abi-vector.c b/test/CodeGen/systemz-abi-vector.c index 455e3bbcced2..191c2321bb1d 100644 --- a/test/CodeGen/systemz-abi-vector.c +++ b/test/CodeGen/systemz-abi-vector.c @@ -6,6 +6,10 @@ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch11 \ // RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z14 \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch12 \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-VECTOR %s // Vector types diff --git a/test/CodeGen/systemz-abi.c b/test/CodeGen/systemz-abi.c index 17fdb9e322c8..de5dce54b5f0 100644 --- a/test/CodeGen/systemz-abi.c +++ b/test/CodeGen/systemz-abi.c @@ -6,6 +6,10 @@ // RUN: -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch11 \ // RUN: -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z14 \ +// RUN: -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu arch12 \ +// RUN: -emit-llvm -o - %s | FileCheck %s // Scalar types diff --git a/test/CodeGen/target-data.c b/test/CodeGen/target-data.c index 68ee8f02d2ee..851ce5831fa3 100644 --- a/test/CodeGen/target-data.c +++ b/test/CodeGen/target-data.c @@ -171,6 +171,10 @@ // RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR // RUN: %clang_cc1 -triple s390x-unknown -target-cpu arch11 -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR +// RUN: %clang_cc1 -triple s390x-unknown -target-cpu z14 -o - -emit-llvm %s | \ +// RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR +// RUN: %clang_cc1 -triple s390x-unknown -target-cpu arch12 -o - -emit-llvm %s | \ +// RUN: FileCheck %s -check-prefix=SYSTEMZ-VECTOR // SYSTEMZ-VECTOR: target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" // RUN: %clang_cc1 -triple msp430-unknown -o - -emit-llvm %s | \ diff --git a/test/CodeGen/ubsan-pointer-overflow.m b/test/CodeGen/ubsan-pointer-overflow.m index da622355fdb4..977e2458384b 100644 --- a/test/CodeGen/ubsan-pointer-overflow.m +++ b/test/CodeGen/ubsan-pointer-overflow.m @@ -10,16 +10,20 @@ void unary_arith(char *p) { ++p; // CHECK: ptrtoint i8* {{.*}} to i64, !nosanitize - // CHECK-NEXT: add i64 {{.*}}, -1, !nosanitize - // CHECK: select i1 false{{.*}}, !nosanitize + // CHECK-NEXT: [[COMPGEP:%.*]] = add i64 {{.*}}, -1, !nosanitize + // CHECK: [[NEGVALID:%.*]] = icmp ule i64 [[COMPGEP]], {{.*}}, !nosanitize + // CHECK-NOT: select + // CHECK: br i1 [[NEGVALID]]{{.*}}, !nosanitize // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}} --p; + // CHECK: icmp uge i64 // CHECK-NOT: select // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}} p++; - // CHECK: select + // CHECK: icmp ule i64 + // CHECK-NOT: 
select
   // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}}
   p--;
 }
@@ -64,7 +68,8 @@ void binary_arith_unsigned(char *p, unsigned i) {
   // CHECK: [[OFFSET:%.*]] = sub i64 0, {{.*}}
   // CHECK-NEXT: getelementptr inbounds {{.*}} [[OFFSET]]
-  // CHECK: select
+  // CHECK: icmp ule i64
+  // CHECK-NOT: select
   // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}}
   p - i;
 }
@@ -121,8 +126,10 @@ void pointer_array(int **arr, int k) {
 // CHECK-LABEL: define void @pointer_array_unsigned_indices
 void pointer_array_unsigned_indices(int **arr, unsigned k) {
+  // CHECK: icmp uge
   // CHECK-NOT: select
   // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}}
+  // CHECK: icmp uge
   // CHECK-NOT: select
   // CHECK: call void @__ubsan_handle_pointer_overflow{{.*}}
   arr[k][k];
diff --git a/test/CodeGen/zvector2.c b/test/CodeGen/zvector2.c
new file mode 100644
index 000000000000..909b3fd7650f
--- /dev/null
+++ b/test/CodeGen/zvector2.c
@@ -0,0 +1,194 @@
+// RUN: %clang_cc1 -triple s390x-linux-gnu -target-cpu z14 -fzvector \
+// RUN: -O -emit-llvm -o - -W -Wall -Werror %s | FileCheck %s
+
+volatile vector float ff, ff2;
+volatile vector bool int bi;
+
+void test_assign (void)
+{
+// CHECK-LABEL: test_assign
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: store volatile <4 x float> [[VAL]], <4 x float>* @ff
+  ff = ff2;
+}
+
+void test_pos (void)
+{
+// CHECK-LABEL: test_pos
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: store volatile <4 x float> [[VAL]], <4 x float>* @ff
+  ff = +ff2;
+}
+
+void test_neg (void)
+{
+// CHECK-LABEL: test_neg
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VAL]]
+  ff = -ff2;
+}
+
+void test_preinc (void)
+{
+// CHECK-LABEL: test_preinc
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  ++ff2;
+}
+
+void test_postinc (void)
+{
+// CHECK-LABEL: test_postinc
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  ff2++;
+}
+
+void test_predec (void)
+{
+// CHECK-LABEL: test_predec
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL]], <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>
+  --ff2;
+}
+
+void test_postdec (void)
+{
+// CHECK-LABEL: test_postdec
+// CHECK: [[VAL:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL]], <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>
+  ff2--;
+}
+
+void test_add (void)
+{
+// CHECK-LABEL: test_add
+// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff
+// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL1]], [[VAL2]]
+  ff = ff + ff2;
+}
+
+void test_add_assign (void)
+{
+// CHECK-LABEL: test_add_assign
+// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff
+// CHECK: %{{.*}} = fadd <4 x float> [[VAL2]], [[VAL1]]
+  ff += ff2;
+}
+
+void test_sub (void)
+{
+// CHECK-LABEL: test_sub
+// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff
+// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: %{{.*}} = fsub <4 x float> [[VAL1]], [[VAL2]]
+  ff = ff - ff2;
+}
+
+void test_sub_assign (void)
+{
+// CHECK-LABEL: test_sub_assign
+// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2
+// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x
float>, <4 x float>* @ff +// CHECK: %{{.*}} = fsub <4 x float> [[VAL1]], [[VAL2]] + ff -= ff2; +} + +void test_mul (void) +{ +// CHECK-LABEL: test_mul +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: %{{.*}} = fmul <4 x float> [[VAL1]], [[VAL2]] + ff = ff * ff2; +} + +void test_mul_assign (void) +{ +// CHECK-LABEL: test_mul_assign +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: %{{.*}} = fmul <4 x float> [[VAL2]], [[VAL1]] + ff *= ff2; +} + +void test_div (void) +{ +// CHECK-LABEL: test_div +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: %{{.*}} = fdiv <4 x float> [[VAL1]], [[VAL2]] + ff = ff / ff2; +} + +void test_div_assign (void) +{ +// CHECK-LABEL: test_div_assign +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: %{{.*}} = fdiv <4 x float> [[VAL1]], [[VAL2]] + ff /= ff2; +} + +void test_cmpeq (void) +{ +// CHECK-LABEL: test_cmpeq +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp oeq <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff == ff2; +} + +void test_cmpne (void) +{ +// CHECK-LABEL: test_cmpne +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp une <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff != ff2; +} + +void test_cmpge (void) +{ +// CHECK-LABEL: test_cmpge +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp oge <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff >= ff2; +} + +void test_cmpgt (void) +{ +// CHECK-LABEL: test_cmpgt +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp ogt <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff > ff2; +} + +void test_cmple (void) +{ +// CHECK-LABEL: test_cmple +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp ole <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff <= ff2; +} + +void test_cmplt (void) +{ +// CHECK-LABEL: test_cmplt +// CHECK: [[VAL1:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff +// CHECK: [[VAL2:%[^ ]+]] = load volatile <4 x float>, <4 x float>* @ff2 +// CHECK: [[CMP:%[^ ]+]] = fcmp olt <4 x float> [[VAL1]], [[VAL2]] +// CHECK: %{{.*}} = sext <4 x i1> [[CMP]] to <4 x i32> + bi = ff < ff2; +} + diff --git a/test/CodeGenCXX/amdgcn-automatic-variable.cpp b/test/CodeGenCXX/amdgcn-automatic-variable.cpp index a82275206c6d..9830a7845e5b 100644 --- a/test/CodeGenCXX/amdgcn-automatic-variable.cpp +++ 
b/test/CodeGenCXX/amdgcn-automatic-variable.cpp @@ -13,31 +13,32 @@ void func1(int *x) { // CHECK-LABEL: define void @_Z5func2v() void func2(void) { // CHECK: %lv1 = alloca i32, align 4, addrspace(5) - // CHECK: %lv2 = alloca i32, align 4, addrspace(5) - // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) - // CHECK: %lp1 = alloca i32*, align 8, addrspace(5) - // CHECK: %lp2 = alloca i32*, align 8, addrspace(5) - // CHECK: %lvc = alloca i32, align 4, addrspace(5) - // CHECK: %[[r0:.*]] = addrspacecast i32 addrspace(5)* %lv1 to i32* + // CHECK: %lv2 = alloca i32, align 4, addrspace(5) + // CHECK: %[[r1:.*]] = addrspacecast i32 addrspace(5)* %lv2 to i32* + // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) + // CHECK: %[[r2:.*]] = addrspacecast [100 x i32] addrspace(5)* %la to [100 x i32]* + // CHECK: %lp1 = alloca i32*, align 8, addrspace(5) + // CHECK: %[[r3:.*]] = addrspacecast i32* addrspace(5)* %lp1 to i32** + // CHECK: %lp2 = alloca i32*, align 8, addrspace(5) + // CHECK: %[[r4:.*]] = addrspacecast i32* addrspace(5)* %lp2 to i32** + // CHECK: %lvc = alloca i32, align 4, addrspace(5) + // CHECK: %[[r5:.*]] = addrspacecast i32 addrspace(5)* %lvc to i32* + // CHECK: store i32 1, i32* %[[r0]] int lv1; lv1 = 1; - // CHECK: %[[r1:.*]] = addrspacecast i32 addrspace(5)* %lv2 to i32* // CHECK: store i32 2, i32* %[[r1]] int lv2 = 2; - // CHECK: %[[r2:.*]] = addrspacecast [100 x i32] addrspace(5)* %la to [100 x i32]* // CHECK: %[[arrayidx:.*]] = getelementptr inbounds [100 x i32], [100 x i32]* %[[r2]], i64 0, i64 0 // CHECK: store i32 3, i32* %[[arrayidx]], align 4 int la[100]; la[0] = 3; - // CHECK: %[[r3:.*]] = addrspacecast i32* addrspace(5)* %lp1 to i32** // CHECK: store i32* %[[r0]], i32** %[[r3]], align 8 int *lp1 = &lv1; - // CHECK: %[[r4:.*]] = addrspacecast i32* addrspace(5)* %lp2 to i32** // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], [100 x i32]* %[[r2]], i32 0, i32 0 // CHECK: store i32* %[[arraydecay]], i32** %[[r4]], align 8 int *lp2 = la; @@ -45,7 +46,6 @@ void func2(void) { // CHECK: call void @_Z5func1Pi(i32* %[[r0]]) func1(&lv1); - // CHECK: %[[r5:.*]] = addrspacecast i32 addrspace(5)* %lvc to i32* // CHECK: store i32 4, i32* %[[r5]] // CHECK: store i32 4, i32* %[[r0]] const int lvc = 4; @@ -81,4 +81,25 @@ void func4(int x) { func1(&x); } +// CHECK-LABEL: define void @_Z5func5v +void func5() { + return; + int x = 0; +} + +// CHECK-LABEL: define void @_Z5func6v +void func6() { + return; + int x; +} + +// CHECK-LABEL: define void @_Z5func7v +extern void use(int *); +void func7() { + goto later; + int x; +later: + use(&x); +} + // CHECK-NOT: !opencl.ocl.version diff --git a/test/CodeGenCXX/debug-info-anon-namespace.cpp b/test/CodeGenCXX/debug-info-anon-namespace.cpp index 79298deaf8b1..3f4ef2a38360 100644 --- a/test/CodeGenCXX/debug-info-anon-namespace.cpp +++ b/test/CodeGenCXX/debug-info-anon-namespace.cpp @@ -21,7 +21,7 @@ int *b2 = &a2; // PS4: [[NS:![0-9]+]] = !DINamespace // PS4: [[CU:![0-9]+]] = distinct !DICompileUnit // PS4: [[NS2:![0-9]+]] = !DINamespace -// PS4: !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CU]], entity: [[NS]]) -// PS4: !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[NS]], entity: [[NS2]], line: {{[0-9]+}}) +// PS4: !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CU]], entity: [[NS]], file: {{![0-9]+}}) +// PS4: !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[NS]], entity: [[NS2]], file: {{![0-9]+}}, line: {{[0-9]+}}) // NON-PS4-NOT: !DIImportedEntity diff --git 
a/test/CodeGenCXX/debug-info-namespace.cpp b/test/CodeGenCXX/debug-info-namespace.cpp index 95857e339085..296c99efc8eb 100644 --- a/test/CodeGenCXX/debug-info-namespace.cpp +++ b/test/CodeGenCXX/debug-info-namespace.cpp @@ -81,17 +81,17 @@ void C::c() {} // CHECK: [[CU:![0-9]+]] = distinct !DICompileUnit( // CHECK-SAME: imports: [[MODULES:![0-9]*]] // CHECK: [[MODULES]] = !{[[M1:![0-9]+]], [[M2:![0-9]+]], [[M3:![0-9]+]], [[M4:![0-9]+]], [[M5:![0-9]+]], [[M6:![0-9]+]], [[M7:![0-9]+]], [[M8:![0-9]+]], [[M9:![0-9]+]], [[M10:![0-9]+]], [[M11:![0-9]+]], [[M12:![0-9]+]], [[M13:![0-9]+]], [[M14:![0-9]+]], [[M15:![0-9]+]], [[M16:![0-9]+]], [[M17:![0-9]+]] -// CHECK: [[M1]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CTXT]], entity: [[NS]], line: 15) +// CHECK: [[M1]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CTXT]], entity: [[NS]], file: [[FOOCPP]], line: 15) // CHECK: [[M2]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CU]], entity: [[CTXT]], -// CHECK: [[M3]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "E", scope: [[CU]], entity: [[CTXT]], line: 19) -// CHECK: [[M4]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[LEX2:![0-9]+]], entity: [[NS]], line: 23) +// CHECK: [[M3]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "E", scope: [[CU]], entity: [[CTXT]], file: [[FOOCPP]], line: 19) +// CHECK: [[M4]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[LEX2:![0-9]+]], entity: [[NS]], file: [[FOOCPP]], line: 23) // CHECK: [[LEX2]] = distinct !DILexicalBlock(scope: [[LEX1:![0-9]+]], file: [[FOOCPP]], // CHECK: [[LEX1]] = distinct !DILexicalBlock(scope: [[FUNC:![0-9]+]], file: [[FOOCPP]], // CHECK: [[FUNC:![0-9]+]] = distinct !DISubprogram(name: "func",{{.*}} isDefinition: true // CHECK: [[M5]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[FUNC]], entity: [[CTXT:![0-9]+]], -// CHECK: [[M6]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FOO:![0-9]+]], line: 27) +// CHECK: [[M6]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FOO:![0-9]+]], file: [[FOOCPP]], line: 27) // CHECK: [[FOO]] = !DICompositeType(tag: DW_TAG_structure_type, name: "foo", // CHECK-SAME: line: 5 // CHECK-SAME: DIFlagFwdDecl diff --git a/test/CodeGenCXX/implicit-exception-spec.cpp b/test/CodeGenCXX/implicit-exception-spec.cpp index cf363bd685ad..e1a969ab277f 100644 --- a/test/CodeGenCXX/implicit-exception-spec.cpp +++ b/test/CodeGenCXX/implicit-exception-spec.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -std=c++11 -o - -fcxx-exceptions -fexceptions | FileCheck -check-prefix=CHECK %s +// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -std=c++11 -o - -fcxx-exceptions -fexceptions | FileCheck %s struct A { A(); diff --git a/test/CodeGenObjC/arc-property.m b/test/CodeGenObjC/arc-property.m index b8dc18e872bf..edc4d1e85383 100644 --- a/test/CodeGenObjC/arc-property.m +++ b/test/CodeGenObjC/arc-property.m @@ -131,4 +131,24 @@ void test3(Test3 *t) { - (void) setCopyMachine: (id) x {} @end +// rdar://31579994 +// When synthesizing a property that's declared in multiple protocols, ensure +// that the setter is emitted if any of these declarations is readwrite. 
+@protocol ABC
+@property (copy, nonatomic, readonly) Test3 *someId;
+@end
+@protocol ABC__Mutable
+@property (copy, nonatomic, readwrite) Test3 *someId;
+@end
+
+@interface ABC_Class <ABC, ABC__Mutable>
+@end
+
+@implementation ABC_Class
+@synthesize someId = _someId;
+// CHECK: define internal %{{.*}}* @"\01-[ABC_Class someId]"
+// CHECK: define internal void @"\01-[ABC_Class setSomeId:]"(
+@end
+
+
 // CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/CodeGenObjC/attr-callconv.m b/test/CodeGenObjC/attr-callconv.m
index b0a41f74381d..8cf8d2bd71f7 100644
--- a/test/CodeGenObjC/attr-callconv.m
+++ b/test/CodeGenObjC/attr-callconv.m
@@ -9,5 +9,5 @@
     // CHECK: define{{.*}}x86_stdcallcc{{.*}}Test test
 
 - (void)test2 __attribute__((ms_abi)) {}
-    // CHECK: define{{.*}}x86_64_win64cc{{.*}}Test test2
+    // CHECK: define{{.*}}win64cc{{.*}}Test test2
 @end
diff --git a/test/CodeGenOpenCL/kernel-arg-info.cl b/test/CodeGenOpenCL/kernel-arg-info.cl
index f1b2f45d7d97..463cc4451114 100644
--- a/test/CodeGenOpenCL/kernel-arg-info.cl
+++ b/test/CodeGenOpenCL/kernel-arg-info.cl
@@ -1,10 +1,24 @@
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown -cl-kernel-arg-info | FileCheck %s -check-prefix ARGINFO
 
-kernel void foo(__global int * restrict X, const int Y,
-                volatile int anotherArg, __constant float * restrict Z,
-                __global volatile int * V, __global const int * C) {
-  *X = Y + anotherArg;
+kernel void foo(global int * globalintp, global int * restrict globalintrestrictp,
+                global const int * globalconstintp,
+                global const int * restrict globalconstintrestrictp,
+                constant int * constantintp, constant int * restrict constantintrestrictp,
+                global const volatile int * globalconstvolatileintp,
+                global const volatile int * restrict globalconstvolatileintrestrictp,
+                global volatile int * globalvolatileintp,
+                global volatile int * restrict globalvolatileintrestrictp,
+                local int * localintp, local int * restrict localintrestrictp,
+                local const int * localconstintp,
+                local const int * restrict localconstintrestrictp,
+                local const volatile int * localconstvolatileintp,
+                local const volatile int * restrict localconstvolatileintrestrictp,
+                local volatile int * localvolatileintp,
+                local volatile int * restrict localvolatileintrestrictp,
+                int X, const int constint, const volatile int constvolatileint,
+                volatile int volatileint) {
+  *globalintrestrictp = constint + volatileint;
 }
 // CHECK: define spir_kernel void @foo{{[^!]+}}
 // CHECK: !kernel_arg_addr_space ![[MD11:[0-9]+]]
@@ -61,11 +75,15 @@ kernel void foo5(myImage img1, write_only image1d_t img2) {
 // CHECK-NOT: !kernel_arg_name
 // ARGINFO: !kernel_arg_name ![[MD54:[0-9]+]]
 
-// CHECK: ![[MD11]] = !{i32 1, i32 0, i32 0, i32 2, i32 1, i32 1}
-// CHECK: ![[MD12]] = !{!"none", !"none", !"none", !"none", !"none", !"none"}
-// CHECK: ![[MD13]] = !{!"int*", !"int", !"int", !"float*", !"int*", !"int*"}
-// CHECK: ![[MD14]] = !{!"restrict", !"", !"", !"restrict const", !"volatile", !"const"}
-// ARGINFO: ![[MD15]] = !{!"X", !"Y", !"anotherArg", !"Z", !"V", !"C"}
+typedef char char16 __attribute__((ext_vector_type(16)));
+__kernel void foo6(__global char16 arg[]) {}
+// CHECK: !kernel_arg_type ![[MD61:[0-9]+]]
+
+// CHECK: ![[MD11]] = !{i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 1, i32 1, i32 1, i32 1, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 0, i32 0, i32 0, i32 0}
+// CHECK: ![[MD12]] = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+// CHECK: ![[MD13]] = !{!"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int*", !"int", !"int", !"int", !"int"}
+// CHECK: ![[MD14]] = !{!"", !"restrict", !"const", !"restrict const", !"const", !"restrict const", !"const volatile", !"restrict const volatile", !"volatile", !"restrict volatile", !"", !"restrict", !"const", !"restrict const", !"const volatile", !"restrict const volatile", !"volatile", !"restrict volatile", !"", !"", !"", !""}
+// ARGINFO: ![[MD15]] = !{!"globalintp", !"globalintrestrictp", !"globalconstintp", !"globalconstintrestrictp", !"constantintp", !"constantintrestrictp", !"globalconstvolatileintp", !"globalconstvolatileintrestrictp", !"globalvolatileintp", !"globalvolatileintrestrictp", !"localintp", !"localintrestrictp", !"localconstintp", !"localconstintrestrictp", !"localconstvolatileintp", !"localconstvolatileintrestrictp", !"localvolatileintp", !"localvolatileintrestrictp", !"X", !"constint", !"constvolatileint", !"volatileint"}
 // CHECK: ![[MD21]] = !{i32 1, i32 1, i32 1, i32 1}
 // CHECK: ![[MD22]] = !{!"read_only", !"read_only", !"write_only", !"read_write"}
 // CHECK: ![[MD23]] = !{!"image1d_t", !"image2d_t", !"image2d_array_t", !"image1d_t"}
@@ -86,4 +104,5 @@ kernel void foo5(myImage img1, write_only image1d_t img2) {
 // CHECK: ![[MD52]] = !{!"myImage", !"image1d_t"}
 // CHECK: ![[MD53]] = !{!"image1d_t", !"image1d_t"}
 // ARGINFO: ![[MD54]] = !{!"img1", !"img2"}
+// CHECK: ![[MD61]] = !{!"char16*"}
 
diff --git a/test/Driver/autocomplete.c b/test/Driver/autocomplete.c
index e9e2e992bae9..dec48bfb3596 100644
--- a/test/Driver/autocomplete.c
+++ b/test/Driver/autocomplete.c
@@ -40,3 +40,7 @@
 // MRELOCMODEL_CLANG-NOT: -mrelocation-model
 // RUN: %clang --autocomplete=#-mrelocation-mode | FileCheck %s -check-prefix=MRELOCMODEL_CC1
 // MRELOCMODEL_CC1: -mrelocation-model
+// RUN: %clang --autocomplete=-Wma | FileCheck %s -check-prefix=WARNING
+// WARNING: -Wmacro-redefined -Wmain -Wmain-return-type -Wmalformed-warning-check -Wmany-braces-around-scalar-init -Wmax-unsigned-zero
+// RUN: %clang --autocomplete=-Wnoinvalid-pp- | FileCheck %s -check-prefix=NOWARNING
+// NOWARNING: -Wnoinvalid-pp-token
diff --git a/test/Driver/constructors.c b/test/Driver/constructors.c
index cd14ed736a28..39a199a3c6ae 100644
--- a/test/Driver/constructors.c
+++ b/test/Driver/constructors.c
@@ -74,3 +74,11 @@
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
 // RUN:     -target arm64-none-none-eabi \
 // RUN:   | FileCheck --check-prefix=CHECK-INIT-ARRAY %s
+//
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     -target sparc-sun-solaris2.11 \
+// RUN:   | FileCheck --check-prefix=CHECK-INIT-ARRAY %s
+//
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     -target i386-pc-solaris2.11 \
+// RUN:   | FileCheck --check-prefix=CHECK-INIT-ARRAY %s
diff --git a/test/Driver/darwin-version.c b/test/Driver/darwin-version.c
index c11a2df37f98..12c7ef6eb0b1 100644
--- a/test/Driver/darwin-version.c
+++ b/test/Driver/darwin-version.c
@@ -121,3 +121,15 @@
 // RUN: %clang -target i386-apple-darwin9 -c %s -### 2>&1 | \
 // RUN:   FileCheck --check-prefix=CHECK-VERSION-WATCHOSSIM %s
 // CHECK-VERSION-WATCHOSSIM: "i386-apple-watchos2.0.0"
+
+// RUN: %clang -target x86_64-apple-ios11.0.0 -c %s -### 2>&1 | \
+// RUN:   FileCheck --check-prefix=CHECK-VERSION-IOS-TARGET %s
+// CHECK-VERSION-IOS-TARGET: "x86_64-apple-ios11.0.0"
+
+// RUN: %clang -target x86_64-apple-tvos11.0 -c %s -### 2>&1 | \
+// RUN:   FileCheck --check-prefix=CHECK-VERSION-TVOS-TARGET %s
+// CHECK-VERSION-TVOS-TARGET: "x86_64-apple-tvos11.0.0"
+
+// RUN: %clang -target x86_64-apple-watchos4.0 -c %s -### 2>&1 | \
+// RUN:   FileCheck --check-prefix=CHECK-VERSION-WATCHOS-TARGET %s
+// CHECK-VERSION-WATCHOS-TARGET: "x86_64-apple-watchos4.0.0"
diff --git a/test/Driver/emulated-tls.cpp b/test/Driver/emulated-tls.cpp
index a18c2e220bf8..38edc98ac889 100644
--- a/test/Driver/emulated-tls.cpp
+++ b/test/Driver/emulated-tls.cpp
@@ -1,5 +1,7 @@
-// Cygwin uses emutls. Clang should pass -femulated-tls to cc1 and cc1 should pass EmulatedTLS to LLVM CodeGen.
+// Cygwin and OpenBSD use emutls. Clang should pass -femulated-tls to cc1
+// and cc1 should pass EmulatedTLS to LLVM CodeGen.
 // FIXME: Add more targets here to use emutls.
 // RUN: %clang -### -std=c++11 -target i686-pc-cygwin %s 2>&1 | FileCheck %s
+// RUN: %clang -### -std=c++11 -target i686-pc-openbsd %s 2>&1 | FileCheck %s
 
 // CHECK: "-cc1" {{.*}}"-femulated-tls"
diff --git a/test/Driver/fuchsia.c b/test/Driver/fuchsia.c
index 64d31cc0d38c..405086f3f07e 100644
--- a/test/Driver/fuchsia.c
+++ b/test/Driver/fuchsia.c
@@ -5,6 +5,7 @@
 // CHECK: "-isysroot" "[[SYSROOT:[^"]+]]"
 // CHECK: "-internal-externc-isystem" "[[SYSROOT]]{{/|\\\\}}include"
 // CHECK: {{.*}}lld{{.*}}" "-flavor" "gnu"
+// CHECK: "-z" "rodynamic"
 // CHECK: "--sysroot=[[SYSROOT]]"
 // CHECK: "-pie"
 // CHECK: "--build-id"
diff --git a/test/Driver/fuchsia.cpp b/test/Driver/fuchsia.cpp
index 4c3f7d29acd8..20299c342688 100644
--- a/test/Driver/fuchsia.cpp
+++ b/test/Driver/fuchsia.cpp
@@ -7,6 +7,7 @@
 // CHECK: "-internal-isystem" "{{.*[/\\]}}x86_64-fuchsia{{/|\\\\}}include{{/|\\\\}}c++{{/|\\\\}}v1"
 // CHECK: "-internal-externc-isystem" "[[SYSROOT]]{{/|\\\\}}include"
 // CHECK: {{.*}}lld{{.*}}" "-flavor" "gnu"
+// CHECK: "-z" "rodynamic"
 // CHECK: "--sysroot=[[SYSROOT]]"
 // CHECK: "-pie"
 // CHECK: "--build-id"
diff --git a/test/Driver/lto-unit.c b/test/Driver/lto-unit.c
index 8a800fa4dcd6..1f1a2861670e 100644
--- a/test/Driver/lto-unit.c
+++ b/test/Driver/lto-unit.c
@@ -2,6 +2,8 @@
 // RUN: %clang -target x86_64-unknown-linux -### %s -flto=thin 2>&1 | FileCheck --check-prefix=UNIT %s
 // RUN: %clang -target x86_64-apple-darwin13.3.0 -### %s -flto=full 2>&1 | FileCheck --check-prefix=UNIT %s
 // RUN: %clang -target x86_64-apple-darwin13.3.0 -### %s -flto=thin 2>&1 | FileCheck --check-prefix=NOUNIT %s
+// RUN: %clang -target x86_64-scei-ps4 -### %s -flto=full 2>&1 | FileCheck --check-prefix=UNIT %s
+// RUN: %clang -target x86_64-scei-ps4 -### %s -flto=thin 2>&1 | FileCheck --check-prefix=NOUNIT %s
 
 // UNIT: "-flto-unit"
 // NOUNIT-NOT: "-flto-unit"
diff --git a/test/Driver/pic.c b/test/Driver/pic.c
index 61d752094f6e..6b01c583b8b1 100644
--- a/test/Driver/pic.c
+++ b/test/Driver/pic.c
@@ -247,6 +247,9 @@
 // On OpenBSD, -nopie needs to be passed through to the linker.
 // RUN: %clang %s -target i386-pc-openbsd -nopie -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-NOPIE-LD
+// Try with the alias
+// RUN: %clang %s -target i386-pc-openbsd -no-pie -### 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=CHECK-NOPIE-LD
 //
 // On Android PIC is enabled by default
 // RUN: %clang -c %s -target i686-linux-android -### 2>&1 \
diff --git a/test/Driver/std.cpp b/test/Driver/std.cpp
index aceda017a4e1..52b42ab9bd4f 100644
--- a/test/Driver/std.cpp
+++ b/test/Driver/std.cpp
@@ -9,6 +9,9 @@
 // RUN: not %clang -std=gnu++1y %s -fsyntax-only 2>&1 | FileCheck -check-prefix=GNUXX1Y %s
 // RUN: not %clang -std=c++1z %s -fsyntax-only 2>&1 | FileCheck -check-prefix=CXX1Z %s
 // RUN: not %clang -std=gnu++1z %s -fsyntax-only 2>&1 | FileCheck -check-prefix=GNUXX1Z %s
+// RUN: not %clang -std=c++2a %s -fsyntax-only 2>&1 | FileCheck -check-prefix=CXX2A %s
+// RUN: not %clang -std=gnu++2a %s -fsyntax-only 2>&1 | FileCheck -check-prefix=GNUXX2A %s
+
 
 void f(int n) {
   typeof(n)();
@@ -38,3 +41,10 @@ void f(int n) {
 
 // GNUXX1Z-NOT: undeclared identifier 'typeof'
 // GNUXX1Z-NOT: undeclared identifier 'decltype'
+
+// CXX2A: undeclared identifier 'typeof'
+// CXX2A-NOT: undeclared identifier 'decltype'
+
+// GNUXX2A-NOT: undeclared identifier 'typeof'
+// GNUXX2A-NOT: undeclared identifier 'decltype'
+
diff --git a/test/Driver/systemz-march.c b/test/Driver/systemz-march.c
index a5df1f482d1e..2fe7f89d8512 100644
--- a/test/Driver/systemz-march.c
+++ b/test/Driver/systemz-march.c
@@ -9,6 +9,8 @@
 // RUN: %clang -target s390x -### -S -emit-llvm -march=arch10 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH10 %s
 // RUN: %clang -target s390x -### -S -emit-llvm -march=z13 %s 2>&1 | FileCheck --check-prefix=CHECK-Z13 %s
 // RUN: %clang -target s390x -### -S -emit-llvm -march=arch11 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH11 %s
+// RUN: %clang -target s390x -### -S -emit-llvm -march=z14 %s 2>&1 | FileCheck --check-prefix=CHECK-Z14 %s
+// RUN: %clang -target s390x -### -S -emit-llvm -march=arch12 %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH12 %s
 
 // CHECK-Z9: error: unknown target CPU 'z9'
 // CHECK-Z10: "-target-cpu" "z10"
@@ -19,5 +21,7 @@
 // CHECK-ARCH10: "-target-cpu" "arch10"
 // CHECK-Z13: "-target-cpu" "z13"
 // CHECK-ARCH11: "-target-cpu" "arch11"
+// CHECK-Z14: "-target-cpu" "z14"
+// CHECK-ARCH12: "-target-cpu" "arch12"
 
 int x;
diff --git a/test/Driver/unknown-std.cpp b/test/Driver/unknown-std.cpp
index 4f7a5e64a6ed..195a671edadf 100644
--- a/test/Driver/unknown-std.cpp
+++ b/test/Driver/unknown-std.cpp
@@ -15,6 +15,8 @@
 // CHECK-NEXT: note: use 'gnu++14' for 'ISO C++ 2014 with amendments and GNU extensions' standard
 // CHECK-NEXT: note: use 'c++1z' for 'Working draft for ISO C++ 2017' standard
 // CHECK-NEXT: note: use 'gnu++1z' for 'Working draft for ISO C++ 2017 with GNU extensions' standard
+// CHECK-NEXT: note: use 'c++2a' for 'Working draft for ISO C++ 2020' standard
+// CHECK-NEXT: note: use 'gnu++2a' for 'Working draft for ISO C++ 2020 with GNU extensions' standard
 // CUDA-NEXT: note: use 'cuda' for 'NVIDIA CUDA(tm)' standard
 
 // Make sure that no other output is present.
diff --git a/test/FixIt/format.m b/test/FixIt/format.m
index 7af2cfdaf71a..c3cf2b1f3c56 100644
--- a/test/FixIt/format.m
+++ b/test/FixIt/format.m
@@ -229,6 +229,19 @@ void testSignedness(long i, unsigned long u) {
   // CHECK: fix-it:"{{.*}}":{[[@LINE-2]]:11-[[@LINE-2]]:14}:"%+ld"
 }
 
+void testSizeTypes() {
+  printf("%zu", 0.f); // expected-warning-re{{format specifies type 'size_t' (aka '{{.+}}') but the argument has type 'float'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f"
+
+  printf("%zd", 0.f); // expected-warning-re{{format specifies type 'ssize_t' (aka '{{.+}}') but the argument has type 'float'}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f"
+
+  short x;
+  printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}}
+  // PrintfSpecifier::fixType doesn't handle %n, so a fix-it is not emitted,
+  // see the comment in PrintfSpecifier::fixType in PrintfFormatString.cpp.
+}
+
 void testEnum() {
   typedef enum {
     ImplicitA = 1,
diff --git a/test/Index/Core/index-source.m b/test/Index/Core/index-source.m
index c911973a70d6..1333cafd7575 100644
--- a/test/Index/Core/index-source.m
+++ b/test/Index/Core/index-source.m
@@ -438,3 +438,28 @@ void testImplicitProperties(ImplicitProperties *c) {
 // CHECK: [[@LINE-1]]:22 | class-method/ObjC | classImplicit | c:objc(cs)ImplicitProperties(cm)classImplicit | +[ImplicitProperties classImplicit] | Ref,Call,RelCall,RelCont | rel: 1
 // CHECK-NEXT: RelCall,RelCont | testImplicitProperties | c:@F@testImplicitProperties
 }
+
+@interface EmptySelectors
+
+- (int):(int)_; // CHECK: [[@LINE]]:8 | instance-method/ObjC | : | c:objc(cs)EmptySelectors(im): | -[EmptySelectors :]
+- (void)test: (int)x :(int)y; // CHECK: [[@LINE]]:9 | instance-method/ObjC | test:: | c:objc(cs)EmptySelectors(im)test:: | -[EmptySelectors test::]
+- (void):(int)_ :(int)m:(int)z; // CHECK: [[@LINE]]:9 | instance-method/ObjC | ::: | c:objc(cs)EmptySelectors(im)::: | -[EmptySelectors :::]
+
+@end
+
+@implementation EmptySelectors
+
+- (int):(int)_ { // CHECK: [[@LINE]]:8 | instance-method/ObjC | : | c:objc(cs)EmptySelectors(im): | -[EmptySelectors :]
+  [self :2]; // CHECK: [[@LINE]]:9 | instance-method/ObjC | : | c:objc(cs)EmptySelectors(im): | -[EmptySelectors :]
+  return 0;
+}
+
+- (void)test: (int)x :(int)y { // CHECK: [[@LINE]]:9 | instance-method/ObjC | test:: | c:objc(cs)EmptySelectors(im)test:: | -[EmptySelectors test::]
+}
+
+- (void) :(int)_ :(int)m :(int)z { // CHECK: [[@LINE]]:10 | instance-method/ObjC | ::: | c:objc(cs)EmptySelectors(im)::: | -[EmptySelectors :::]
+  [self test:0:1]; // CHECK: [[@LINE]]:9 | instance-method/ObjC | test:: | c:objc(cs)EmptySelectors(im)test:: | -[EmptySelectors test::]
+  [self: 0: 1: 2]; // CHECK: [[@LINE]]:8 | instance-method/ObjC | ::: | c:objc(cs)EmptySelectors(im)::: | -[EmptySelectors :::]
+}
+
+@end
diff --git a/test/Index/Core/no-templated-canonical-decl.cpp b/test/Index/Core/no-templated-canonical-decl.cpp
new file mode 100644
index 000000000000..5434e9cb6515
--- /dev/null
+++ b/test/Index/Core/no-templated-canonical-decl.cpp
@@ -0,0 +1,4 @@
+// RUN: c-index-test core -print-source-symbols -include-locals -- %s | FileCheck %s
+
+template